def info(self, lane_keys):
    """ fill vamps_project_info table """
    logger.info("Starting vamps_upload: projects_info")

    if self.runobj.site == 'vamps':
        db_host = 'vampsdb'
        db_name = 'vamps'
    else:
        db_host = 'vampsdev'
        db_name = 'vamps'
    myconn = MyConnection(host=db_host, db=db_name)

    query = "SELECT last_name,first_name,email,institution from vamps_auth where user='******'" % (self.runobj.user)
    data = myconn.execute_fetch_select(query)

    fh = open(self.projects_info_file, 'w')
    title = "title"
    description = 'description'
    contact = data[0][1] + ' ' + data[0][0]
    email = data[0][2]
    institution = data[0][3]
    user = self.runobj.user
    fh.write("\t".join(["HEADER", "project", "title", "description", "contact",
                        "email", "institution", "user", "env_source_id"]) + "\n")
    fh.write("\t".join(["0", self.project, title, description, contact,
                        email, institution, user, self.runobj.env_source_id]) + "\n")
    # if this project already exists in the db???
    # the next step should update the table rather than add new to the db
    fh.close()
    logger.info("Finishing VAMPS info()")
def env_source_to_id(self, headers):
    logger.error("self.utils.is_local() LLL2 metadata")
    logger.error(self.utils.is_local())
    if self.utils.is_local():
        self.my_conn = MyConnection(host='localhost', db="test_env454")
    else:
        self.my_conn = MyConnection(host='bpcdb1', db="env454")
        # self.my_conn = MyConnection()
    my_sql = """SELECT * FROM env_sample_source"""
    self.env = self.my_conn.execute_fetch_select(my_sql)
    self.res_headers = ["env_sample_source_id" if x == "env_sample_source" else x for x in headers]
def get_my_conn(self):
    try:
        host = self.general_config_dict['database_host']
    except:
        raise
    try:
        db = self.general_config_dict['database_name']
    except:
        raise
    if self.utils.is_local():
        host = 'localhost'
        db = "test_env454"
    self.my_conn = MyConnection(host=host, db=db)
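# Hedged sketch (not the pipeline's actual implementation): the MyConnection
# helper used throughout this module is only ever called as
# MyConnection(host=..., db=...), execute_fetch_select(sql) and
# execute_no_fetch(sql).  Something along the following lines is assumed; the
# MySQLdb driver and the read_default_file credential handling are
# illustrative placeholders, the real class lives in pipeline.db_upload.
import MySQLdb


class MyConnectionSketch(object):
    """Thin wrapper around a MySQL connection (illustrative stand-in only)."""

    def __init__(self, host='localhost', db='test_env454'):
        # credentials are assumed to come from a ~/.my.cnf-style option file
        self.conn = MySQLdb.connect(host=host, db=db, read_default_file='~/.my.cnf')

    def execute_fetch_select(self, sql):
        # run a SELECT and hand back all rows
        cursor = self.conn.cursor()
        cursor.execute(sql)
        rows = cursor.fetchall()
        cursor.close()
        return rows

    def execute_no_fetch(self, sql):
        # run an INSERT/UPDATE and commit; nothing is returned
        cursor = self.conn.cursor()
        cursor.execute(sql)
        self.conn.commit()
        cursor.close()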
def check_projects_and_datasets(self, data):
    self.my_conn = MyConnection(host='newbpcdb2.jbpc-np.mbl.edu', db="env454")
    # self.my_conn = MyConnection()
    project_dataset = {}
    projects = {}
    datasets = {}
    error = False
    warn = False
    for item in data:
        if item != 'general':
            # project_dataset[data[item]['project']+'--'+data[item]['dataset']] = 1
            datasets[data[item]['dataset']] = data[item]['project']
            projects[data[item]['project']] = 1

    for p in projects:
        # print p
        my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p)
        res = self.my_conn.execute_fetch_select(my_sql)
        if res:
            logger.warning("project '" + p + "' already exists in the database - is this okay?")
            warn = True
        else:
            logger.debug("project '" + p + "' is new")

        ds_found_count = 0
        for d in datasets:
            if datasets[d] == p:
                # print "\t%s" % (d)
                my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d)
                res = self.my_conn.execute_fetch_select(my_sql)
                if res:
                    ds_found_count += 1
                    if ds_found_count > 3:
                        logger.warning("\t\tPossibly more .... - Exiting after just three")
                        break
                    logger.warning("\tdataset '" + d + "' already exists in the database - is this okay?")
                    warn = True
                else:
                    logger.debug("\tdataset '" + d + "' is new")

    logger.debug("\tDataset Count: " + str(len(datasets)))
    return (error, warn)
def start_gast(myobject):
    """
    Doc string
    """
    project = myobject['project']
    dataset = myobject['dataset']
    dna_region = myobject['dna_region']
    domain = myobject['domain']
    runcode = myobject['runcode']
    site = myobject['site']
    # user_cursor = myobject['user_cursor']
    datetime = myobject['datetime']
    user = myobject['user']
    from_fasta = myobject['from_fasta']
    load_db = myobject['load_db']
    env_source_id = myobject['env_source_id']
    steps = myobject['steps']
    fasta_file_from_cl = myobject['fasta_file']
    use_cluster = myobject['use_cluster']
    # myobject['baseoutputdir']
    seq_count = 0
    site_base = '/xraid2-2/vampsweb/' + site
    file_prefix = user + runcode
    output_dir = myobject['output_dir']
    # output_dir = os.path.join(site_base, 'tmp', user+"_"+runcode+'_gast')

    # use the files from file_base directory
    # but we get the primers and keys from the database
    # which were stored there during the loading phase
    # check for directory: user_runcode
    #   if present use the data from there
    #   if not: go to the database
    if os.path.exists(output_dir):
        print "files path exists:", output_dir
        # gast_input_source = 'files'
        # file_base = output_dir
        # This may be a mobedac upload and we should try to use the files here
        # rather than look to the database for data
    else:
        output_dir = os.path.join(site_base, 'tmp', user + "_" + runcode + '_gast')
        print "Files path doesn't exist: attempting to get data from database"
        print "Creating directory", output_dir
        os.mkdir(output_dir)

    from pipeline.run import Run
    from pipelineprocessor import process

    myRunDict = {}
    # this is a minimal run dictionary for the general stanza
    myRunDict['general'] = {'run_date': datetime,
                            'vamps_user_upload': True,
                            'gast_input_source': 'database',
                            'input_file_names': 'vamps_upload',
                            'input_file_lanes': '1',
                            'input_file_formats': 'fasta',
                            'run': runcode,
                            'use_cluster': use_cluster,
                            'platform': 'vamps',
                            'user': user,
                            'site': site,
                            'load_vamps_database': True,
                            'input_files': None,
                            'files_list': [],
                            'output_dir': output_dir,
                            'file_prefix': file_prefix}
    # print myRunDict
    #
    #
    run = Run(myRunDict, "/xraid2-2/vampsweb/" + site)
    #
    #
    # pack the things we'll need for GAST
    run.project = project
    run.dataset = dataset
    run.load_db = load_db
    run.env_source_id = env_source_id
    run.site = site
    run.from_fasta = from_fasta
    run.fasta_file_from_cl = fasta_file_from_cl
    run.runcode = runcode
    run.user = user
    run.samples = {}
    run.dna_region = dna_region
    # run.basedir = file_base

    # fastaunique_cmd = '/bioware/bin/fastaunique'
    fastaunique_cmd = 'fastaunique'

    if run.from_fasta:
        print run.from_fasta
        # copy file to
        fasta_file = os.path.join(output_dir, run.user + run.runcode + '.fa')
        shutil.copyfile(run.fasta_file_from_cl, fasta_file)
        grep_cmd = ['grep', '-c', '>', fasta_file]
        run.dataset_count = subprocess.check_output(grep_cmd).strip()
    else:
        # from database
        from pipeline.db_upload import MyConnection
        if site == 'vamps':
            db_host_user = '******'
            db_name_user = '******'
        else:
            db_host_user = '******'
            db_name_user = '******'
        myconn = MyConnection(host=db_host_user, db=db_name_user)

        # should create the fasta file and names file here and not in gast.py
        ds_list = []
        if dataset:
            ds_list = [dataset]
            query = "select read_id,sequence,dataset from vamps_upload_trimseq where project='" + project + "' and dataset='" + dataset + "' and user='******' "
            print query
            rows = myconn.execute_fetch_select(query)
            fasta_file = os.path.join(output_dir, 'fasta.fa')
            unique_file = os.path.join(output_dir, 'unique.fa')
            names_file = os.path.join(output_dir, 'names')
            fh = open(fasta_file, 'w')
            if not rows:
                print "No data found using query:", query
            for r in rows:
                id = r[0]
                seq = r[1]
                fh.write(">" + id + "\n" + seq + "\n")
            fh.close()
            fastaunique_cmd = fastaunique_cmd + " -x -i " + fasta_file + " -o " + unique_file + " -n " + names_file
            subprocess.call(fastaunique_cmd, shell=True)
        else:
            # looks for vamps_projects_datasets_pipe in vamps_user_uploads
            q0 = "select distinct dataset from vamps_projects_datasets_pipe where project='" + project + "' and dataset != '' and dataset != 'NoKey'"
            print q0
            dsrows = myconn.execute_fetch_select(q0)
            if not dsrows:
                print "No datasets found using query:", q0
                sys.exit()
            for ds in dsrows:
                ds = ds[0]
                ds_list.append(ds)
                query = "select read_id, sequence, dataset from vamps_upload_trimseq where project='" + project + "' and dataset='" + ds + "' and user='******' "
                print query
                rows = myconn.execute_fetch_select(query)
                ds_dir = os.path.join(output_dir, ds)
                if os.path.exists(ds_dir):
                    # Start with an empty directory
                    shutil.rmtree(ds_dir, True)
                    os.mkdir(ds_dir)
                else:
                    os.mkdir(ds_dir)
                fasta_file = os.path.join(output_dir, ds, 'fasta.fa')
                unique_file = os.path.join(output_dir, ds, 'unique.fa')
                names_file = os.path.join(output_dir, ds, 'names')
                # dataset_file = os.path.join(output_dir, 'datasets')
                fh = open(fasta_file, 'w')
                if not rows:
                    print "No data found using query:", query
                for r in rows:
                    id = r[0]
                    seq = r[1]
                    ds = r[2]
                    fh.write(">" + id + "\n" + seq + "\n")
                fh.close()
                fastaunique_call = fastaunique_cmd + " " + fasta_file + " -o " + unique_file + " -n " + names_file + " -f"
                subprocess.call(fastaunique_call, shell=True)

        run.datasets = ds_list

    ###############################################################
    # This starts the MBL GAST python pipeline at the GAST STEP
    #
    # now do all the work
    # possible steps: trim,chimera,gast,vampsupload
    process(run, steps)

    print "done with gast"
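# Illustrative only: a minimal myobject dictionary carrying every key that
# start_gast() reads above.  All values shown (user, runcode, paths, ids) are
# made-up placeholders, not real VAMPS data.
if __name__ == '__main__':
    example_gast_request = {
        'project': 'AB_XYZ_Bv6',
        'dataset': 'sample_01',
        'dna_region': 'v6',
        'domain': 'bacteria',
        'runcode': '20120613',
        'site': 'vampsdev',
        'datetime': '20120613',
        'user': 'some_user',
        'from_fasta': True,
        'load_db': False,
        'env_source_id': '100',
        'steps': 'gast,vampsupload',
        'fasta_file': '/path/to/upload.fa',
        'use_cluster': False,
        'output_dir': '/xraid2-2/vampsweb/vampsdev/tmp/some_user_20120613_gast',
    }
    start_gast(example_gast_request)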
def env_source_to_id(self, headers):
    self.my_conn = MyConnection(host='newbpcdb2.jbpc-np.mbl.edu', db="env454")
    # self.my_conn = MyConnection()
    my_sql = """SELECT * FROM env_sample_source"""
    self.env = self.my_conn.execute_fetch_select(my_sql)
    self.res_headers = ["env_sample_source_id" if x == "env_sample_source" else x for x in headers]
class MetadataUtils: """ Class to read metadata files (csv and ini style) validate and create a dictionary from them Two parts: 1) From pipeline-ui.py to validate the input args 2) From runconfig.py to write the final ini file and create the dictionary that is used to create the run object """ Name = "MetadataUtils" def __init__(self, command_line_args = None, configuration_dictionary = None): self.args = command_line_args self.general_config_dict = configuration_dictionary self.known_header_list = C.csv_header_list self.pipeline_run_items = C.pipeline_run_items self.primer_suites = self.convert_primer_suites(C.primer_suites) self.dna_regions = C.dna_regions self.data_object = {} self.data_object['general'] = {} self.warn_msg = """\n\tThe config File seems to be okay. If the items above look correct then press 'c' to continue the pipeline\n""" self.res_headers = [] self.env = {} def convert_and_save_ini(self, analysis_dir): new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini') #new_ini_file = os.path.join(self.general_config_dict['output_dir'],self.general_config_dict['run'],self.general_config_dict['run'] + '.ini') # converts csv to ini and saves to output_dir if self.general_config_dict['platform'] == 'vamps': self.save_ini_file(new_ini_file) else: self.convert_csv_to_ini(new_ini_file) 'TODO: Andy, what mean the next two lines?' # self.general_config_dict['configPath'] # self.general_config_dict['configPath_original'] = self.general_config_dict['configPath'] self.general_config_dict['configPath'] = new_ini_file # change path and type to new ini # regardless of what they were before def validate(self, analysis_dir): if self.general_config_dict['platform'] == 'illumina': self.warn_msg = self.validate_illumina_ini(analysis_dir) elif self.general_config_dict['platform'] == '454': data = self.validate_454_ini(analysis_dir) elif self.general_config_dict['platform'] == 'ion_torrent': pass elif self.general_config_dict['platform'] == 'vamps': data = self.validate_vamps_ini(analysis_dir) else: sys.exit("Unknown platform and configFile type for validation") return self.data_object def get_general_data(self): """ """ return self.data_object['general'] # def create_dictionary_from_ini(self): # """ # # read an ini config file and convert to a dictionary # """ # import ConfigParser # if os.path.exists(self.general_config_dict['configPath']): # data_object = {} # user_config = ConfigParser.ConfigParser() # user_config.read(self.general_config_dict['configPath']) # # for section in user_config.sections(): # # section_dict = data_object[section] = {} # for option in user_config.options(section): # section_dict[option] = user_config.get(section,option) # # else: # print "error could not open config file: ",self.general_config_dict['configPath'] # # return data_object # def get_command_line_items(self, general_data): # # # command line items take precedence over ini file items of the same name # # defaults should be here and NOT in argparse/commandline # if self.args.input_dir: # general_data['input_dir'] = self.args.input_dir # else: # if not general_data['input_dir']: # general_data['input_dir'] = './' # # if self.args.run: # general_data['run'] = self.args.run # general_data['run_date'] = self.args.run # else: # if 'run' in general_data: # general_data['run_date'] = general_data['run'] # elif 'run_date' in general_data: # general_data['run'] = general_data['run_date'] # else: # sys.exit("Cannot find the run or run_date on command line or in config file - Exiting") # # make 
sure RUN is before OUTPUT_DIR # try: # general_data['output_dir'] = os.path.join(self.args.baseoutputdir,self.args.run) # except: # if 'output_dir' not in general_data: # general_data['output_dir'] = os.path.join('.',self.args.run) # #getattr(args,'force_runkey', "") # # # if self.args.platform: # general_data['platform'] = self.args.platform # else: # if 'platform' not in general_data: # sys.exit("Cannot find the platform from command line or in config file - Exiting") # # # if self.args.input_file_format: # general_data['input_file_format'] = self.args.input_file_format # else: # if 'input_file_format' not in general_data: # general_data['input_file_format'] = '' # if self.args.input_file_suffix: # general_data['input_file_suffix'] = self.args.input_file_suffix # else: # if 'input_file_suffix' not in general_data: # general_data['input_file_suffix'] = '' # # return general_data # def validate_454_csv(self, args, my_csv): # print "TODO: write validate def for 454/csv" # data_object = self.populate_data_object_454(args, my_csv) def validate_vamps_ini(self, analysis_dir): # configPath is the new configPath 'todo: Andy, what should be here, just directory name or directory + number.ini?' self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) if 'fasta_file' in self.data_object and not os.path.exists(self.data_object['fasta_file']): sys.exit("Fasta file path doesn't exist: "+self.data_object['fasta_file'] ) elif 'fasta_file' in self.data_object['general'] and not os.path.exists(self.data_object['general']['fasta_file']): sys.exit("Fasta file path doesn't exist: "+self.data_object['general']['fasta_file'] ) def validate_454_ini(self, analysis_dir): print "TODO - write validation def for 454/ini" #self.data_object = self.create_dictionary_from_ini() # 454 ini file requirements: def validate_illumina_ini(self, analysis_dir): """ The csv headers are checked earlier """ print "Validating ini type Config File (may have been converted from csv)" new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini') print "New ini file location: "+new_ini_file return_code = False error_code = False warn_code = False msg = '' error=False warn=False #print 'configpath',self.general_config_dict['configPath'] # configPath here is the new configPath self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) (error_code,warn_code) = self.check_for_missing_values(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_for_datasets(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_domain_suite_region(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_project_name(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_dataset_name(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_projects_and_datasets(self.data_object) if error_code: error=True if warn_code: warn=True #print self.data_object['input_dir'] #print self.data_object['input_files'] if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']: logger.warning("No input directory and no input files") warn=True elif not os.path.isdir(self.data_object['general']['input_dir']): logger.error("That is not a directory: "+self.data_object['general']['input_dir']) error=True elif 
self.data_object['general']['input_file_format'] == 'fastq' and self.data_object['general']['platform'] == 'illumina': file_exists = False # if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']: for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']): # if not filenames: for file_name in filenames: if os.path.isfile(os.path.join(dirname, file_name)): file_exists = True break if not file_exists: logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) error=True elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']): logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) error=True if error: sys.exit( """\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING PLEASE CORRECT THEM AND START OVER.\033[0m\n To view the errors add ' --loglevel info' to the command line.\n""") elif warn: msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n To view the warnings add ' --loglevel warning' to the command line.\n""" print "\033[92mCSV File Passed Vaidation! (with warnings)\033[0m" else: print "\033[92mCSV File Passed Vaidation!\033[0m" return msg def validate_dictionary(self, config_info): """ This is only used for data that comes in as a dictionary rather than a file such as with vamps user uploads """ print "TODO - Validating input dictionary" # must be a general section # should I create a dict here??? -That would render much code in # runconfig useless. # are we going to continue developing ini style config files if # no one uses them? 
configDict = config_info return configDict def populate_data_object_454(self, args): data = {} data['general'] = {} test_datasets = {} dataset_counter = {} headers = '' if self.runobj: infile = self.runobj.configPath else: infile = args.configPath data['general']['input_dir'] = args.input_dir #data['general']['output_dir'] = os.path.join(args.output_dir,args.run) data['general']['output_dir'] = args.output_dir data['general']['platform'] = args.platform data['general']['run'] = args.run #data['general']['run_date'] = args.run data['general']["input_file_format"] = args.input_file_format data['general']["input_file_suffix"] = args.input_file_suffix return data['general'] def get_input_files(self): files_list = [] if os.path.isdir(self.general_config_dict['input_dir']): for infile in glob.glob( os.path.join(self.general_config_dict['input_dir'], '*') ): if os.path.isdir(infile) == True: for infile2 in glob.glob( os.path.join( infile,'*') ): if os.path.isdir(infile2) == True: pass else: sub_dir = os.path.basename(infile) files_list.append(os.path.join(sub_dir,os.path.basename(infile2))) else: files_list.append(os.path.basename(infile)) # else: # if fasta_file: # pass # logger.warning("No input directory or directory permissions problem: "+self.general_config_dict['input_dir']) return files_list def check_for_input_files(self, data_object): file_count = 0 files_list = [] imports_list = [] lanes_list = [] #input_dir = os.path.join(data_object['general']['input_dir'],"fasta") input_dir = data_object['general']['input_dir'] if os.path.isdir(input_dir): p = data_object['general']['input_dir'], '*'+data_object['general']['input_file_suffix'] for infile in glob.glob( os.path.join(input_dir, '*'+data_object['general']['input_file_suffix']) ): files_list.append(os.path.basename(infile)) for x in data_object: if 'file_prefix' in data_object[x]: pass #print data_object[x]['file_prefix'] #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']: #lanes_list.append(data_object[x]['lane']) file_count += 1 else: logger.info("No input directory or directory permissions problem: "+input_dir) print "No input directory or directory permissions problem: "+input_dir if not file_count: #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") data_object['general']['files_list'] = files_list data_object['general']['file_count'] = file_count # all the files in an illumina directory should be the same type #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count #data_object['general']['lanes_list'] = lanes_list #print "Files LIST",data_object['general']['files_list'] return data_object def check_for_missing_values(self, data): missing_key = '' error = False warn = False for item in data: if item == 'general': for k,v in data[item].iteritems(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") warn=True if v == '': logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") warn=True for item in data: if item != 'general': for k,v in data[item].iteritems(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") 
warn=True if not v: if (k == 'barcode' or k == 'adaptor'): #these could be empty logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") else: logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") error=True return (error,warn) def check_for_datasets(self,data): error = False warn=False for item in data: if item != 'general': #print 'ds',data[item]['dataset'] if not data[item]['dataset']: #if 'dataset' not in data[item]: logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")") error=True return (error,warn) def check_domain_suite_region(self,data): error = False warn=False for item in data: if item != 'general': primer_suite = self.convert_primer_suites(data[item]['primer_suite']) dna_region = self.convert_primer_suites(data[item]['dna_region']) # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region" if primer_suite not in self.primer_suites: logger.error("Primer Suite not found: "+primer_suite+" - Exiting (key: "+item+")") error=True if dna_region not in self.dna_regions: logger.error("DNA Region not found: "+dna_region+" - Exiting (key: "+item+")") error=True if dna_region not in primer_suite: logger.error("DNA Region ("+dna_region+") not found in Primer Suite ("+primer_suite+") - Exiting (key: "+item+")") error=True return (error, warn) def convert_primer_suites(self, suite): if type(suite) is list: conv_suite = [item.lower().translate(None, '_- ') for item in suite] if type(suite) is str: conv_suite = suite.lower().translate(None, '_- ') return conv_suite def check_project_name(self, data): """ # CHECK: project name format: 3 parts; end with Bv6,Ev9,Av6 or something similar """ error =False warn =False for item in data: if item != 'general': try: (a,b,c) = data[item]['project'].split('_') except: logger.error("project not in correct format: "+data[item]['project']+" - Exiting (key: "+data[item]+")") error=True (a,b,c) = data[item]['project'].split('_') #if c[0] not in [i[0].upper() for i in domains]: # sys.exit("ERROR : Project suffix has incorrect/non-existant domain: "+c) if (c[1:] not in self.dna_regions) and (c.lower() not in self.dna_regions): logger.error("Project suffix has incorrect DNA region: "+c+" - Exiting (key: "+data[item]+")") error = True return (error,warn) def check_dataset_name(self,data): """ # CHECK: dataset name can be ONLY alphanumeric and underscore and cannot start with a number! 
""" error =False warn =False for item in data: if item != 'general': dataset_name = data[item]['dataset'] if not re.match("^[A-Za-z0-9_]*$", dataset_name): logger.error("Dataset name has illeagal character(s): "+dataset_name+" (must be alphanumeric and underscore only)") error = True #if re.match("^[0-9]", dataset_name): # logger.error("Dataset name cannot begin with a digit: "+dataset_name) # error = True return (error,warn) def check_projects_and_datasets(self,data): self.my_conn = MyConnection(host='newbpcdb2.jbpc-np.mbl.edu', db="env454") # self.my_conn = MyConnection() project_dataset = {} projects = {} datasets = {} error =False warn =False for item in data: if item != 'general': #project_dataset[data[item]['project']+'--'+data[item]['dataset']] = 1 datasets[data[item]['dataset']] = data[item]['project'] projects[data[item]['project']] = 1 for p in projects: #print p my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p) res = self.my_conn.execute_fetch_select(my_sql) if res: logger.warning("project '"+p+"' already exists in the database - is this okay?") warn = True else: logger.debug("project '"+p+"' is new") ds_found_count = 0 for d in datasets: if datasets[d] == p: #print "\t%s" % (d) my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d) res = self.my_conn.execute_fetch_select(my_sql) if res: ds_found_count += 1 if ds_found_count >3: logger.warning("\t\tPossibly more .... - Exiting after just three") break logger.warning("\tdataset '"+d+"' already exists in the database - is this okay?") warn=True else: logger.debug("\tdataset '"+d+"' is new") logger.debug("\tDataset Count: "+str(len(datasets))) return (error,warn) def get_confirmation(self, steps, general_data): print "\n" for item,value in general_data.iteritems(): #print len(value) if type(value) != bool and len(value) > 80: tmp = value.split(',') print "%-20s = %s .. %s" % (item,tmp[0],tmp[-1]) else: print "%-20s = %-20s" % (item,value) print "\nStep(s) to be performed: \033[1;36m",steps,'\033[0m' print "\n"+self.warn_msg+"\n" if 'validate' in steps.split(','): # print we are done sys.exit() if PipelneUtils().is_local: return 'c' else: return raw_input("\nDoes this look okay? 
(q to quit, v to view configFile, c to continue) ") def convert_csv_to_ini(self, new_ini_file): #print self.args from pipeline.get_ini import readCSV print 'CSV path', self.general_config_dict['csvPath'] my_csv = readCSV(file_path = self.general_config_dict['csvPath']) content = my_csv.read_csv() headers = content[1].keys() headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers] projects = {} #print #print content[1] #print # get list of keys keys_list = [] if self.check_headers(headers_clean): logger.info("CSV headers okay") for k,values in content.iteritems(): keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane']) fh = open(new_ini_file,'w') # general section fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") fh.write("[general]\n") fh.write("run = "+self.general_config_dict['run']+"\n") fh.write("configPath = "+new_ini_file+"\n") fh.write("configPath_orig = " + self.general_config_dict['configPath']+"\n") fh.write("platform = " + self.general_config_dict['platform']+"\n") fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n") #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n") if self.general_config_dict['platform'] == 'illumina': #fh.write("input_file_suffix = " + self.general_config_dict['input_file_suffix']+"\n") fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n") fh.write("anchor_file = " + self.general_config_dict['anchor_file']+"\n") fh.write("primer_file = " + self.general_config_dict['primer_file']+"\n") fh.write("compressed = " + str(self.general_config_dict['compressed'])+"\n") fh.write("do_perfect = " + str(self.general_config_dict['do_perfect'])+"\n") fh.write("lane_name = " + str(self.general_config_dict['lane_name'])+"\n") fh.write("database_host = " + self.general_config_dict['database_host']+"\n") fh.write("database_name = " + self.general_config_dict['database_name']+"\n") fh.write("input_dir = " + self.general_config_dict['input_dir']+"\n") fh.write("require_distal = " + str(self.general_config_dict['require_distal'])+"\n") fh.write("use_cluster = " + str(self.general_config_dict['use_cluster'])+"\n") fh.write("date = " + str(datetime.date.today())+"\n") fh.write("site = " + self.general_config_dict['site']+"\n") fh.write("load_vamps_database = " + str(self.general_config_dict['load_vamps_database'])+"\n") fh.write("idx_keys = " +','.join(keys_list)+"\n") if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() fh.write("input_files = " + ','.join(file_list)+"\n") else: fh.write("input_files = \n") #fh.write(getattr(args,'force_runkey', "")) for k, values in content.iteritems(): fh.write("\n") if self.general_config_dict['platform'] == 'illumina': fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n") elif self.general_config_dict['platform'] == '454': fh.write("["+values['lane']+"_"+values['run_key']+"]\n") for v in values: if v == "env_sample_source": try: new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0] except: print """There was an error in env_sample_source. Please check your metadata. 
Possible values: ----------- air extreme habitat host associated human associated human-amniotic-fluid human-blood human-gut human-oral human-skin human-urine human-vaginal indoor microbial mat/biofilm miscellaneous_natural_or_artificial_environment plant associated sediment soil/sand unknown wastewater/sludge water-freshwater water-marine ----------- """ raise fh.write("env_sample_source_id = "+new_val+"\n") else: fh.write(v+" = "+values[v]+"\n") fh.close() return new_ini_file def save_ini_file(self,new_ini_file): # give it a new name out_fh = open(new_ini_file,'w') #for line in open(os.path.abspath(self.general_config_dict['configPath']),"r"): # out_fh.write(line) self.general_config_dict['configPath_original'] = self.general_config_dict['configPath'] self.general_config_dict['configPath'] = new_ini_file out_fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") out_fh.write("[general]\n") for item in self.general_config_dict: out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") #out_fh.write("\n["+self.general_config_dict['platform']+"]\n") #for item in self.general_config_dict: # if item not in C.general_run_items: # out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") if 'fasta_file' in self.general_config_dict and self.general_config_dict['fasta_file'] != '': (path,fasta) = os.path.split(self.general_config_dict['fasta_file']) if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != path: sys.exit("Your input_dir and fasta_file directory don't agree - Exiting\n\t"+self.general_config_dict['input_dir']+" != "+self.general_config_dict['fasta_file']) out_fh.write("input_dir = "+path+"\n") out_fh.write("input_files = "+fasta+"\n") #out_fh.write("input_file_suffix = fasta\n") elif 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() out_fh.write("input_files = " + ','.join(file_list)+"\n") else: out_fh.write("input_files = \n") out_fh.close() def check_headers(self, headers): if self.general_config_dict['platform']=='illumina': known_header_list= self.known_header_list['illumina'] elif self.general_config_dict['platform'] == '454': known_header_list = self.known_header_list['454'] else: logger.error("in utils: check_headers - unknown platform") #print sorted(known_header_list) #print sorted(headers) self.res_headers = headers if "env_sample_source" in headers: self.env_source_to_id(headers) if sorted(known_header_list) != sorted(self.res_headers): print "=" * 40 print "csv file header problem" print "%-20s %-20s" % ("REQUIRED", "YOUR CSV") for i in sorted(known_header_list): if i in headers: print "%-20s%-20s" % (i,i) else: print "%-20s%-20s" % (i,"----------- <--- missing") for i in headers: if i not in known_header_list: print "%-20s%-20s" % (" ",i+" <--- extra") print "=" * 40 sys.exit("ERROR : unknown or missing headers\n") else: return True def env_source_to_id(self, headers): self.my_conn = MyConnection(host='newbpcdb2.jbpc-np.mbl.edu', db="env454") # self.my_conn = MyConnection() my_sql = """SELECT * FROM env_sample_source""" self.env = self.my_conn.execute_fetch_select(my_sql) self.res_headers = ["env_sample_source_id" if x=="env_sample_source" else x for x in headers] def configDictionaryFromFile_ini(self, config_file_path): import ConfigParser configDict = {} user_config = ConfigParser.ConfigParser() user_config.read(config_file_path) for section in 
user_config.sections(): section_dict = configDict[section] = {} for option in user_config.options(section): section_dict[option] = user_config.get(section,option) if section_dict[option] == 'True' or section_dict[option] == 'true': section_dict[option] = True elif section_dict[option] == 'False' or section_dict[option] == 'false': section_dict[option] = False return configDict def get_values(self, args, general_config_dict = {} ): collector={} for item in self.pipeline_run_items[args.platform]: # set collector[item] to the default first collector[item] = self.pipeline_run_items[args.platform][item] # now look for args (then ini) values to replace if item in args and getattr( args, item ) != None: collector[item] = getattr( args, item ) elif general_config_dict and item in general_config_dict[args.platform] and general_config_dict[args.platform][item] != '': collector[item] = general_config_dict[args.platform][item] # get all the items from general_config_dict['general'] if 'general' in general_config_dict: for item in general_config_dict['general']: collector[item] = general_config_dict['general'][item] return collector def validate_args(self): """ # THOUGHTS # vamps users # single project and dataset # Supply an ini file OR commandline (for web interface), but no csv file # # MBL pipeline # REQUIRE a csv file and a ini file """ collector={} if self.args.configPath: general_config_dict = self.configDictionaryFromFile_ini(self.args.configPath) if self.args.platform in general_config_dict and 'general' in general_config_dict: collector= self.get_values( self.args, general_config_dict) else: sys.exit("The ini file needs both a [general] and ["+ self.args.platform +"] section - Exiting.") else: # no configPath collector= self.get_values( self.args ) if self.args.platform == 'illumina': print "Starting Illumina Pipeline" if not self.args.csvPath: sys.exit("illumina requires a csv file - Exiting") elif self.args.platform == 'vamps': print "Starting VAMPS Pipeline:" if 'project' not in collector or collector['project'] == '': collector['project'] = collector['project'][:1].capitalize() + collector['project'][1:] else: logger.debug("No project found in vamps pipeline") if self.args.fasta_file: collector['project'] = self.args.fasta_file collector['from_fasta'] = True elif self.args.platform == '454': print "Starting 454 Pipeline" elif self.args.platform == 'ion_torrent': print "Starting Ion Torrent Pipeline" else: sys.exit("Validate args: Unknown Platform") if self.args.configPath: collector['configPath'] = self.args.configPath else: collector['configPath'] = "" # these are all the bool items in the collector # they need to be converted fron str to bool here for i in collector: if collector[i] == 'True' or collector[i] == 'true': collector[i] = True elif collector[i] == 'False' or collector[i] == 'false': collector[i] = False #collector['runcode'] = self.args.run collector['run'] = self.args.run #collector['run_date'] = self.args.run #collector['steps'] = self.args.steps collector['platform'] = self.args.platform if self.args.input_dir: collector['input_dir'] = self.args.input_dir collector['date'] = str(datetime.date.today()) #print collector return collector
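# Hedged sketch of how pipeline-ui.py is assumed to drive the MetadataUtils
# class defined above, based only on the methods it exposes (validate_args,
# convert_and_save_ini, validate, get_confirmation, get_general_data).  The
# argparse namespace ('args', assumed to carry platform/run/configPath/steps)
# and 'analysis_dir' are placeholders; the real driver may differ.
def run_metadata_validation(args, analysis_dir):
    # first pass: merge command-line args with any ini-file values
    metadata_utils = MetadataUtils(command_line_args=args)
    general_config_dict = metadata_utils.validate_args()

    # second pass: write the consolidated ini into the analysis directory,
    # then validate it for the chosen platform (illumina/454/vamps)
    metadata_utils = MetadataUtils(command_line_args=args,
                                   configuration_dictionary=general_config_dict)
    metadata_utils.convert_and_save_ini(analysis_dir)
    data_object = metadata_utils.validate(analysis_dir)

    # ask the user to confirm before the pipeline proper starts
    answer = metadata_utils.get_confirmation(args.steps,
                                             metadata_utils.get_general_data())
    if answer != 'c':
        sys.exit("Metadata not confirmed - exiting")
    return data_object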
def load_database(self, lane_keys):
    """
    """
    logger.info("Starting load VAMPS data")
    # self.taxes_file = os.path.join(self.outdir,'vamps_data_cube_uploads.txt')
    # self.summed_taxes_file = os.path.join(self.outdir,'vamps_junk_data_cube_pipe.txt')
    # self.distinct_taxes_file = os.path.join(self.outdir,'vamps_taxonomy_pipe.txt')
    # self.sequences_file = os.path.join(self.outdir,'vamps_sequences_pipe.txt')
    # self.export_file = os.path.join(self.outdir,'vamps_export_pipe.txt')
    # self.projects_datasets_file = os.path.join(self.outdir,'vamps_projects_datasets_pipe.txt')
    # self.projects_info_file = os.path.join(self.outdir,'vamps_projects_info_pipe.txt')

    # USER: vamps_db_tables
    data_cube_table = 'vamps_data_cube_uploads'
    summed_cube_table = 'vamps_junk_data_cube_pipe'
    taxonomy_table = 'vamps_taxonomy_pipe'
    sequences_table = 'vamps_sequences_pipe'
    exports_table = 'vamps_export_pipe'
    info_table_user = '******'
    info_table = 'vamps_projects_info'
    datasets_table = 'vamps_projects_datasets_pipe'
    users_table = 'vamps_users'

    # We only have a single project and dataset here:
    # if the project is new then we add the data to the upload_info and projects_datasets_pipe table
    # but if the project is not new:
    #   check if the existing project belongs to the user
    #     if it does then UPDATE the line in upload_info table and add line to projects_datasets_pipe table
    #       (maybe check if dataset already exists and die if yes)
    #     if the existing project doesn't belong to the owner then die with a warning to change project name
    #       (or maybe change the name by adding _user)
    if self.runobj.site == 'vamps':
        db_host = 'vampsdb'
        db_name = 'vamps'
    else:
        db_host = 'vampsdev'
        db_name = 'vamps'
    myconn = MyConnection(host=db_host, db=db_name)

    query = "SELECT project_name from %s where project_name='%s' \
             UNION \
             SELECT project_name from %s where project_name='%s' \
             " % (info_table_user, self.project, info_table, self.project)
    data = myconn.execute_fetch_select(query)
    if data:
        logger.info("found this project " + data[0][0] + " Exiting")
        sys.exit("Duplicate project name found; Canceling upload to database but your GASTed data are here: " + self.outdir)
    else:
        # project is unknown in database - continue
        #
        # DATA_CUBE
        #
        for line in open(self.taxes_file, 'r'):
            line = line.strip().split("\t")
            if line[0] == 'HEADER':
                continue
            # line = line[1:]  # remove leading empty tab
            qDataCube = "insert ignore into %s (project, dataset, taxon_string,superkingdom,phylum,class,\
                orderx,family,genus,species,strain,rank,knt,frequency,dataset_count,classifier)\
                VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                % (data_cube_table,
                   line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7],
                   line[8], line[9], line[10], line[11], line[12], line[13], line[14], line[15])
            myconn.execute_no_fetch(qDataCube)
        #
        # SUMMED (JUNK) DATA_CUBE
        #
        for line in open(self.summed_taxes_file, 'r'):
            line = line.strip().split("\t")
            if line[0] == 'HEADER':
                continue
            # line = line[1:]  # remove leading empty tab
            # taxonomy sum_tax_counts frequency dataset_count rank project dataset project--dataset classifier
            qSummedCube = "insert ignore into %s (taxon_string,knt, frequency, dataset_count, rank, project, dataset, project_dataset, classifier)\
                VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                % (summed_cube_table,
                   line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7], line[8])
            myconn.execute_no_fetch(qSummedCube)
        #
        # TAXONOMY
        #
        for line in open(self.distinct_taxes_file, 'r'):
            line = line.strip().split("\t")
            if line[0] == 'HEADER':
                continue
            # line = line[1:]  # remove leading empty tab
            qTaxonomy = "insert ignore into %s (taxon_string,rank,num_kids)\
                VALUES('%s','%s','%s')" \
                % (taxonomy_table, line[0], line[1], line[2])
            myconn.execute_no_fetch(qTaxonomy)
        #
        # SEQUENCES
        #
        for line in open(self.sequences_file, 'r'):
            line = line.strip().split("\t")
            if line[0] == 'HEADER':
                continue
            # line = line[1:]  # remove leading empty tab
            # project dataset taxonomy refhvr_ids rank seq_count frequency distance read_id project_dataset
            qSequences = "insert ignore into %s (sequence,project, dataset, taxonomy,refhvr_ids,rank,seq_count,frequency,distance,rep_id, project_dataset)\
                VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                % (sequences_table,
                   line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7], line[8], line[9], line[10])
            myconn.execute_no_fetch(qSequences)
        #
        # PROJECTS_DATASETS
        #
        for line in open(self.projects_datasets_file, 'r'):
            line = line.strip().split("\t")  # [1:] # split and remove the leading 'zero'
            if line[0] == 'HEADER':
                continue
            qDatasets = "insert ignore into %s (project, dataset, dataset_count,has_tax,date_trimmed,dataset_info)\
                VALUES('%s','%s','%s','%s','%s','%s')" \
                % (datasets_table, line[0], line[1], line[2], line[3], line[4], line[5])
            myconn.execute_no_fetch(qDatasets)
        #
        # INFO
        #
        for line in open(self.projects_info_file, 'r'):
            line = line.strip().split("\t")  # [1:] # split on tab and remove the leading 'zero'
            if line[0] == 'HEADER':
                continue
            qInfo = "insert into %s (project_name, title, description, contact, email, institution, user, env_source_id)\
                VALUES('%s','%s','%s','%s','%s','%s','%s','%s')" \
                % (info_table_user, line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7])
            myconn.execute_no_fetch(qInfo)
        #
        # USERS
        #
        qUser = "******" \
            % (users_table, self.project, self.runobj.user)
        myconn.execute_no_fetch(qUser)

    logger.info("Finished load VAMPS data")
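# For reference, and assumed from the info() writer earlier in this section:
# vamps_projects_info_pipe.txt (read by the INFO loop above) is tab-delimited
# with one HEADER row followed by one data row per project, e.g. (all values
# below are placeholders):
#
#   HEADER  project     title  description  contact   email             institution  user  env_source_id
#   0       MyProj_Bv6  title  description  Jane Doe  jdoe@example.org  MBL          jdoe  100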
class MetadataUtils: """ Class to read metadata files (csv and ini style) validate and create a dictionary from them Two parts: 1) From pipeline-ui.py to validate the input args 2) From runconfig.py to write the final ini file and create the dictionary that is used to create the run object """ Name = "MetadataUtils" def __init__(self, command_line_args = None, configuration_dictionary = None): self.args = command_line_args self.general_config_dict = configuration_dictionary self.known_header_list = C.csv_header_list self.pipeline_run_items = C.pipeline_run_items self.primer_suites = C.primer_suites self.dna_regions = C.dna_regions self.data_object = {} self.data_object['general'] = {} self.warn_msg = """\n\tThe config File seems to be okay. If the items above look correct then press 'c' to continue the pipeline\n""" def convert_and_save_ini(self): new_ini_file = os.path.join(self.general_config_dict['output_dir'],self.general_config_dict['run'] + '.ini') #new_ini_file = os.path.join(self.general_config_dict['output_dir'],self.general_config_dict['run'],self.general_config_dict['run'] + '.ini') # converts csv to ini and saves to output_dir if self.general_config_dict['platform'] == 'vamps': self.save_ini_file(new_ini_file) else: self.convert_csv_to_ini(new_ini_file) self.general_config_dict['configPath'] self.general_config_dict['configPath_original'] = self.general_config_dict['configPath'] self.general_config_dict['configPath'] = new_ini_file # change path and type to new ini # regardless of what they were before def validate(self): if self.general_config_dict['platform'] == 'illumina': self.warn_msg = self.validate_illumina_ini() elif self.general_config_dict['platform'] == '454': data = self.validate_454_ini() elif self.general_config_dict['platform'] == 'ion_torrent': pass elif self.general_config_dict['platform'] == 'vamps': data = self.validate_vamps_ini() else: sys.exit("Unknown platform and configFile type for validation") return self.data_object def get_general_data(self): """ """ return self.data_object['general'] # def create_dictionary_from_ini(self): # """ # # read an ini config file and convert to a dictionary # """ # import ConfigParser # if os.path.exists(self.general_config_dict['configPath']): # data_object = {} # user_config = ConfigParser.ConfigParser() # user_config.read(self.general_config_dict['configPath']) # # for section in user_config.sections(): # # section_dict = data_object[section] = {} # for option in user_config.options(section): # section_dict[option] = user_config.get(section,option) # # else: # print "error could not open config file: ",self.general_config_dict['configPath'] # # return data_object # def get_command_line_items(self, general_data): # # # command line items take precedence over ini file items of the same name # # defaults should be here and NOT in argparse/commandline # if self.args.input_dir: # general_data['input_dir'] = self.args.input_dir # else: # if not general_data['input_dir']: # general_data['input_dir'] = './' # # if self.args.run: # general_data['run'] = self.args.run # general_data['run_date'] = self.args.run # else: # if 'run' in general_data: # general_data['run_date'] = general_data['run'] # elif 'run_date' in general_data: # general_data['run'] = general_data['run_date'] # else: # sys.exit("Cannot find the run or run_date on command line or in config file - Exiting") # # make sure RUN is before OUTPUT_DIR # try: # general_data['output_dir'] = os.path.join(self.args.baseoutputdir,self.args.run) # except: # if 'output_dir' not 
in general_data: # general_data['output_dir'] = os.path.join('.',self.args.run) # #getattr(args,'force_runkey', "") # # # if self.args.platform: # general_data['platform'] = self.args.platform # else: # if 'platform' not in general_data: # sys.exit("Cannot find the platform from command line or in config file - Exiting") # # # if self.args.input_file_format: # general_data['input_file_format'] = self.args.input_file_format # else: # if 'input_file_format' not in general_data: # general_data['input_file_format'] = '' # if self.args.input_file_suffix: # general_data['input_file_suffix'] = self.args.input_file_suffix # else: # if 'input_file_suffix' not in general_data: # general_data['input_file_suffix'] = '' # # return general_data # def validate_454_csv(self, args, my_csv): # print "TODO: write validate def for 454/csv" # data_object = self.populate_data_object_454(args, my_csv) def validate_vamps_ini(self): # configPath is the new configPath self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) def validate_454_ini(self): print "TODO - write validation def for 454/ini" #self.data_object = self.create_dictionary_from_ini() # 454 ini file requirements: def validate_illumina_ini(self): """ The csv headers are checked earlier """ print "Validating ini type Config File (may have been converted from csv)" return_code = False error_code = False warn_code = False msg = '' error=False warn=False #print 'configpath',self.general_config_dict['configPath'] # configPath here is the new configPath self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) (error_code,warn_code) = self.check_for_missing_values(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_for_datasets(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_domain_suite_region(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_project_name(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_projects_and_datasets(self.data_object) if error_code: error=True if warn_code: warn=True #print self.data_object['input_dir'] #print self.data_object['input_files'] if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']: logger.warning("No input directory and no input files") warn=True elif not os.path.isdir(self.data_object['general']['input_dir']): logger.error("That is not a directory: "+self.data_object['general']['input_dir']) error=True elif self.data_object['general']['input_file_format'] == 'fastq' and self.data_object['general']['platform'] == 'illumina': file_exists = False # if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']: for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']): # if not filenames: for file_name in filenames: if os.path.isfile(os.path.join(dirname, file_name)): file_exists = True break if not file_exists: logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) error=True elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']): logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) 
error=True if error: sys.exit( """\n\tTHERE WERE SEVERE PROBLEMS WITH THE CONFIG FILE - EXITING PLEASE CORRECT THEM AND START OVER.\n To view the errors add ' --loglevel info' to the command line.\n""") elif warn: msg = """\n\tTHERE WERE NON-FATAL PROBLEMS WITH THE CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\n To view the warnings add ' --loglevel warning' to the command line.\n""" return msg def validate_dictionary(self, config_info): """ This is only used for data that comes in as a dictionary rather than a file such as with vamps user uploads """ print "TODO - Validating input dictionary" # must be a general section # should I create a dict here??? -That would render much code in # runconfig useless. # are we going to continue developing ini style config files if # no one uses them? configDict = config_info return configDict def populate_data_object_454(self, args): data = {} data['general'] = {} test_datasets = {} dataset_counter = {} headers = '' if self.runobj: infile = self.runobj.configPath else: infile = args.configPath data['general']['input_dir'] = args.input_dir #data['general']['output_dir'] = os.path.join(args.output_dir,args.run) data['general']['output_dir'] = args.output_dir data['general']['platform'] = args.platform data['general']['run'] = args.run #data['general']['run_date'] = args.run data['general']["input_file_format"] = args.input_file_format data['general']["input_file_suffix"] = args.input_file_suffix return data['general'] # def populate_data_object_illumina(self, args, my_csv): # data = {} # data['general'] = {} # test_datasets = {} # dataset_counter = {} # headers = '' # if self.run: # infile = self.run.configPath # data['general']['input_dir'] = self.run.input_dir # #megadata['general']['output_dir'] = self.args.output_dir # data['general']['platform'] = self.run.platform # data['general']['run'] = self.run.run_date # #data['general']['run_date'] = self.run.run_date # #megadata['general']['run'] = self.args.run # data['general']["input_file_format"] = self.run.input_file_format # #input_dir,"/xraid2-2/sequencing/Illumina/20120525_recalled/Project_Sandra_v6/analysis/" # data['general']["input_file_suffix"] = self.run.input_file_suffix # else: # infile = args.configPath # data['general']['input_dir'] = args.input_dir # #data['general']['output_dir'] = os.path.join(args.output_dir,args.run) # data['general']['output_dir'] = args.output_dir # data['general']['platform'] = args.platform # data['general']['run'] = args.run # #data['general']['run_date'] = args.run # #megadata['general']['run'] = self.args.run # data['general']["input_file_format"] = args.input_file_format # #input_dir,"/xraid2-2/sequencing/Illumina/20120525_recalled/Project_Sandra_v6/analysis/" # data['general']["input_file_suffix"] = args.input_file_suffix # # print "Validating csv type ConfigFile" # # # changes spaces to '_' and all lowercase # # temp = {} # # # # my_read_csv = readCSV(file_path = infile) # # my_read_csv.put_run_info() # # print "content[1].keys(): " # # print content[1].keys() # # # To see the list of statistics available for each line # # for k, v in content.items(): # # print k, v['dataset'], v # content = my_csv.read_csv() # headers = content[1].keys() # headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers] # projects = {} # if self.check_headers(headers_clean): # # # # # try: # # temp[headers[n]] = lst[n] # # except: # # sys.exit("ERROR:It looks like the header count and the data column count are different.") # for k, v in content.items(): # run_key 
= v['run_key'].replace('N','').upper() # temp['file_prefix'] = v['dataset']+'_'+ run_key # # print "v = %s\n" % v # # v = {'barcode_index': 'ATCACG', 'project': 'JCR_SPO_Bv6', 'lane': '3', 'run': '20120613', 'dna_region': 'v6', 'adaptor': '', # # 'barcode': '', 'seq_operator': 'JV', 'overlap': 'complete', 'dataset': 'H40', 'run_key': 'NNNNACGCA', 'read_length': '101', # # 'file_prefix': 'H40', 'data_owner': 'jreveillaud', 'primer_suite': 'Bacterial v6 Suite', 'tubelabel': 'H40', 'amp_operator': 'JR', 'insert_size': '230'}; # # temp['file_prefix'] = H40_ # unique_identifier = v['barcode_index']+'_'+run_key+'_'+v['lane'] # data[unique_identifier] = {} # if unique_identifier in test_datasets: # sys.exit("ERROR: duplicate run_key:barcode_index:lane: "+unique_identifier+" - Exiting") # else: # test_datasets[unique_identifier] = 1 # # print "test_datasets = %s;\ntemp['file_prefix'] = %s\nunique_identifier = %s" % (test_datasets,temp['file_prefix'], unique_identifier) # # data[unique_identifier]['dataset'] = v['dataset'] # data[unique_identifier]['project'] = v['project'] # # if v['project'] in dataset_counter: # dataset_counter[v['project']] += 1 # else: # dataset_counter[v['project']] = 1 # # #megadata[unique_identifier]['ds_count'] = 1 # data[unique_identifier]['project'] = v['project'] # data[unique_identifier]['run_key'] = v['run_key'] # data[unique_identifier]['lane'] = v['lane'] # data[unique_identifier]['tubelabel'] = v['tubelabel'] # data[unique_identifier]['barcode'] = v['barcode'] # data[unique_identifier]['adaptor'] = v['adaptor'] # data[unique_identifier]['dna_region'] = v['dna_region'] # data[unique_identifier]['amp_operator'] = v['amp_operator'] # data[unique_identifier]['seq_operator'] = v['seq_operator'] # data[unique_identifier]['barcode_index'] = v['barcode_index'] # data[unique_identifier]['overlap'] = v['overlap'] # data[unique_identifier]['insert_size'] = v['insert_size'] # data[unique_identifier]['file_prefix'] = v['file_prefix'] # data[unique_identifier]['read_length'] = v['read_length'] # data[unique_identifier]['primer_suite'] = v['primer_suite'] # data[unique_identifier]['first_name'] = v['first_name'] # data[unique_identifier]['last_name'] = v['last_name'] # data[unique_identifier]['email'] = v['email'] # data[unique_identifier]['institution'] = v['institution'] # data[unique_identifier]['project_title'] = v['project_title'] # data[unique_identifier]['project_description'] = v['project_description'] # data[unique_identifier]['funding'] = v['funding'] # data[unique_identifier]['env_sample_source'] = v['env_sample_source'] # data[unique_identifier]['dataset_description'] = v['dataset_description'] # for item in data: # if item != 'general': # data[item]['primer_suite'] = data[item]['primer_suite'].lower().replace(" ", "_") # data[item]['dna_region'] = data[item]['dna_region'].lower().replace(" ", "_") # data[item]['barcode'] = data[item]['barcode'].upper() # data[item]['barcode_index'] = data[item]['barcode_index'].upper() # data[item]['ds_count'] = str(dataset_counter[data[item]['project']]) # # # return data def get_input_files(self): files_list = [] print self.general_config_dict['input_dir'] if os.path.isdir(self.general_config_dict['input_dir']): for infile in glob.glob( os.path.join(self.general_config_dict['input_dir'], '*') ): if os.path.isdir(infile) == True: pass else: files_list.append(os.path.basename(infile)) else: if fasta_file: pass logger.warning("No input directory or directory permissions problem: "+self.general_config_dict['input_dir']) return 
files_list def check_for_input_files(self,data_object): file_count = 0 files_list = [] imports_list = [] lanes_list = [] #input_dir = os.path.join(data_object['general']['input_dir'],"fasta") input_dir = data_object['general']['input_dir'] if os.path.isdir(input_dir): p = data_object['general']['input_dir'], '*'+data_object['general']['input_file_suffix'] for infile in glob.glob( os.path.join(input_dir, '*'+data_object['general']['input_file_suffix']) ): files_list.append(os.path.basename(infile)) for x in data_object: if 'file_prefix' in data_object[x]: pass #print data_object[x]['file_prefix'] #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']: #lanes_list.append(data_object[x]['lane']) file_count += 1 else: logger.info("No input directory or directory permissions problem: "+input_dir) print "No input directory or directory permissions problem: "+input_dir if not file_count: #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") data_object['general']['files_list'] = files_list data_object['general']['file_count'] = file_count # all the files in an illumina directory should be the same type #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count #data_object['general']['lanes_list'] = lanes_list #print "Files LIST",data_object['general']['files_list'] return data_object def check_for_missing_values(self, data): missing_key = '' error = False warn = False for item in data: if item == 'general': for k,v in data[item].iteritems(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") warn=True if not v: logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") warn=True for item in data: if item != 'general': for k,v in data[item].iteritems(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") warn=True if not v: if (k == 'barcode' or k == 'adaptor'): #these could be empty logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") else: logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") error=True return (error,warn) def check_for_datasets(self,data): error = False warn=False for item in data: if item != 'general': #print 'ds',data[item]['dataset'] if not data[item]['dataset']: #if 'dataset' not in data[item]: logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")") error=True return (error,warn) def check_domain_suite_region(self,data): error = False warn=False for item in data: if item != 'general': # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region" if data[item]['primer_suite'] not in self.primer_suites: logger.error("Primer Suite not found: "+data[item]['primer_suite']+" - Exiting (key: "+item+")") error=True #if dataset_items['domain'] not in domains: # sys.exit("ERROR: Domain not found: "+dataset_items['domain']) if data[item]['dna_region'] not in self.dna_regions: logger.error("DNA Region not found: "+data[item]['dna_region']+" - Exiting (key: "+item+")") error=True # "Bacterial v6","BacterialV6Suite","v6" #if dataset_items['domain'][:6] != 
dataset_items['primer_suite'][:6]: # sys.exit("ERROR: Domain ("+dataset_items['domain']+") -- Primer Suite ("+dataset_items['primer_suite']+") mismatch.") #if dataset_items['domain'][-2:].lower() != dataset_items['dna_region'].lower(): # sys.exit("ERROR: DNA Region ("+dataset_items['dna_region']+") -- Domain ("+dataset_items['domain']+") mismatch.") if data[item]['dna_region'] not in data[item]['primer_suite']: logger.error("DNA Region ("+data[item]['dna_region']+") not found in Primer Suite ("+data[item]['primer_suite']+") - Exiting (key: "+item+")") error=True return (error,warn) def check_project_name(self,data): """ # CHECK: project name format: 3 parts; end with Bv6,Ev9,Av6 or something similar """ error =False warn =False for item in data: if item != 'general': try: (a,b,c) = data[item]['project'].split('_') except: logger.error("project not in correct format: "+data[item]['project']+" - Exiting (key: "+data[item]+")") error=True (a,b,c) = data[item]['project'].split('_') #if c[0] not in [i[0].upper() for i in domains]: # sys.exit("ERROR : Project suffix has incorrect/non-existant domain: "+c) if c[1:] not in self.dna_regions: logger.error("Project suffix has incorrect DNA region: "+c+" - Exiting (key: "+data[item]+")") error = True return (error,warn) def check_projects_and_datasets(self,data): self.my_conn = MyConnection(host='newbpcdb2', db="env454") project_dataset = {} projects = {} datasets = {} error =False warn =False for item in data: if item != 'general': #project_dataset[data[item]['project']+'--'+data[item]['dataset']] = 1 datasets[data[item]['dataset']] = data[item]['project'] projects[data[item]['project']] = 1 for p in projects: #print p my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p) res = self.my_conn.execute_fetch_select(my_sql) if res: logger.warning("project '"+p+"' already exists in the database - is this okay?") warn = True else: logger.debug("project '"+p+"' is new") ds_found_count = 0 for d in datasets: if datasets[d] == p: #print "\t%s" % (d) my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d) res = self.my_conn.execute_fetch_select(my_sql) if res: ds_found_count += 1 if ds_found_count >3: logger.warning("\t\tPossibly more .... - Exiting after just three") break logger.warning("\tdataset '"+d+"' already exists in the database - is this okay?") warn=True else: logger.debug("\tdataset '"+d+"' is new") logger.debug("\tDataset Count: "+str(len(datasets))) return (error,warn) def get_confirmation(self, steps, general_data): print "\n" for item,value in general_data.iteritems(): #print len(value) if type(value) != bool and len(value) > 80: tmp = value.split(',') print "%20s = %s .. %s" % (item,tmp[0],tmp[-1]) else: print "%20s = %-20s" % (item,value) print "\nStep(s) to be performed: ",steps print "\n"+self.warn_msg+"\n" if 'validate' in steps.split(','): # print we are done sys.exit() print os.uname() print os.uname()[1] if os.uname()[1] == 'ashipunova.mbl.edu' or os.uname()[1] == 'as-macbook.local': return "c" else: return raw_input("\nDoes this look okay? 
(q to quit, v to view configFile, c to continue) ") def convert_csv_to_ini(self,new_ini_file): #print self.args from pipeline.get_ini import readCSV print 'CSV path',self.general_config_dict['csvPath'] my_csv = readCSV(file_path = self.general_config_dict['csvPath']) content = my_csv.read_csv() headers = content[1].keys() headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers] projects = {} #print #print content[1] #print # get list of keys keys_list = [] if self.check_headers(headers_clean): logger.info("CSV headers okay") for k,values in content.iteritems(): keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane']) fh = open(new_ini_file,'w') # general section fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") fh.write("[general]\n") fh.write("run = "+self.general_config_dict['run']+"\n") fh.write("configPath = "+new_ini_file+"\n") fh.write("configPath_orig = "+self.general_config_dict['configPath']+"\n") fh.write("platform = "+self.general_config_dict['platform']+"\n") fh.write("output_dir = " + self.general_config_dict['output_dir']+"\n") #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n") if self.general_config_dict['platform'] == 'illumina': #fh.write("input_file_suffix = " + self.general_config_dict['input_file_suffix']+"\n") fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n") fh.write("anchor_file = " + self.general_config_dict['anchor_file']+"\n") fh.write("primer_file = " + self.general_config_dict['primer_file']+"\n") fh.write("compressed = " + str(self.general_config_dict['compressed'])+"\n") fh.write("database_host = " + self.general_config_dict['database_host']+"\n") fh.write("database_name = " + self.general_config_dict['database_name']+"\n") fh.write("input_dir = " + self.general_config_dict['input_dir']+"\n") fh.write("require_distal = " + str(self.general_config_dict['require_distal'])+"\n") fh.write("use_cluster = " + str(self.general_config_dict['use_cluster'])+"\n") fh.write("date = " + str(datetime.date.today())+"\n") fh.write("idx_keys = " +','.join(keys_list)+"\n") if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() fh.write("input_files = " + ','.join(file_list)+"\n") else: fh.write("input_files = \n") #fh.write(getattr(args,'force_runkey', "")) for k,values in content.iteritems(): fh.write("\n") if self.general_config_dict['platform'] == 'illumina': fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n") elif self.general_config_dict['platform'] == '454': fh.write("["+values['lane']+"_"+values['run_key']+"]\n") for v in values: fh.write(v+" = "+values[v]+"\n") fh.close() return new_ini_file def save_ini_file(self,new_ini_file): # give it a new name out_fh = open(new_ini_file,'w') #for line in open(os.path.abspath(self.general_config_dict['configPath']),"r"): # out_fh.write(line) self.general_config_dict['configPath_original'] = self.general_config_dict['configPath'] self.general_config_dict['configPath'] = new_ini_file out_fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") out_fh.write("[general]\n") for item in self.general_config_dict: out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") 
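        # NOTE: every key currently in general_config_dict is written into the [general] section here;
        # the commented-out block below would instead have split platform-specific items
        # (those not in C.general_run_items) into their own section.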
#out_fh.write("\n["+self.general_config_dict['platform']+"]\n") #for item in self.general_config_dict: # if item not in C.general_run_items: # out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") if 'fasta_file' in self.general_config_dict and self.general_config_dict['fasta_file'] != '': (path,fasta) = os.path.split(self.general_config_dict['fasta_file']) if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != path: sys.exit("Your input_dir and fasta_file directory don't agree - Exiting\n\t"+self.general_config_dict['input_dir']+" != "+self.general_config_dict['fasta_file']) out_fh.write("input_dir = "+path+"\n") out_fh.write("input_files = "+fasta+"\n") #out_fh.write("input_file_suffix = fasta\n") elif 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() out_fh.write("input_files = " + ','.join(file_list)+"\n") else: out_fh.write("input_files = \n") out_fh.close() def check_headers(self,headers): if self.general_config_dict['platform']=='illumina': known_header_list= self.known_header_list['illumina'] elif self.general_config_dict['platform'] == '454': known_header_list = self.known_header_list['454'] else: logger.error("in utils: check_headers - unknown platform") #print sorted(known_header_list) #print sorted(headers) if sorted(known_header_list) != sorted(headers): print "="*40 print "csv file header problem" print "%-20s %-20s" % ("REQUIRED", "YOUR CSV") for i in sorted(known_header_list): if i in headers: print "%-20s%-20s" % (i,i) else: print "%-20s%-20s" % (i,"----------- <--- missing") for i in headers: if i not in known_header_list: print "%-20s%-20s" % (" ",i+" <--- extra") print "="*40 sys.exit("ERROR : unknown or missing headers\n") else: return True def configDictionaryFromFile_ini(self,config_file_path): import ConfigParser configDict = {} user_config = ConfigParser.ConfigParser() user_config.read(config_file_path) for section in user_config.sections(): section_dict = configDict[section] = {} for option in user_config.options(section): section_dict[option] = user_config.get(section,option) if section_dict[option] == 'True' or section_dict[option] == 'true': section_dict[option] = True elif section_dict[option] == 'False' or section_dict[option] == 'false': section_dict[option] = False return configDict def get_values(self, args, general_config_dict = {} ): collector={} for item in self.pipeline_run_items[args.platform]: # set collector[item] to the default first collector[item] = self.pipeline_run_items[args.platform][item] # now look for args (then ini) values to replace if item in args and getattr( args, item ) != None: collector[item] = getattr( args, item ) elif general_config_dict and item in general_config_dict[args.platform] and general_config_dict[args.platform][item] != '': collector[item] = general_config_dict[args.platform][item] # get all the items from general_config_dict['general'] if 'general' in general_config_dict: for item in general_config_dict['general']: collector[item] = general_config_dict['general'][item] return collector def validate_args(self): """ # THOUGHTS # vamps users # single project and dataset # Supply an ini file OR commandline (for web interface), but no csv file # # MBL pipeline # REQUIRE a csv file and a ini file """ collector={} if self.args.configPath: general_config_dict = self.configDictionaryFromFile_ini(self.args.configPath) if self.args.platform in general_config_dict and 'general' in general_config_dict: 
collector= self.get_values( self.args, general_config_dict) else: sys.exit("The ini file needs both a [general] and ["+ self.args.platform +"] section - Exiting.") else: # no configPath collector= self.get_values( self.args ) if self.args.platform == 'illumina': print "Illumina Pipeline" if not self.args.csvPath: sys.exit("illumina requires a csv file - Exiting") elif self.args.platform == 'vamps': print "VAMPS Pipeline:" if 'project' not in collector or collector['project'] == '': collector['project'] = collector['project'][:1].capitalize() + collector['project'][1:] else: logger.debug("No project found in vamps pipeline") if self.args.fasta_file: collector['project'] = self.args.fasta_file collector['from_fasta'] = True elif self.args.platform == '454': print "454 Pipeline" elif self.args.platform == 'ion_torrent': print "Ion Torrent Pipeline" else: sys.exit("Validate args: Unknown Platform") if self.args.configPath: collector['configPath'] = self.args.configPath else: collector['configPath'] = "" # these are all the bool items in the collector # they need to be converted fron str to bool here for i in collector: if collector[i] == 'True' or collector[i] == 'true': collector[i] = True elif collector[i] == 'False' or collector[i] == 'false': collector[i] = False #collector['runcode'] = self.args.run collector['run'] = self.args.run #collector['run_date'] = self.args.run #collector['steps'] = self.args.steps collector['platform'] = self.args.platform if self.args.input_dir: collector['input_dir'] = self.args.input_dir collector['date'] = str(datetime.date.today()) print collector return collector
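
# --- Illustrative sketch (not part of the pipeline API) -----------------------
# check_project_name() expects a project name made of exactly three
# underscore-separated parts whose suffix encodes a domain letter plus the
# DNA region (e.g. "JCR_SPO_Bv6" from the commented example row above).
# The helper below is a minimal, standalone restatement of that rule; the
# function name and the region list used here are illustrative assumptions only.
def _sketch_project_name_ok(project, dna_regions=("v3", "v4", "v6", "v9", "v4v5")):
    """Return True if `project` looks like '<group>_<code>_<DomainLetter><region>'."""
    parts = project.split('_')
    if len(parts) != 3:
        return False
    suffix = parts[2]
    # mirrors check_project_name(): accept "<letter><region>" (Bv6) or a bare region (v6)
    return suffix[1:].lower() in dna_regions or suffix.lower() in dna_regions

# e.g. _sketch_project_name_ok("JCR_SPO_Bv6") -> True
# -------------------------------------------------------------------------------
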
class MetadataUtils:
    """
    Class to read metadata files (csv and ini style),
    validate them and create a dictionary from them.
    Two parts:
    1) From pipeline-ui.py to validate the input args
    2) From runconfig.py to write the final ini file and create the dictionary
       that is used to create the run object
    """
    Name = "MetadataUtils"

    def __init__(self, command_line_args = None, configuration_dictionary = None):
        self.args = command_line_args
        self.general_config_dict = configuration_dictionary
        self.known_header_list = C.csv_header_list
        self.pipeline_run_items = C.pipeline_run_items
        self.primer_suites = self.convert_primer_suites(C.primer_suites)
        self.dna_regions = C.dna_regions
        self.data_object = {}
        self.data_object['general'] = {}
        self.warn_msg = """\n\tThe config File seems to be okay. If the items above look correct then press 'c' to continue the pipeline\n"""
        self.res_headers = []
        self.env = {}
        self.utils = PipelneUtils()

    def convert_and_save_ini(self, analysis_dir):
        new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
        #new_ini_file = os.path.join(self.general_config_dict['output_dir'],self.general_config_dict['run'],self.general_config_dict['run'] + '.ini')
        # converts csv to ini and saves to output_dir
        if self.general_config_dict['platform'] == 'vamps':
            self.save_ini_file(new_ini_file)
        else:
            self.convert_csv_to_ini(new_ini_file)
        self.general_config_dict['configPath'] = new_ini_file  # change path and type to new ini
        # regardless of what they were before

    def validate(self, analysis_dir):
        if self.general_config_dict['platform'] in C.illumina_list:
            self.warn_msg = self.validate_illumina_ini(analysis_dir)
        elif self.general_config_dict['platform'] == '454':
            data = self.validate_454_ini(analysis_dir)
        elif self.general_config_dict['platform'] == 'ion_torrent':
            pass
        elif self.general_config_dict['platform'] == 'vamps':
            data = self.validate_vamps_ini(analysis_dir)
        else:
            sys.exit("Unknown platform and configFile type for validation")
        return self.data_object

    def get_general_data(self):
        """ """
        return self.data_object['general']

    def validate_vamps_ini(self, analysis_dir):
        # configPath is the new configPath
        'todo: Andy, what should be here, just directory name or directory + number.ini?'
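        # By this point convert_and_save_ini() has pointed configPath at the freshly written
        # per-run ini, so re-reading it below yields the merged settings; the only additional
        # check is that any fasta_file named there actually exists on disk.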
self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) if 'fasta_file' in self.data_object and not os.path.exists(self.data_object['fasta_file']): sys.exit("Fasta file path doesn't exist: "+self.data_object['fasta_file'] ) elif 'fasta_file' in self.data_object['general'] and not os.path.exists(self.data_object['general']['fasta_file']): sys.exit("Fasta file path doesn't exist: "+self.data_object['general']['fasta_file'] ) def validate_454_ini(self, analysis_dir): print("TODO - write validation def for 454/ini") #self.data_object = self.create_dictionary_from_ini() # 454 ini file requirements: def validate_illumina_ini(self, analysis_dir): """ The csv headers are checked earlier """ print("Validating ini type Config File (may have been converted from csv)") new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini') print("New ini file location: "+new_ini_file) return_code = False error_code = False warn_code = False msg = '' error=False warn=False #print('configpath',self.general_config_dict['configPath']) # configPath here is the new configPath self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) (error_code,warn_code) = self.check_for_missing_values(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_for_datasets(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_domain_suite_region(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_project_name(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_dataset_name(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_projects_and_datasets(self.data_object) if error_code: error=True if warn_code: warn=True #print(self.data_object['input_dir']) #print(self.data_object['input_files']) if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']: logger.warning("No input directory and no input files") warn=True elif not os.path.isdir(self.data_object['general']['input_dir']): logger.error("That is not a directory: "+self.data_object['general']['input_dir']) error=True elif self.data_object['general']['input_file_format'] == 'fastq' and self.data_object['general']['platform'] in C.illumina_list: file_exists = False # if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']: for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']): # if not filenames: for file_name in filenames: if os.path.isfile(os.path.join(dirname, file_name)): file_exists = True break if not file_exists: logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) error=True elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']): logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) error=True if error: sys.exit( """\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING PLEASE CORRECT THEM AND START OVER.\033[0m\n To view the errors add ' --loglevel info' to the command line.\n""") elif warn: msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV 
and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n To view the warnings add ' --loglevel warning' to the command line.\n""" print("\033[92mCSV File Passed Vaidation! (with warnings)\033[0m") else: print("\033[92mCSV File Passed Vaidation!\033[0m") return msg def validate_dictionary(self, config_info): """ This is only used for data that comes in as a dictionary rather than a file such as with vamps user uploads """ print("TODO - Validating input dictionary") # must be a general section # should I create a dict here??? -That would render much code in # runconfig useless. # are we going to continue developing ini style config files if # no one uses them? configDict = config_info return configDict def populate_data_object_454(self, args): data = {} data['general'] = {} test_datasets = {} dataset_counter = {} headers = '' if self.runobj: infile = self.runobj.configPath else: infile = args.configPath data['general']['input_dir'] = args.input_dir #data['general']['output_dir'] = os.path.join(args.output_dir,args.run) data['general']['output_dir'] = args.output_dir data['general']['platform'] = args.platform data['general']['run'] = args.run #data['general']['run_date'] = args.run data['general']["input_file_format"] = args.input_file_format data['general']["input_file_suffix"] = args.input_file_suffix return data['general'] def get_input_files(self): files_list = [] if os.path.isdir(self.general_config_dict['input_dir']): for infile in glob.glob( os.path.join(self.general_config_dict['input_dir'], '*') ): if os.path.isdir(infile) == True: for infile2 in glob.glob( os.path.join( infile,'*') ): if os.path.isdir(infile2) == True: pass else: sub_dir = os.path.basename(infile) files_list.append(os.path.join(sub_dir,os.path.basename(infile2))) else: files_list.append(os.path.basename(infile)) # else: # if fasta_file: # pass # logger.warning("No input directory or directory permissions problem: "+self.general_config_dict['input_dir']) return files_list def check_for_input_files(self, data_object): file_count = 0 files_list = [] imports_list = [] lanes_list = [] #input_dir = os.path.join(data_object['general']['input_dir'],"fasta") input_dir = data_object['general']['input_dir'] if os.path.isdir(input_dir): p = data_object['general']['input_dir'], '*'+data_object['general']['input_file_suffix'] for infile in glob.glob( os.path.join(input_dir, '*'+data_object['general']['input_file_suffix']) ): files_list.append(os.path.basename(infile)) for x in data_object: if 'file_prefix' in data_object[x]: pass #print(data_object[x]['file_prefix']) #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']: #lanes_list.append(data_object[x]['lane']) file_count += 1 else: logger.info("No input directory or directory permissions problem: "+input_dir) print("No input directory or directory permissions problem: "+input_dir) if not file_count: #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") data_object['general']['files_list'] = files_list data_object['general']['file_count'] = file_count # all the files in an illumina directory should be the same type #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count #data_object['general']['lanes_list'] = lanes_list #print("Files LIST",data_object['general']['files_list']) return data_object def 
check_for_missing_values(self, data): missing_key = '' error = False warn = False for item in data: if item == 'general': for k,v in data[item].items(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") warn=True if v == '': logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") warn=True for item in data: if item != 'general': for k,v in data[item].items(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") warn=True if not v: if (k == 'barcode' or k == 'adaptor'): #these could be empty logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") else: logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") error=True return (error,warn) def check_for_datasets(self,data): error = False warn=False for item in data: if item != 'general': #print('ds',data[item]['dataset']) if not data[item]['dataset']: #if 'dataset' not in data[item]: logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")") error=True return (error,warn) def check_domain_suite_region(self,data): error = False warn=False for item in data: if item != 'general': primer_suite = self.convert_primer_suites(data[item]['primer_suite']) dna_region = self.convert_primer_suites(data[item]['dna_region']) # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region" if primer_suite not in self.primer_suites: logger.error("Primer Suite not found: "+primer_suite+" - Exiting (key: "+item+")") error=True if dna_region not in self.dna_regions: logger.error("DNA Region not found: "+dna_region+" - Exiting (key: "+item+")") error=True if dna_region not in primer_suite: logger.error("DNA Region ("+dna_region+") not found in Primer Suite ("+primer_suite+") - Exiting (key: "+item+")") error=True return (error, warn) def convert_primer_suites(self, suite): import re if type(suite) is list: conv_suite = [re.sub(r'[_ -]', '', item.lower()) for item in suite] if type(suite) is str: conv_suite = re.sub(r'[_ -]', '', suite.lower()) # suite.lower().translate(None, '_- ') return conv_suite def check_project_name(self, data): """ # CHECK: project name format: 3 parts; end with Bv6,Ev9,Av6 or something similar """ error =False warn =False for item in data: if item != 'general': try: (a,b,c) = data[item]['project'].split('_') except: logger.error("project not in correct format: ") logger.error(data[item]['project']) logger.error(" - Exiting (key: ") logger.error(data[item]) error=True (a,b,c) = data[item]['project'].split('_') #if c[0] not in [i[0].upper() for i in domains]: # sys.exit("ERROR : Project suffix has incorrect/non-existant domain: "+c) # logger.error("c[1:] = ") # logger.error(c[1:]) # logger.error("c.lower() =") # logger.error(c.lower()) # logger.error("self.dna_regions") # logger.error(self.dna_regions ) if (c[1:].lower() not in self.dna_regions) and (c.lower() not in self.dna_regions): logger.error("Project suffix has incorrect DNA region: ") logger.error(c) logger.error(" - Exiting (key: ") logger.error(data[item]) error = True return (error, warn) def check_dataset_name(self,data): """ # CHECK: dataset name can be ONLY alphanumeric and underscore and cannot start with a number! 
""" error =False warn =False for item in data: if item != 'general': dataset_name = data[item]['dataset'] if not re.match("^[A-Za-z0-9_]*$", dataset_name): logger.error("Dataset name has illeagal character(s): "+dataset_name+" (must be alphanumeric and underscore only)") error = True #if re.match("^[0-9]", dataset_name): # logger.error("Dataset name cannot begin with a digit: "+dataset_name) # error = True return (error, warn) def get_my_conn(self): try: host = self.general_config_dict['database_host'] except: raise try: db = self.general_config_dict['database_name'] except: raise if self.utils.is_local(): host = 'localhost' db = "test_env454" self.my_conn = MyConnection(host = host, db = db) def check_projects_and_datasets(self, data): self.get_my_conn() project_dataset = {} projects = {} datasets = {} error =False warn =False for item in data: if item != 'general': #project_dataset[data[item]['project']+'--'+data[item]['dataset']] = 1 datasets[data[item]['dataset']] = data[item]['project'] projects[data[item]['project']] = 1 for p in projects: #print(p) my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p) res = self.my_conn.execute_fetch_select(my_sql) if res: logger.warning("project '"+p+"' already exists in the database - is this okay?") warn = True else: logger.debug("project '"+p+"' is new") ds_found_count = 0 for d in datasets: if datasets[d] == p: #print("\t%s" % (d)) my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d) res = self.my_conn.execute_fetch_select(my_sql) if res: ds_found_count += 1 if ds_found_count >3: logger.warning("\t\tPossibly more .... - Exiting after just three") break logger.warning("\tdataset '"+d+"' already exists in the database - is this okay?") warn=True else: logger.debug("\tdataset '"+d+"' is new") logger.debug("\tDataset Count: "+str(len(datasets))) return (error,warn) def get_confirmation(self, steps, general_data): print("\n") for item,value in general_data.items(): #print(len(value)) if type(value) != bool and len(value) > 80: tmp = value.split(',') print("%-20s = %s .. %s" % (item,tmp[0],tmp[-1])) else: print("%-20s = %-20s" % (item,value)) print("\nStep(s) to be performed: \033[1;36m",steps,'\033[0m') print("\n"+self.warn_msg+"\n") if 'validate' in steps.split(','): # print(we are done) sys.exit() if self.utils.is_local(): return 'c' else: return 'c' # return raw_input("\nDoes this look okay? 
(q to quit, v to view configFile, c to continue) ") def convert_csv_to_ini(self, new_ini_file): #print(self.args) from pipeline.get_ini import readCSV print('CSV path', self.general_config_dict['csvPath']) my_csv = readCSV(file_path = self.general_config_dict['csvPath']) content = my_csv.read_csv() headers = content[1].keys() headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers] projects = {} #print #print(content[1]) #print # get list of keys keys_list = [] if self.check_headers(headers_clean): logger.info("CSV headers okay") for k,values in content.items(): keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane']) fh = open(new_ini_file,'w') # general section fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") fh.write("[general]\n") fh.write("run = "+self.general_config_dict['run']+"\n") fh.write("configPath = "+new_ini_file+"\n") fh.write("configPath_orig = " + self.general_config_dict['configPath']+"\n") fh.write("platform = " + self.general_config_dict['platform']+"\n") fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n") #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n") if self.general_config_dict['platform'] in C.illumina_list: #fh.write("input_file_suffix = " + self.general_config_dict['input_file_suffix']+"\n") fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n") fh.write("anchor_file = " + self.general_config_dict['anchor_file']+"\n") fh.write("primer_file = " + self.general_config_dict['primer_file']+"\n") fh.write("compressed = " + str(self.general_config_dict['compressed'])+"\n") fh.write("do_perfect = " + str(self.general_config_dict['do_perfect'])+"\n") fh.write("lane_name = " + str(self.general_config_dict['lane_name'])+"\n") fh.write("database_host = " + self.general_config_dict['database_host']+"\n") fh.write("database_name = " + self.general_config_dict['database_name']+"\n") fh.write("input_dir = " + self.general_config_dict['input_dir']+"\n") fh.write("require_distal = " + str(self.general_config_dict['require_distal'])+"\n") fh.write("use_cluster = " + str(self.general_config_dict['use_cluster'])+"\n") fh.write("date = " + str(datetime.date.today())+"\n") fh.write("site = " + self.general_config_dict['site']+"\n") fh.write("load_vamps_database = " + str(self.general_config_dict['load_vamps_database'])+"\n") fh.write("idx_keys = " +','.join(keys_list)+"\n") if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() fh.write("input_files = " + ','.join(file_list)+"\n") else: fh.write("input_files = \n") #fh.write(getattr(args,'force_runkey', "")) for k, values in content.items(): fh.write("\n") if self.general_config_dict['platform'] in C.illumina_list: fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n") elif self.general_config_dict['platform'] == '454': fh.write("["+values['lane']+"_"+values['run_key']+"]\n") for v in values: if v == "env_sample_source": try: new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0] except: text = """There was an error in env_sample_source. Please check your metadata. 
Possible values: ----------- air extreme habitat host associated human associated human-amniotic-fluid human-blood human-gut human-oral human-skin human-urine human-vaginal indoor microbial mat/biofilm miscellaneous_natural_or_artificial_environment plant associated sediment soil/sand unknown wastewater/sludge water-freshwater water-marine ----------- """ print(text) raise fh.write("env_sample_source_id = "+new_val+"\n") else: fh.write(v+" = "+values[v]+"\n") fh.close() return new_ini_file def save_ini_file(self,new_ini_file): # give it a new name out_fh = open(new_ini_file,'w') #for line in open(os.path.abspath(self.general_config_dict['configPath']),"r"): # out_fh.write(line) self.general_config_dict['configPath_original'] = self.general_config_dict['configPath'] self.general_config_dict['configPath'] = new_ini_file out_fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") out_fh.write("[general]\n") for item in self.general_config_dict: out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") #out_fh.write("\n["+self.general_config_dict['platform']+"]\n") #for item in self.general_config_dict: # if item not in C.general_run_items: # out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") if 'fasta_file' in self.general_config_dict and self.general_config_dict['fasta_file'] != '': (path,fasta) = os.path.split(self.general_config_dict['fasta_file']) if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != path: sys.exit("Your input_dir and fasta_file directory don't agree - Exiting\n\t"+self.general_config_dict['input_dir']+" != "+self.general_config_dict['fasta_file']) out_fh.write("input_dir = "+path+"\n") out_fh.write("input_files = "+fasta+"\n") #out_fh.write("input_file_suffix = fasta\n") elif 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() out_fh.write("input_files = " + ','.join(file_list)+"\n") else: out_fh.write("input_files = \n") out_fh.close() def check_headers(self, headers): if self.general_config_dict['platform'] in C.illumina_list: pl = self.general_config_dict['platform'] known_header_list = self.known_header_list[pl] elif self.general_config_dict['platform'] == '454': known_header_list = self.known_header_list['454'] else: logger.error("in utils: check_headers - unknown platform") #print( sorted(known_header_list)) #print(sorted(headers)) self.res_headers = headers if "env_sample_source" in headers: self.env_source_to_id(headers) if sorted(known_header_list) != sorted(self.res_headers): print("=" * 40) print("csv file header problem") print("%-20s %-20s" % ("REQUIRED", "YOUR CSV")) for i in sorted(known_header_list): if i in headers: print("%-20s%-20s" % (i,i)) else: print("%-20s%-20s" % (i,"----------- <--- missing")) for i in headers: if i not in known_header_list: print("%-20s%-20s" % (" ",i+" <--- extra")) print("=" * 40) sys.exit("ERROR : unknown or missing headers\n") else: return True def env_source_to_id(self, headers): logger.error("self.utils.is_local() LLL2 metadata") logger.error(self.utils.is_local()) if self.utils.is_local(): self.my_conn = MyConnection(host = 'localhost', db="test_env454") else: self.my_conn = MyConnection(host='bpcdb1', db="env454") # self.my_conn = MyConnection() my_sql = """SELECT * FROM env_sample_source""" self.env = self.my_conn.execute_fetch_select(my_sql) self.res_headers = ["env_sample_source_id" if x=="env_sample_source" 
else x for x in headers] def configDictionaryFromFile_ini(self, config_file_path): import configparser configDict = {} user_config = configparser.ConfigParser() user_config.read(config_file_path) for section in user_config.sections(): section_dict = configDict[section] = {} for option in user_config.options(section): section_dict[option] = user_config.get(section,option) if section_dict[option] == 'True' or section_dict[option] == 'true': section_dict[option] = True elif section_dict[option] == 'False' or section_dict[option] == 'false': section_dict[option] = False return configDict def get_values(self, args, general_config_dict = {} ): collector={} for item in self.pipeline_run_items[args.platform]: # set collector[item] to the default first collector[item] = self.pipeline_run_items[args.platform][item] # now look for args (then ini) values to replace if item in args and getattr( args, item ) != None: collector[item] = getattr( args, item ) elif general_config_dict and item in general_config_dict[args.platform] and general_config_dict[args.platform][item] != '': collector[item] = general_config_dict[args.platform][item] # get all the items from general_config_dict['general'] if 'general' in general_config_dict: for item in general_config_dict['general']: collector[item] = general_config_dict['general'][item] return collector def validate_args(self): """ # THOUGHTS # vamps users # single project and dataset # Supply an ini file OR commandline (for web interface), but no csv file # # MBL pipeline # REQUIRE a csv file and a ini file """ collector={} if self.args.configPath: general_config_dict = self.configDictionaryFromFile_ini(self.args.configPath) if self.args.platform in general_config_dict and 'general' in general_config_dict: collector= self.get_values( self.args, general_config_dict) else: sys.exit("The ini file needs both a [general] and ["+ self.args.platform +"] section - Exiting.") else: # no configPath collector= self.get_values( self.args ) collector['current_db_host_name'] = self.utils.find_in_nested_dict(C.db_cnf, {'host': collector['database_host'], 'db': collector['database_name']}) if not collector['current_db_host_name']: sys.exit("""Please check -db_host and -db_name parameters, the current combination does not exist: 'db_host' = %s, 'db_name' = %s """ % (collector['database_host'], collector['database_name'])) if self.args.platform in C.illumina_list: print("Starting Illumina Pipeline") if not self.args.csvPath: sys.exit("illumina requires a csv file - Exiting") elif self.args.platform == 'vamps': print("Starting VAMPS Pipeline:") if 'project' not in collector or collector['project'] == '': collector['project'] = collector['project'][:1].capitalize() + collector['project'][1:] else: logger.debug("No project found in vamps pipeline") if self.args.fasta_file: collector['project'] = self.args.fasta_file collector['from_fasta'] = True elif self.args.platform == '454': print("Starting 454 Pipeline") elif self.args.platform == 'ion_torrent': print("Starting Ion Torrent Pipeline") else: sys.exit("Validate args: Unknown Platform") if self.args.configPath: collector['configPath'] = self.args.configPath else: collector['configPath'] = "" # these are all the bool items in the collector # they need to be converted from str to bool here for i in collector: if collector[i] == 'True' or collector[i] == 'true': collector[i] = True elif collector[i] == 'False' or collector[i] == 'false': collector[i] = False #collector['runcode'] = self.args.run collector['run'] = self.args.run 
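        # run, platform, input_dir (when given) and today's date come straight from the
        # command line and override anything collected from the ini or platform defaults above.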
        #collector['run_date'] = self.args.run
        #collector['steps'] = self.args.steps
        collector['platform'] = self.args.platform
        if self.args.input_dir:
            collector['input_dir'] = self.args.input_dir
        collector['date'] = str(datetime.date.today())
        #print(collector)
        return collector
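
# --- Hedged usage sketch (illustrative only) -----------------------------------
# The real entry point is pipeline-ui.py (see the MetadataUtils docstring); the
# block below merely smoke-tests the suite/region normalisation that
# convert_primer_suites() and check_domain_suite_region() rely on, using the
# same regex, so it runs without the constants module or a database. The
# example strings come from the commented-out CSV row earlier in this file.
if __name__ == '__main__':
    import re

    def _normalise(name):
        # same normalisation as convert_primer_suites(): lowercase, drop '_', '-' and spaces
        return re.sub(r'[_ -]', '', name.lower())

    primer_suite = _normalise("Bacterial v6 Suite")   # -> "bacterialv6suite"
    dna_region = _normalise("v6")                      # -> "v6"
    # check_domain_suite_region() accepts a region when it is a substring of
    # its (normalised) primer suite:
    assert dna_region in primer_suite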