 def setUpReverse(self):
     self.configPath = "test/data/trim_test_reverse.ini"
     m = MetadataUtils(self)
     config_dict = m.create_dictionary_from_ini()
     self.run = Run(config_dict, self, self.baseoutputdir)
     process(self.run, "trim")
     self.expected = self.get_expected_results(
         'test/data/test_trim_reverse.results')
 def setUpForward(self):
     self.configPath = "test/data/trim_test_forward.ini"
     m = MetadataUtils(self)
     config_dict = m.create_dictionary_from_ini()
     print 'configDict', config_dict
     self.run = Run(config_dict, self, self.baseoutputdir)
     process(self.run, "trim")
     self.expected = self.get_expected_results(
         'test/data/test_trim_forward.results')
        # view CONFIG file contents
        fh = open(os.path.join(dirs.analysis_dir,  data_object['general']['run']+'.ini'))
        lines = fh.readlines()
        logger.debug("\n=== START ===\n")
        for line in lines:
            line = line.strip()
            logger.debug("line in INI: ")
            logger.debug(line)
        logger.debug("==== END ====\n")
        sys.exit()
    elif answer != 'c':
        sys.exit()
    ##############
    #
    # CREATE THE RUN OBJECT (see runconfig.py for details)
    #
    ##############
    runobj = Run(data_object, os.path.dirname(os.path.realpath(__file__)))


#    for key in run.samples:
#        print(key,run.samples[key].dataset)
#    sys.exit()
    ##############
    #
    # now do all the work
    #
    ##############
    process(runobj, args.steps)

def trim_file(myobject):
    """
      Doc string
    """
    require_distal  = myobject['require_distal']
    minlength       = myobject['minlength']
    maxlength       = myobject['maxlength']
    user            = myobject['user']
    runcode         = myobject['runcode']
    site            = myobject['site']
    file_type       = myobject['file_type']
    file_base       = myobject['file_base']
    datetime        = myobject['datetime']
    use_cluster         = myobject['use_cluster']
    
    primers_obj = get_primers(file_base)
    metadata_obj = get_metadata(file_base)
    # use the files from file_base directory
    # but we get the primers and keys from the database
    # which were stored there during the loading phase
    
    
    if file_type == 'fasta' or file_type == 'fasta_clean':
        # if the upload file was fasta then the script upload_file.php->file_checker
        # created a '_clean' file that is converted back into a regular fasta file here
        # for mothur to unique
        file_to_trim = file_base+'/fasta_file.fa'
        fh = open(file_to_trim, 'w')
        try:
            infile = file_base+'/seqfile.fa_clean'
            f = FastaCleanReader(infile)
        except Exception:
            infile = file_base+'/seqfile_seq_clean'
            f = FastaCleanReader(infile)
        # create new fasta here for mothur to unique,names
        while f.next():
            read_id = f.id
            seq = f.seq
            #print read_id,seq
            fh.write('>'+read_id+"\n"+seq+"\n")
        
        fh.close()
        
        # get qual file if present
        # USES: clean qual file in trim_run.py
#         if os.path.exists( file_base + "/qualfile_qual_clean"):
#             qualfile_to_trim = file_base+'/fasta_file.qual'
#             fh = open(qualfile_to_trim,'w')
#             infile = file_base + "/qualfile_qual_clean"
#             f = FastaCleanReader(infile)
#             # create new fasta quality file here for trimming
#             while f.next():
#                 read_id = f.id
#                 seq = f.seq
#                 #print read_id,seq
#                 fh.write('>'+read_id+"\n"+seq+"\n")
#             
#             fh.close()
        
        # create unique and names file (for fasta file only)
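        # mothur's unique.seqs writes fasta_file.unique.fa and fasta_file.names
        # next to the input file; the existence checks below rely on those names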
        mothur_cmd = "/bioware/mothur/mothur \"#unique.seqs(fasta="+file_to_trim+");\""

        subprocess.call(mothur_cmd, shell=True)
        if not os.path.exists( file_base+"/fasta_file.unique.fa" ):
            print "Uniques fasta file: fasta_file.unique.fa, was not created. Exiting\n"
            sys.exit()

        if not os.path.exists( file_base+"/fasta_file.names" ):
            print "Names file: fasta_file.names, was not created. Exiting\n"
            sys.exit()
    
        
    elif file_type[:5] == 'fastq':     
        infile = file_base+'/seqfile.fq' 
        file_to_trim = file_base+'/seqfile.fq'
    elif file_type == 'sff':
        infile = file_base+'/seqfile.sff'
        file_to_trim = file_base+'/seqfile.sff'
    else:
        logger.debug("vamps_trim.py : Input filetype ERROR "+file_type)
                
    

    ######### Create a Run here for the uploaded data ###############
    #
    # need to create a 'Run' here and then feed it to trim_run in the py pipeline
    # A run object emulates the ini file
    # and has an output directory, general section, rundate, and a list of samples.
    # A sample has direction,dna_region,taxonomic_domain,anchor,stop_sequences
    #
    ########################
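    # Illustrative sketch only (values hypothetical): the dictionary assembled
    # below stands in for an ini file shaped roughly like this, where '1_AGTC'
    # is one lane_key sample section:
    #
    #   [general]
    #   run_date = <datetime>
    #   platform = vamps
    #   run = <runcode>
    #   input_dir = <file_base>
    #
    #   [1_AGTC]
    #   forward_primers = <comma-separated forward primer sequences>
    #   reverse_primers = <comma-separated reverse primer sequences>
    #   key = <barcode key>
    #   direction = <F or R>
    #   dna_region = <e.g. v6>
    #   taxonomic_domain = <e.g. bacteria>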
    myRunDict = {}
    for r in metadata_obj:
        myRunDict[r] = {}    
    
    #lanekeys = [metadata_obj[key]['lanekey'] for key in metadata_obj]
    
    
    myRunDict['general'] = {'run_date':datetime,            'input_dir':file_base,
                            'platform':'vamps',             'require_distal':require_distal,
                            'input_file_formats':file_type, 'user':user,
                            'input_file_lane':'1',          'vamps_user_upload':True,
                            'gast_data_source':'database',  'minimumLength':minlength,
                            'maximumLength':maxlength,      'run':runcode,
                            'use_cluster':use_cluster,      'site':site,
                            'load_vamps_database':True,     'output_dir':file_base,
                            'input_files':file_to_trim,     'files_list':[file_to_trim]
                            }
    #'input_file_names':file_to_trim,
    f_primers=''
    r_primers=''
    
    #print myRunDict
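    # collect the uploaded primer sequences into comma-separated strings by direction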
    for p in primers_obj:        
        if primers_obj[p]['direction'] == 'F':
            f_primers += primers_obj[p]['sequence']+','
        elif primers_obj[p]['direction'] == 'R':
            r_primers += primers_obj[p]['sequence']+','
    f_primers = f_primers[:-1]  
    r_primers = r_primers[:-1] 
    for r in metadata_obj:
        
        #myRunDict[metadata_obj[r]['lanekey']]['data_owner'] = user
        # r       = 1_AGTC
        
        myRunDict[r]['forward_primers'] = f_primers
        myRunDict[r]['reverse_primers'] = r_primers
        myRunDict[r]['key'] = metadata_obj[r]['key']
        myRunDict[r]['direction'] = metadata_obj[r]['direction']
        myRunDict[r]['project'] = metadata_obj[r]['project']
        myRunDict[r]['dataset'] = metadata_obj[r]['dataset']
        myRunDict[r]['dna_region'] = metadata_obj[r]['dna_region']
        myRunDict[r]['taxonomic_domain'] = metadata_obj[r]['domain']
        myRunDict[r]['project_description'] = metadata_obj[r]['project_description']
        myRunDict[r]['project_title'] = metadata_obj[r]['project_title']
        myRunDict[r]['dataset_description'] = metadata_obj[r]['dataset_description']
        myRunDict[r]['env_sample_source_id'] = metadata_obj[r]['env_sample_source_id']
        
        # turn off looking for mbl primers as well as the uploaded ones
        myRunDict[r]['use_mbl_primers'] = '0'
    
    #run = Run(myRunDict, file_base, "/xraid2-2/vampsweb/"+site)
    #for i in myRunDict:
    #    print i,myRunDict[i]
    run = Run(myRunDict,  "/xraid2-2/vampsweb/"+site)
    # output_dir is created in run so add it to dict here
    #print 'samples',run.samples
    myRunDict['output_dir'] = run.output_dir
    #print myRunDict 
    #run = Run(args.configPath, args.baseoutputdirarg, os.path.dirname(os.path.realpath(__file__)))  
    #print 'OUT dir ',run.output_dir
    # now do all the work
    # steps: trim,chimera,gast,vampsupload
    steps = 'trim'
    process(run, steps)
    
    return myRunDict
def start_gast(myobject):
    """
      Doc string
    """
    project     = myobject['project']
    dataset     = myobject['dataset']
    dna_region  = myobject['dna_region']
    domain      = myobject['domain']
    runcode     = myobject['runcode']
    site        = myobject['site']
    #user_cursor = myobject['user_cursor']
    datetime    = myobject['datetime']
    user        = myobject['user']
    from_fasta  = myobject['from_fasta']
    load_db     = myobject['load_db']
    env_source_id   = myobject['env_source_id'] 
    steps   = myobject['steps'] 
    fasta_file_from_cl  = myobject['fasta_file']
    use_cluster         = myobject['use_cluster']
    #myobject['baseoutputdir']
    seq_count   = 0
    site_base   = '/xraid2-2/vampsweb/'+site
    file_prefix = user+runcode
    
    output_dir = myobject['output_dir']
    #output_dir  = os.path.join(site_base, 'tmp',user+"_"+runcode+'_gast')
    
    # use the files from file_base directory
    # but we get the primers and keys from the database
    # which were stored there during the loading phase      

    # check for directory:  user_runcode
    #    if present use the data from there
    #    if not: go to the database
    if os.path.exists(output_dir):
        print "files path exists:",output_dir
        #gast_input_source = 'files'
        #file_base = output_dir
        # This may be a mobedac upload and we should try to use the files here
        # rather than look to the database for data

    else:
        output_dir  = os.path.join(site_base, 'tmp',user+"_"+runcode+'_gast')
        print "Files path doesn't exist: attempting to get data from database"
        print "Creating directory",output_dir
        os.mkdir(output_dir)
        
    from pipeline.run import Run
    from pipelineprocessor import process
    myRunDict = {}
    # this is a minimal run dictionary for the general stanza
    myRunDict['general'] = {'run_date':datetime,                'vamps_user_upload':True, 
                            'gast_input_source':'database',     'input_file_names':'vamps_upload', 
                            'input_file_lanes':'1',             'input_file_formats':'fasta',
                            'run':runcode,                      'use_cluster':use_cluster, 
                            'platform':'vamps', 
                            'user':user,                        'site':site,
                            'load_vamps_database':True,
                            'input_files':None,                 'files_list':[],
                            'output_dir':output_dir,            'file_prefix':file_prefix}
    #print myRunDict
    #
    #
    #
    run = Run(myRunDict, "/xraid2-2/vampsweb/"+site)
    #
    #
    #
    # pack the things we'll need for GAST
    run.project = project
    run.dataset = dataset
    run.load_db = load_db
    run.env_source_id=env_source_id
    run.site = site
    run.from_fasta = from_fasta
    run.fasta_file_from_cl=fasta_file_from_cl
    run.runcode = runcode
    run.user = user
    run.samples = {}
    run.dna_region = dna_region
    
    #run.basedir = file_base
#    fastaunique_cmd = '/bioware/bin/fastaunique'
    fastaunique_cmd = 'fastaunique'
    if run.from_fasta:
        print run.from_fasta
        # copy file to
        fasta_file = os.path.join(output_dir,run.user+run.runcode+'.fa')
        shutil.copyfile(run.fasta_file_from_cl, fasta_file)
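        # count reads in the copied fasta by counting '>' header lines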
        grep_cmd = ['grep','-c','>',fasta_file]
        run.dataset_count = subprocess.check_output(grep_cmd).strip()
        
    else:
        # from database
        from pipeline.db_upload import MyConnection
        if site == 'vamps':
            db_host_user    = '******'
            db_name_user    = '******'
        else:
            db_host_user    = '******'
            db_name_user    = '******'
        myconn = MyConnection(host=db_host_user,db=db_name_user)
        
        
        # should create the fasta file and names file here and not in gast.py 
        ds_list = []
        if dataset:
            ds_list = [dataset]
            query ="select read_id,sequence,dataset from vamps_upload_trimseq where project='"+project+"' and dataset='"+dataset+"' and user='******' "    
            print query
            rows = myconn.execute_fetch_select(query)
            fasta_file = os.path.join(output_dir, 'fasta.fa')
            unique_file = os.path.join(output_dir, 'unique.fa')
            names_file = os.path.join(output_dir, 'names')
    
            fh = open(fasta_file, 'w')
            
            if not rows:
                print "No data found using query:", query
                
            for r in rows:
                id  = r[0]
                seq = r[1]                   
                fh.write(">"+id+"\n"+seq+"\n")
                
            fh.close()
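            # fastaunique (MBL tool) is expected to collapse duplicate reads into
            # unique_file and write a names map (unique id -> member read ids) to names_file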
            fastaunique_cmd = fastaunique_cmd +" -x -i "+fasta_file+" -o "+unique_file+" -n "+names_file 
            subprocess.call(fastaunique_cmd, shell=True)
        
        else:
            # looks for vamps_projects_datasets_pipe in vamps_user_uploads
           
            q0 = "select distinct dataset from vamps_projects_datasets_pipe where project='"+project+"' and dataset != '' and dataset != 'NoKey'"
            print q0
            dsrows = myconn.execute_fetch_select(q0)
            if not dsrows:
                print "No datasets found using query:", q0
                sys.exit()
            for ds in dsrows:                
                ds  = ds[0]
                ds_list.append(ds)
                
                
                query ="select read_id, sequence, dataset from vamps_upload_trimseq where project='"+project+"' and dataset='"+ds+"' and user='******' "    
                print query
                rows = myconn.execute_fetch_select(query)
                
                
                ds_dir = os.path.join(output_dir, ds)
                if os.path.exists(ds_dir):
                    # start with an empty directory
                    shutil.rmtree(ds_dir, True)
                os.mkdir(ds_dir)
                fasta_file = os.path.join(output_dir, ds,  'fasta.fa')
                unique_file = os.path.join(output_dir, ds, 'unique.fa')
                names_file = os.path.join(output_dir,  ds, 'names')
                #dataset_file=os.path.join(output_dir,   'datasets')
                fh = open(fasta_file, 'w')
                
                if not rows:
                    print "No data found using query:", query
                    
                for r in rows:
                    id  = r[0]
                    seq = r[1]  
                    ds  = r[2]
                    fh.write(">"+id+"\n"+seq+"\n")
                    
                fh.close()
                
            
                fastaunique_call = fastaunique_cmd +" "+fasta_file+" -o "+unique_file+" -n "+names_file + " -f"
            
                subprocess.call(fastaunique_call, shell=True)
    run.datasets = ds_list
    
    
    ###############################################################
    # This starts the MBL GAST python pipeline at the GAST STEP
    #
    # now do all the work
    # possible steps: trim,chimera,gast,vampsupload
    
    process(run, steps)
    print "done with gast"
 def setUpReverse(self):
     config_dict = configDictionaryFromFile("test/data/trim_test_reverse.ini")
     self.run = Run(config_dict, self.BASE_OUTPUT)
     process(self.run,"trim")
     self.expected = self.get_expected_results('test/data/test_trim_reverse.results')
def start_gast(args):
    """
      Doc string
    """
    logging.info('CMD> ' + ' '.join(sys.argv))
    print 'CMD> ', sys.argv
    use_local_pipeline = False
    if args.site == 'vamps' or args.site == 'vampsdev':
        sys.path.append(
            os.path.join('/', 'groups', 'vampsweb',
                         'py_mbl_sequencing_pipeline'))
        from pipeline.run import Run
        from pipelineprocessor import process
        from pipeline.db_upload import MyConnection
        from pipeline.utils import Dirs, PipelneUtils
        use_cluster = True
    else:
        sys.path.append(os.path.join(args.process_dir, 'public', 'scripts'))
        from gast.run import Run
        from gast.pipelineprocessor import process
        use_cluster = False

    platform = 'new_vamps'
    runcode = 'NONE'
    site = 'new_vamps'
    load_db = True
    steps = 'gast,new_vamps'
    fasta_file_from_cl = ''  #args.fasta_file

    mobedac = False  # True or False
    gast_input_source = 'file'
    seq_count = 0

    os.chdir(args.project_dir)
    info_load_infile = args.config
    if not os.path.isfile(info_load_infile):
        logging.info("Could not find config file (" + info_load_infile +
                     ") **Exiting**")
        sys.exit()

    config = ConfigParser.ConfigParser()
    config.optionxform = str
    config.read(info_load_infile)
    general_config_items = {}
    # CL take precedence for domain and dna_region

    for name, value in config.items('GENERAL'):
        #print '  %s = %s' % (name, value)
        general_config_items[name] = value

    file_prefix = 'testing-fp'
    dir_prefix = general_config_items['baseoutputdir']

    logging.info('FROM INI-->')
    logging.info(general_config_items)
    logging.info('<<--FROM INI')
    #in utils.py: def __init__(self, is_user_upload, dir_prefix, platform, lane_name = '', site = ''):
    #dirs = Dirs(True, dir_prefix, platform, site = site)
    if not os.path.exists(args.project_dir):
        sys.exit(args.project_dir + ' not found')

    analysis_dir = os.path.join(args.project_dir, 'analysis')
    gast_dir = os.path.join(analysis_dir, 'gast')
    if not os.path.exists(analysis_dir) or not os.path.exists(gast_dir):
        print 'Could not find analysis or gast directory'
        sys.exit(1)
    #global_gast_dir = dirs.check_dir(dirs.gast_dir)

    logging.debug(analysis_dir)

    myRunDict = {}
    # this is a minimal run dictionary for the general stanza
    myRunDict['general'] = {
        'run_date': datetime,
        'new_vamps_upload': True,
        'vamps_user_upload': True,
        'use64bit': False,
        'mobedac': mobedac,
        'gast_input_source': gast_input_source,
        'input_file_names': 'vamps_upload',
        'input_file_lanes': '1',
        'input_file_formats': 'fasta',
        'run': runcode,
        'use_cluster': use_cluster,
        'platform': 'new_vamps',
        'dna_region': general_config_items['dna_region'],
        'domain': general_config_items['domain'],
        'env_source_id': general_config_items['env_source_id'],
        'classifier': args.classifier,
        'user': general_config_items['owner'],
        'site': args.site,
        'load_vamps_database': load_db,
        'use_full_length': True,
        'input_files': None,
        'files_list': [],
        'output_dir': general_config_items['baseoutputdir'],
        'file_prefix': file_prefix,
        'project': general_config_items['project'],
        #new_vamps::
        'project_dir': args.project_dir,
        'node_db': args.NODE_DATABASE,
        'process_dir': args.process_dir,
        'ref_db_dir': args.ref_db_dir,
        'config_file': args.config
    }

    print myRunDict

    #
    #
    #
    run = Run(myRunDict, general_config_items['baseoutputdir'])
    #sys.exit()
    #
    #
    #
    # pack the things we'll need for GAST
    #run.project = project
    #run.dataset = dataset
    run.load_db = load_db
    #run.env_source_id=env_source_id
    run.site = site
    run.fasta_file_from_cl = fasta_file_from_cl
    run.runcode = runcode

    run.samples = {}

    ds_list = []
    datasets_list = config.options('DATASETS')
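    # ConfigParser.options('DATASETS') returns the option names of the
    # [DATASETS] section, i.e. the dataset names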
    number_of_datasets = len(datasets_list)
    info_tax_file = os.path.join(general_config_items['baseoutputdir'],
                                 'INFO_CONFIG.ini')
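    # INFO_CONFIG.ini written below looks roughly like this (values illustrative):
    #   [GENERAL]
    #   project=...
    #   classifier=GAST
    #   status=gasting
    #   ...
    #   [DATASETS]
    #   <dataset name>=<unique sequence count>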
    info_fh = open(info_tax_file, 'w')
    logging.info('Writing to ' + info_tax_file)
    info_fh.write("[GENERAL]\n")
    info_fh.write('project=' + general_config_items['project'] + "\n")
    info_fh.write("classifier=GAST\n")
    info_fh.write("status=gasting\n")
    info_fh.write('date=' + datetime + "\n")
    info_fh.write('file_base=' + general_config_items['baseoutputdir'] + "\n")
    info_fh.write("has_tax=0\n")
    info_fh.write("sequence_counts=UNIQUE\n")

    info_fh.write("number_of_datasets=" + str(number_of_datasets) + "\n")
    info_fh.write("owner=" + general_config_items['owner'] + "\n")
    info_fh.write("dna_region=" + general_config_items['dna_region'] + "\n")
    info_fh.write("domain=" + general_config_items['domain'] + "\n")
    info_fh.write("env_source_id=" + general_config_items['env_source_id'] +
                  "\n")
    info_fh.write("public=" + general_config_items['public'] + "\n")
    info_fh.flush()
    total_uniques = 0
    datasets = {}
    for dataset in datasets_list:
        logging.info("\nlooking for unique file for " + dataset)
        ds_dir = os.path.join(gast_dir, dataset)
        fasta_file = os.path.join(ds_dir, 'seqfile.fa')
        unique_file = os.path.join(ds_dir, 'unique.fa')
        names_file = os.path.join(ds_dir, 'names')
        if not os.path.exists(unique_file):
            logging.debug('Could not find unique file ' + unique_file)
        #fastcount_call = "grep '>' "+unique_file+" | wc -l"
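        # unique-sequence count = number of '>' header lines in unique.fa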
        grep_cmd = ['grep', '-c', '>', unique_file]
        logging.debug(' '.join(grep_cmd))
        ds_unique_seq_count = subprocess.check_output(grep_cmd).strip()

        #ds_unique_seq_count = subprocess.check_output(fastcount_call, shell=True)
        total_uniques += int(ds_unique_seq_count)
        datasets[dataset] = ds_unique_seq_count
    info_fh.write("project_total_sequence_count=" +
                  general_config_items['project_sequence_count'] + "\n")
    info_fh.write("project_unique_sequence_count=" + str(total_uniques) + "\n")
    info_fh.write("\n[DATASETS]\n")
    for ds in datasets:
        info_fh.write(ds + "=" + str(datasets[ds]) + "\n")
    info_fh.flush()
    info_fh.close()

    # delete old config file:
    #os.remove(info_load_infile)
    #
    #logging.debug('DATASETS '+';'.join(datasets_list))
    run.datasets = datasets_list

    ###############################################################
    # This starts the MBL GAST python pipeline at the GAST STEP unless vampsupload only was passed as a step
    #
    # now do all the work
    # possible steps: trim,chimera,gast,vampsupload,new_vamps

    process(run, steps)
import pipeline.constants as C

if __name__ == '__main__':
    THE_DEFAULT_BASE_OUTPUT = '.'

    usage = "usage: %prog [options] arg1 arg2"
    parser = argparse.ArgumentParser(description='MBL Sequence Pipeline')
    parser.add_argument('-c', '--configuration', required=True, dest="configPath",
                        help='Configuration parameters of the run. See README file.')
    parser.add_argument('-b', '--baseoutputdir', required=False, action="store",
                        default=THE_DEFAULT_BASE_OUTPUT, dest="baseoutputdirarg",
                        help="Base output directory for the run (default: current directory).")
    parser.add_argument('-s', '--steps', required=True, action="store", dest="steps",
                        help="Comma separated list of steps. Choices are: trim,chimera,gast,vampsupload,all")
    parser.add_argument('-l', '--loglevel', required=False, action="store", default='ERROR',
                        dest="loglevel",
                        help='Sets logging level: INFO, DEBUG, [ERROR]')
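    # Example invocation (script name and paths are hypothetical):
    #   python pipeline_ui.py -c my_run.ini -b ./output -s trim,chimera,gast -l DEBUG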
    
    args = parser.parse_args() 
    # deal with logging level
    loggerlevel = logging.ERROR
    if args.loglevel.upper() == 'DEBUG':
        loggerlevel = logging.DEBUG
    elif  args.loglevel.upper() == 'INFO':     
        loggerlevel = logging.INFO
    logger.setLevel(loggerlevel)    
    # read the config file
    run = Run(args.configPath, args.baseoutputdirarg, os.path.dirname(os.path.realpath(__file__)))  

    # now do all the work
    process(run, args.steps)