def setUpReverse(self):
    self.configPath = "test/data/trim_test_reverse.ini"
    m = MetadataUtils(self)
    config_dict = m.create_dictionary_from_ini()
    self.run = Run(config_dict, self, self.baseoutputdir)
    process(self.run, "trim")
    self.expected = self.get_expected_results(
        'test/data/test_trim_reverse.results')
def setUpForward(self):
    self.configPath = "test/data/trim_test_forward.ini"
    m = MetadataUtils(self)
    config_dict = m.create_dictionary_from_ini()
    print 'configDict', config_dict
    self.run = Run(config_dict, self, self.baseoutputdir)
    process(self.run, "trim")
    self.expected = self.get_expected_results(
        'test/data/test_trim_forward.results')
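# Both setups above lean on self.get_expected_results(), which is not shown in
# this file. A minimal sketch of what it could look like, assuming each line of
# the .results fixture is a whitespace-delimited "read_id trimmed_seq" record
# (the real fixture format may differ):
def get_expected_results(self, path):
    expected = {}
    for line in open(path):
        parts = line.strip().split()
        if len(parts) >= 2:
            expected[parts[0]] = parts[1]
    return expected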
def trim_file(myobject):
    """ Doc string """
    require_distal = myobject['require_distal']
    minlength = myobject['minlength']
    maxlength = myobject['maxlength']
    user = myobject['user']
    runcode = myobject['runcode']
    site = myobject['site']
    file_type = myobject['file_type']
    file_base = myobject['file_base']
    datetime = myobject['datetime']
    use_cluster = myobject['use_cluster']

    primers_obj = get_primers(file_base)
    metadata_obj = get_metadata(file_base)
    # use the files from the file_base directory,
    # but get the primers and keys from the database,
    # where they were stored during the loading phase
    if file_type == 'fasta' or file_type == 'fasta_clean':
        # if the upload file was fasta then the script upload_file.php->file_checker
        # created a '_clean' file that is converted back into a regular fasta
        # file here for mothur to unique
        file_to_trim = file_base + '/fasta_file.fa'
        fh = open(file_to_trim, 'w')
        try:
            infile = file_base + '/seqfile.fa_clean'
            f = FastaCleanReader(infile)
        except:
            infile = file_base + '/seqfile_seq_clean'
            f = FastaCleanReader(infile)
        # create new fasta here for mothur to unique,names
        while f.next():
            read_id = f.id
            seq = f.seq
            #print read_id, seq
            fh.write('>' + read_id + "\n" + seq + "\n")
        fh.close()

        # get qual file if present
        # USES: clean qual file in trim_run.py
        # if os.path.exists(file_base + "/qualfile_qual_clean"):
        #     qualfile_to_trim = file_base + '/fasta_file.qual'
        #     fh = open(qualfile_to_trim, 'w')
        #     infile = file_base + "/qualfile_qual_clean"
        #     f = FastaCleanReader(infile)
        #     # create new fasta quality file here for trimming
        #     while f.next():
        #         read_id = f.id
        #         seq = f.seq
        #         fh.write('>' + read_id + "\n" + seq + "\n")
        #     fh.close()

        # create unique and names file (for fasta file only)
        mothur_cmd = "/bioware/mothur/mothur \"#unique.seqs(fasta=" + file_to_trim + ");\""
        subprocess.call(mothur_cmd, shell=True)
        if not os.path.exists(file_base + "/fasta_file.unique.fa"):
            print "Uniques fasta file: fasta_file.unique.fa, was not created. Exiting\n"
            sys.exit()  # was os.exit(); the os module has no exit()
        if not os.path.exists(file_base + "/fasta_file.names"):
            print "Names file: fasta_file.names, was not created. Exiting\n"
            sys.exit()  # was os.exit()
    elif file_type[:5] == 'fastq':
        infile = file_base + '/seqfile.fq'
        file_to_trim = file_base + '/seqfile.fq'
    elif file_type == 'sff':
        infile = file_base + '/seqfile.sff'
        file_to_trim = file_base + '/seqfile.sff'
    else:
        logger.debug("vamps_trim.py : Input filetype ERROR " + file_type)

    ######### Create a Run here for the uploaded data ###############
    #
    # need to create a 'Run' here and then feed it to trim_run in the py pipeline
    # A run object emulates the ini file
    # and has an output directory, general section, rundate, and a list of samples.
    # A sample has direction, dna_region, taxonomic_domain, anchor, stop_sequences
    #
    ########################
    myRunDict = {}
    for r in metadata_obj:
        myRunDict[r] = {}
    #lanekeys = [metadata_obj[key]['lanekey'] for key in metadata_obj]
    myRunDict['general'] = {'run_date': datetime,
                            'input_dir': file_base,
                            'platform': 'vamps',
                            'require_distal': require_distal,
                            'input_file_formats': file_type,
                            'user': user,
                            'input_file_lane': '1',
                            'vamps_user_upload': True,
                            'gast_data_source': 'database',
                            'minimumLength': minlength,
                            'maximumLength': maxlength,
                            'run': runcode,
                            'use_cluster': use_cluster,
                            'site': site,
                            'load_vamps_database': True,
                            'output_dir': file_base,
                            'input_files': file_to_trim,
                            'files_list': [file_to_trim]}
    #'input_file_names': file_to_trim,
    f_primers = ''
    r_primers = ''
    #print myRunDict
    for p in primers_obj:
        if primers_obj[p]['direction'] == 'F':
            f_primers += primers_obj[p]['sequence'] + ','
        elif primers_obj[p]['direction'] == 'R':
            r_primers += primers_obj[p]['sequence'] + ','
    f_primers = f_primers[:-1]
    r_primers = r_primers[:-1]

    for r in metadata_obj:
        #myRunDict[metadata_obj[r]['lanekey']]['data_owner'] = user
        # r = 1_AGTC
        myRunDict[r]['forward_primers'] = f_primers
        myRunDict[r]['reverse_primers'] = r_primers
        myRunDict[r]['key'] = metadata_obj[r]['key']
        myRunDict[r]['direction'] = metadata_obj[r]['direction']
        myRunDict[r]['project'] = metadata_obj[r]['project']
        myRunDict[r]['dataset'] = metadata_obj[r]['dataset']
        myRunDict[r]['dna_region'] = metadata_obj[r]['dna_region']
        myRunDict[r]['taxonomic_domain'] = metadata_obj[r]['domain']
        myRunDict[r]['project_description'] = metadata_obj[r]['project_description']
        myRunDict[r]['project_title'] = metadata_obj[r]['project_title']
        myRunDict[r]['dataset_description'] = metadata_obj[r]['dataset_description']
        myRunDict[r]['env_sample_source_id'] = metadata_obj[r]['env_sample_source_id']
        # turn off looking for mbl primers as well as the uploaded ones
        myRunDict[r]['use_mbl_primers'] = '0'

    #run = Run(myRunDict, file_base, "/xraid2-2/vampsweb/"+site)
    #for i in myRunDict:
    #    print i, myRunDict[i]
    run = Run(myRunDict, "/xraid2-2/vampsweb/" + site)
    # output_dir is created in run so add it to dict here
    #print 'samples', run.samples
    myRunDict['output_dir'] = run.output_dir
    #print myRunDict
    #run = Run(args.configPath, args.baseoutputdirarg, os.path.dirname(os.path.realpath(__file__)))
    #print 'OUT dir ', run.output_dir

    # now do all the work
    # steps: trim,chimera,gast,vampsupload
    steps = 'trim'
    process(run, steps)

    return myRunDict
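# Example call (a sketch; every value below is a placeholder, not from a real
# run). trim_file() expects a flat dict carrying exactly the keys unpacked at
# the top of the function, and returns the run dictionary it built:
run_dict = trim_file({'require_distal': True,
                      'minlength': 50,
                      'maxlength': 500,
                      'user': 'jsmith',
                      'runcode': 'abc123',
                      'site': 'vamps',
                      'file_type': 'fasta',
                      'file_base': '/path/to/upload_dir',
                      'datetime': '20120101',
                      'use_cluster': False})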
    # view CONFIG file contents
    # (fragment of an interactive confirm prompt: this branch prints the
    # config file and exits; any answer other than 'c' also exits)
    fh = open(
        os.path.join(dirs.analysis_dir,
                     data_object['general']['run'] + '.ini'))
    lines = fh.readlines()
    logger.debug("\n=== START ===\n")
    for line in lines:
        line = line.strip()
        logger.debug("line in INI: ")
        logger.debug(line)
    logger.debug("==== END ====\n")
    sys.exit()
elif answer != 'c':
    sys.exit()

##############
#
# CREATE THE RUN OBJECT (see runconfig.py for details)
#
##############
runobj = Run(data_object, os.path.dirname(os.path.realpath(__file__)))
# for key in run.samples:
#     print(key, run.samples[key].dataset)
# sys.exit()

##############
#
# now do all the work
#
##############
process(runobj, args.steps)
def start_gast(myobject):
    """ Doc string """
    project = myobject['project']
    dataset = myobject['dataset']
    dna_region = myobject['dna_region']
    domain = myobject['domain']
    runcode = myobject['runcode']
    site = myobject['site']
    #user_cursor = myobject['user_cursor']
    datetime = myobject['datetime']
    user = myobject['user']
    from_fasta = myobject['from_fasta']
    load_db = myobject['load_db']
    env_source_id = myobject['env_source_id']
    steps = myobject['steps']
    fasta_file_from_cl = myobject['fasta_file']
    use_cluster = myobject['use_cluster']
    #myobject['baseoutputdir']
    seq_count = 0
    site_base = '/xraid2-2/vampsweb/' + site
    file_prefix = user + runcode
    output_dir = myobject['output_dir']
    #output_dir = os.path.join(site_base, 'tmp', user+"_"+runcode+'_gast')

    # use the files from the file_base directory,
    # but get the primers and keys from the database,
    # where they were stored during the loading phase

    # check for directory: user_runcode
    # if present use the data from there
    # if not: go to the database
    if os.path.exists(output_dir):
        print "files path exists:", output_dir
        #gast_input_source = 'files'
        #file_base = output_dir
        # This may be a mobedac upload and we should try to use the files here
        # rather than look to the database for data
    else:
        output_dir = os.path.join(site_base, 'tmp', user + "_" + runcode + '_gast')
        print "Files path doesn't exist: attempting to get data from database"
        print "Creating directory", output_dir
        os.mkdir(output_dir)

    from pipeline.run import Run
    from pipelineprocessor import process

    myRunDict = {}
    # this is a minimal run dictionary for the general stanza
    myRunDict['general'] = {'run_date': datetime,
                            'vamps_user_upload': True,
                            'gast_input_source': 'database',
                            'input_file_names': 'vamps_upload',
                            'input_file_lanes': '1',
                            'input_file_formats': 'fasta',
                            'run': runcode,
                            'use_cluster': use_cluster,
                            'platform': 'vamps',
                            'user': user,
                            'site': site,
                            'load_vamps_database': True,
                            'input_files': None,
                            'files_list': [],
                            'output_dir': output_dir,
                            'file_prefix': file_prefix}
    #print myRunDict
    run = Run(myRunDict, "/xraid2-2/vampsweb/" + site)

    # pack the things we'll need for GAST
    run.project = project
    run.dataset = dataset
    run.load_db = load_db
    run.env_source_id = env_source_id
    run.site = site
    run.from_fasta = from_fasta
    run.fasta_file_from_cl = fasta_file_from_cl
    run.runcode = runcode
    run.user = user
    run.samples = {}
    run.dna_region = dna_region
    #run.basedir = file_base

    #fastaunique_cmd = '/bioware/bin/fastaunique'
    fastaunique_cmd = 'fastaunique'
    # initialized here so run.datasets is defined on the from_fasta path too
    # (it was previously only set in the database branch)
    ds_list = []
    if run.from_fasta:
        print run.from_fasta
        # copy file to output_dir
        fasta_file = os.path.join(output_dir, run.user + run.runcode + '.fa')
        shutil.copyfile(run.fasta_file_from_cl, fasta_file)
        grep_cmd = ['grep', '-c', '>', fasta_file]
        run.dataset_count = subprocess.check_output(grep_cmd).strip()
    else:
        # from database
        from pipeline.db_upload import MyConnection
        if site == 'vamps':
            db_host_user = '******'
            db_name_user = '******'
        else:
            db_host_user = '******'
            db_name_user = '******'
        myconn = MyConnection(host=db_host_user, db=db_name_user)
        # should create the fasta file and names file here and not in gast.py
        if dataset:
            ds_list = [dataset]
            query = "select read_id, sequence, dataset from vamps_upload_trimseq where project='" + project + "' and dataset='" + dataset + "' and user='******'"
            print query
            rows = myconn.execute_fetch_select(query)
            fasta_file = os.path.join(output_dir, 'fasta.fa')
            unique_file = os.path.join(output_dir, 'unique.fa')
            names_file = os.path.join(output_dir, 'names')
            fh = open(fasta_file, 'w')
            if not rows:
                print "No data found using query:", query
            for r in rows:
                id = r[0]
                seq = r[1]
                fh.write(">" + id + "\n" + seq + "\n")
            fh.close()
            fastaunique_cmd = fastaunique_cmd + " -x -i " + fasta_file + \
                " -o " + unique_file + " -n " + names_file
            subprocess.call(fastaunique_cmd, shell=True)
        else:
            # looks for vamps_projects_datasets_pipe in vamps_user_uploads
            q0 = "select distinct dataset from vamps_projects_datasets_pipe where project='" + project + "' and dataset != '' and dataset != 'NoKey'"
            print q0
            dsrows = myconn.execute_fetch_select(q0)
            if not dsrows:
                print "No datasets found using query:", q0
                sys.exit()
            for ds in dsrows:
                ds = ds[0]
                ds_list.append(ds)
                query = "select read_id, sequence, dataset from vamps_upload_trimseq where project='" + project + "' and dataset='" + ds + "' and user='******'"
                print query
                rows = myconn.execute_fetch_select(query)
                ds_dir = os.path.join(output_dir, ds)
                if os.path.exists(ds_dir):
                    # start with an empty directory
                    shutil.rmtree(ds_dir, True)
                os.mkdir(ds_dir)
                fasta_file = os.path.join(output_dir, ds, 'fasta.fa')
                unique_file = os.path.join(output_dir, ds, 'unique.fa')
                names_file = os.path.join(output_dir, ds, 'names')
                #dataset_file = os.path.join(output_dir, 'datasets')
                fh = open(fasta_file, 'w')
                if not rows:
                    print "No data found using query:", query
                for r in rows:
                    id = r[0]
                    seq = r[1]
                    ds = r[2]
                    fh.write(">" + id + "\n" + seq + "\n")
                fh.close()
                fastaunique_call = fastaunique_cmd + " " + fasta_file + \
                    " -o " + unique_file + " -n " + names_file + " -f"
                subprocess.call(fastaunique_call, shell=True)
    run.datasets = ds_list

    ###############################################################
    # This starts the MBL GAST python pipeline at the GAST STEP
    #
    # now do all the work
    # possible steps: trim,chimera,gast,vampsupload
    process(run, steps)
    print "done with gast"
def setUpReverse(self):
    config_dict = configDictionaryFromFile("test/data/trim_test_reverse.ini")
    self.run = Run(config_dict, self.BASE_OUTPUT)
    process(self.run, "trim")
    self.expected = self.get_expected_results('test/data/test_trim_reverse.results')
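# configDictionaryFromFile() is assumed here rather than shown. A plausible
# implementation (a sketch, not necessarily the pipeline's own): parse the INI
# with ConfigParser and return the nested {section: {option: value}} dict that
# Run() consumes:
import ConfigParser

def configDictionaryFromFile(ini_path):
    config = ConfigParser.ConfigParser()
    config.optionxform = str  # keep option names case-sensitive
    config.read(ini_path)
    return dict((section, dict(config.items(section)))
                for section in config.sections())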
def start_gast(args):
    """ Doc string """
    logging.info('CMD> ' + ' '.join(sys.argv))
    print 'CMD> ', sys.argv
    use_local_pipeline = False
    if args.site == 'vamps' or args.site == 'vampsdev':
        sys.path.append(
            os.path.join('/', 'groups', 'vampsweb', 'py_mbl_sequencing_pipeline'))
        from pipeline.run import Run
        from pipelineprocessor import process
        from pipeline.db_upload import MyConnection
        from pipeline.utils import Dirs, PipelneUtils
        use_cluster = True
    else:
        sys.path.append(os.path.join(args.process_dir, 'public', 'scripts'))
        from gast.run import Run
        from gast.pipelineprocessor import process
        use_cluster = False
    platform = 'new_vamps'
    runcode = 'NONE'
    site = 'new_vamps'
    load_db = True
    steps = 'gast,new_vamps'
    fasta_file_from_cl = ''  #args.fasta_file
    mobedac = False  # True or False
    gast_input_source = 'file'
    seq_count = 0

    os.chdir(args.project_dir)
    info_load_infile = args.config
    if not os.path.isfile(info_load_infile):
        logging.info("Could not find config file (" + info_load_infile +
                     ") **Exiting**")
        sys.exit()
    config = ConfigParser.ConfigParser()
    config.optionxform = str
    config.read(info_load_infile)
    general_config_items = {}
    # the command line takes precedence for domain and dna_region
    for name, value in config.items('GENERAL'):
        #print '  %s = %s' % (name, value)
        general_config_items[name] = value
    file_prefix = 'testing-fp'
    dir_prefix = general_config_items['baseoutputdir']
    logging.info('FROM INI-->')
    logging.info(general_config_items)
    logging.info('<<--FROM INI')
    #in utils.py: def __init__(self, is_user_upload, dir_prefix, platform, lane_name='', site=''):
    #dirs = Dirs(True, dir_prefix, platform, site=site)
    if not os.path.exists(args.project_dir):
        sys.exit(args.project_dir + ' not found')
    analysis_dir = os.path.join(args.project_dir, 'analysis')
    gast_dir = os.path.join(analysis_dir, 'gast')
    if not os.path.exists(analysis_dir) or not os.path.exists(gast_dir):
        print 'Could not find analysis or gast directory'
        sys.exit(1)
    #global_gast_dir = dirs.check_dir(dirs.gast_dir)
    logging.debug(analysis_dir)

    myRunDict = {}
    # this is a minimal run dictionary for the general stanza
    myRunDict['general'] = {
        'run_date': datetime,
        'new_vamps_upload': True,
        'vamps_user_upload': True,
        'use64bit': False,
        'mobedac': mobedac,
        'gast_input_source': gast_input_source,
        'input_file_names': 'vamps_upload',
        'input_file_lanes': '1',
        'input_file_formats': 'fasta',
        'run': runcode,
        'use_cluster': use_cluster,
        'platform': 'new_vamps',
        'dna_region': general_config_items['dna_region'],
        'domain': general_config_items['domain'],
        'env_source_id': general_config_items['env_source_id'],
        'classifier': args.classifier,
        'user': general_config_items['owner'],
        'site': args.site,
        'load_vamps_database': load_db,
        'use_full_length': True,
        'input_files': None,
        'files_list': [],
        'output_dir': general_config_items['baseoutputdir'],
        'file_prefix': file_prefix,
        'project': general_config_items['project'],
        #new_vamps::
        'project_dir': args.project_dir,
        'node_db': args.NODE_DATABASE,
        'process_dir': args.process_dir,
        'ref_db_dir': args.ref_db_dir,
        'config_file': args.config
    }
    print myRunDict
    run = Run(myRunDict, general_config_items['baseoutputdir'])
    #sys.exit()

    # pack the things we'll need for GAST
    #run.project = project
    #run.dataset = dataset
    run.load_db = load_db
    #run.env_source_id = env_source_id
    run.site = site
    run.fasta_file_from_cl = fasta_file_from_cl
    run.runcode = runcode
    run.samples = {}
    ds_list = []
    datasets_list = config.options('DATASETS')
    number_of_datasets = len(datasets_list)

    info_tax_file = os.path.join(general_config_items['baseoutputdir'],
                                 'INFO_CONFIG.ini')
    info_fh = open(info_tax_file, 'w')
    logging.info('Writing to ' + info_tax_file)
    info_fh.write("[GENERAL]\n")
    info_fh.write('project=' + general_config_items['project'] + "\n")
    info_fh.write("classifier=GAST\n")
    info_fh.write("status=gasting\n")
    info_fh.write('date=' + datetime + "\n")
    info_fh.write('file_base=' + general_config_items['baseoutputdir'] + "\n")
    info_fh.write("has_tax=0\n")
    info_fh.write("sequence_counts=UNIQUE\n")
    info_fh.write("number_of_datasets=" + str(number_of_datasets) + "\n")
    info_fh.write("owner=" + general_config_items['owner'] + "\n")
    info_fh.write("dna_region=" + general_config_items['dna_region'] + "\n")
    info_fh.write("domain=" + general_config_items['domain'] + "\n")
    info_fh.write("env_source_id=" + general_config_items['env_source_id'] + "\n")
    info_fh.write("public=" + general_config_items['public'] + "\n")
    info_fh.flush()

    total_uniques = 0
    datasets = {}
    for dataset in datasets_list:
        logging.info("\nlooking for unique file for " + dataset)
        ds_dir = os.path.join(gast_dir, dataset)
        fasta_file = os.path.join(ds_dir, 'seqfile.fa')
        unique_file = os.path.join(ds_dir, 'unique.fa')
        names_file = os.path.join(ds_dir, 'names')
        if not os.path.exists(unique_file):
            logging.debug('Could not find unique file ' + unique_file)
        #fastcount_call = "grep '>' "+unique_file+" | wc -l"
        grep_cmd = ['grep', '-c', '>', unique_file]
        logging.debug(' '.join(grep_cmd))
        ds_unique_seq_count = subprocess.check_output(grep_cmd).strip()
        #ds_unique_seq_count = subprocess.check_output(fastcount_call, shell=True)
        total_uniques += int(ds_unique_seq_count)
        datasets[dataset] = ds_unique_seq_count
    info_fh.write("project_total_sequence_count=" +
                  general_config_items['project_sequence_count'] + "\n")
    info_fh.write("project_unique_sequence_count=" + str(total_uniques) + "\n")
    info_fh.write("\n[DATASETS]\n")
    for ds in datasets:
        # trailing newline added: without it every dataset option would land
        # on a single line and the INI would be unparseable
        info_fh.write(ds + "=" + str(datasets[ds]) + "\n")
    info_fh.flush()
    info_fh.close()
    # delete old config file:
    #os.remove(info_load_infile)
    #logging.debug('DATASETS ' + ';'.join(datasets_list))
    run.datasets = datasets_list

    ###############################################################
    # This starts the MBL GAST python pipeline at the GAST STEP,
    # unless vampsupload only was passed as a step
    #
    # now do all the work
    # possible steps: trim,chimera,gast,vampsupload,new_vamps
    process(run, steps)
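# For reference, the INFO_CONFIG.ini produced above comes out shaped like this
# (all values illustrative placeholders):
#
#   [GENERAL]
#   project=MY_PROJECT
#   classifier=GAST
#   status=gasting
#   date=20140101
#   file_base=/path/to/baseoutputdir
#   has_tax=0
#   sequence_counts=UNIQUE
#   number_of_datasets=2
#   owner=jsmith
#   dna_region=v6
#   domain=bacteria
#   env_source_id=100
#   public=1
#   project_total_sequence_count=120000
#   project_unique_sequence_count=45000
#
#   [DATASETS]
#   sample_A=25000
#   sample_B=20000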
import pipeline.constants as C

if __name__ == '__main__':
    THE_DEFAULT_BASE_OUTPUT = '.'

    usage = "usage: %prog [options] arg1 arg2"
    parser = argparse.ArgumentParser(description='MBL Sequence Pipeline')
    parser.add_argument('-c', '--configuration', required=True,
                        dest="configPath",
                        help='Configuration parameters of the run. See README file')
    parser.add_argument('-b', '--baseoutputdir', required=False,
                        action="store", default=THE_DEFAULT_BASE_OUTPUT,
                        dest="baseoutputdirarg",
                        help="Base output directory (default: current directory)")
    parser.add_argument('-s', '--steps', required=True, action="store",
                        dest="steps",
                        help="Comma-separated list of steps. Choices are: trim,chimera,gast,vampsupload,all")
    parser.add_argument('-l', '--loglevel', required=False, action="store",
                        default='ERROR', dest="loglevel",
                        help='Sets logging level... INFO, DEBUG, [ERROR]')
    args = parser.parse_args()

    # deal with logging level
    loggerlevel = logging.ERROR
    if args.loglevel.upper() == 'DEBUG':
        loggerlevel = logging.DEBUG
    elif args.loglevel.upper() == 'INFO':
        loggerlevel = logging.INFO
    logger.setLevel(loggerlevel)

    # read the config file
    run = Run(args.configPath, args.baseoutputdirarg,
              os.path.dirname(os.path.realpath(__file__)))
    # now do all the work
    process(run, args.steps)
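# Typical invocations (the script name is illustrative):
#
#   python pipeline_ui.py -c test/data/trim_test_forward.ini -s trim -l DEBUG
#   python pipeline_ui.py -c my_run.ini -b /data/out -s trim,chimera,gast
#
# -c points at the run INI, -b overrides the default base output directory
# ('.'), -s picks the comma-separated steps, and -l sets the log level.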