def projects(self, key, dataset_count, file_collector):
     """
     fill vamps_projects_datasets.txt file
     """
     logger.info("Starting vamps_upload: projects_datasets")
     if self.runobj.vamps_user_upload:
         project = self.runobj.project
         dataset = key
     else:
         if self.runobj.platform == 'illumina':
             project = self.runobj.samples[key].project
             dataset = self.runobj.samples[key].dataset
         elif self.runobj.platform == '454':
             pass
         else:
             pass
     project = project[0].capitalize() + project[1:]
     project_dataset = project+'--'+dataset
     date_trimmed = 'unknown'
     dataset_description = dataset
     dataset_count = str(dataset_count)
     has_tax = '1' # true
     fh = open(file_collector['projects_datasets_file'],'w')
     
     fh.write("\t".join(["HEADER","project","dataset","dataset_count","has_tax", "date_trimmed","dataset_info"] )+"\n")
     fh.write("\t"+"\t".join([project, dataset, dataset_count, has_tax, date_trimmed, dataset_description] )+"\n")
     
     fh.close()
     logger.info("Finishing VAMPS projects()")
    def chimera_checking(self, ref_or_novo):
        chimera_region_found = False
        output = {}
        
        for idx_key in self.input_file_names:
#             print "idx_key, self.input_file_names[idx_key] = %s, %s" % (idx_key, self.input_file_names)
            input_file_name  = os.path.join(self.indir,  self.input_file_names[idx_key] + self.chg_suffix)        
            output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])        
            dna_region       = self.runobj.samples[idx_key].dna_region
#             print "dna_region = %s" % dna_region
            if dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' +  dna_region)
                continue
            
#             print "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name)
            ref_db     = self.get_ref_db(dna_region)
#             print "dna_region = %s; ref_db = %s; ref_or_novo = %s" % (dna_region, ref_db, ref_or_novo)
            
            uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
            print "\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd)
            
            try:
                logger.info("chimera checking command: " + str(uchime_cmd))
                output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            except OSError, e:
                print "Problems with this command: %s" % (uchime_cmd)
                if self.utils.is_local():
                    print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                else:
                    print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                    raise                  
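        # Note (sketch, not part of the original snippet): the Popen handles
        # collected in `output` are not waited on above; one way to drain them
        # later would be:
        #   for idx_key, proc in output.items():
        #       stdout, stderr = proc.communicate()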
 def get_reference_databases(self,dna_region):
     
      # if dna_region is v6v4 (or v6v4a), change it to v4v6 (v4v6a)
      # other reversed regions?
     if dna_region == 'v6v4':
         dna_region = 'v4v6'
     if dna_region == 'v6v4a':
         dna_region = 'v4v6a'
     if C.use_full_length:
         if os.path.exists(os.path.join(self.refdb_dir, 'refssu.udb')):
             refdb = os.path.join(self.refdb_dir, 'refssu.udb')
             taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
         elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.fa')):
             refdb = os.path.join(self.refdb_dir, 'refssu.fa')
             taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
     else:
         if os.path.exists(os.path.join(self.refdb_dir, C.refdbs[dna_region])):
             refdb = os.path.join(self.refdb_dir, C.refdbs[dna_region])
             taxdb = os.path.join(self.refdb_dir, 'ref'+dna_region+'.tax')
         elif os.path.exists(os.path.join(self.refdb_dir, 'ref'+dna_region+'.fa')):
             refdb = os.path.join(self.refdb_dir, 'ref'+dna_region+'.fa')
             taxdb = os.path.join(self.refdb_dir, 'ref'+dna_region+'.tax')
         elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.udb')):
             refdb = os.path.join(self.refdb_dir, 'refssu.udb')
             taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
         elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.fa')):
             refdb = os.path.join(self.refdb_dir, 'refssu.fa')
             taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
         else:
             logger.error("Could not find a reference database for region "+dna_region+" in "+self.refdb_dir+" -- exiting")
             sys.exit()
     
     logger.info('tax_file '+taxdb)
     logger.info('ref_file '+refdb)        
     return (refdb,taxdb)   
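      # Usage sketch (region name is hypothetical): reversed regions are
      # normalised first, so
      #   refdb, taxdb = self.get_reference_databases('v6v4')
      # looks for the v4v6 files in self.refdb_dir (C.refdbs['v4v6'] or
      # refv4v6.fa with refv4v6.tax) and falls back to the full-length
      # refssu.udb / refssu.fa files if the region-specific ones are missing.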
 def info(self, lane_keys):
     """
     fill vamps_project_info table
     """
     logger.info("Starting vamps_upload: projects_info")
     
     if self.runobj.site == 'vamps':
         db_host    = 'vampsdb'
         db_name    = 'vamps'
     else:
         db_host    = 'vampsdev'
         db_name    = 'vamps'
     myconn = MyConnection(host=db_host, db=db_name)
     query = "SELECT last_name,first_name,email,institution from vamps_auth where user='******'" % (self.runobj.user)
     data = myconn.execute_fetch_select(query)
     
     fh = open(self.projects_info_file,'w')
      
     title="title"
     description='description'
     contact= data[0][1]+' '+data[0][0]
     email= data[0][2]
     institution= data[0][3]
     user = self.runobj.user
     fh.write("\t".join(["HEADER","project","title","description","contact", "email","institution","user","env_source_id"] )+"\n")
     fh.write("\t".join(["0",self.project, title, description, contact, email, institution, user, self.runobj.env_source_id] )+"\n")
     # if this project already exists in the db???
     # the next step should update the table rather than add new to the db
     
     fh.close()
     logger.info("Finishing VAMPS info()")
 def exports(self, lane_keys):
     """
     fill vamps_exports table
     """
     logger.info("Starting vamps_upload: exports")
     print "TODO: upload_vamps 5- exports"
     logger.info("Finishing VAMPS exports()")
Exemple #6
def new_vamps(runobj):
    """

    """
    logger.info("STARTING NEW_VAMPS()")
    idx_keys = get_keys(runobj)
    myvamps = Vamps(runobj, idx_keys)
    myvamps.create_vamps_files()
 def chimera_reference(self,lane_keys):
 
     chimera_region_found = False
     output = {}
     cluster_id_list = []
     for lane_key in lane_keys:
         
         dna_region  = self.run.samples[lane_key].dna_region
         if dna_region in C.regions_to_chimera_check:
             chimera_region_found = True
         else:
             logger.debug('region not checked: ' + dna_region)                    
             continue
         
         out_fileName = self.prefix[lane_key] + ".chimeras.db"      
         
         # which ref db to use?
         ref_db = ''
         if dna_region.upper() == 'ITS':
             logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
             ref_db = self.its_refdb
         else:
             logger.debug("using standard refdb: " + self.refdb)
             ref_db = self.refdb
             
         uchime_cmd = ["clusterize"]
         uchime_cmd.append(self.usearch_cmd)
         uchime_cmd.append("--uchime")
         uchime_cmd.append(self.files[lane_key]['abund'])
         uchime_cmd.append("--uchimeout")
         uchime_cmd.append(out_fileName)
         uchime_cmd.append("--db")
         uchime_cmd.append(ref_db)
         
         
         try:
             logger.info("chimera referenc command: " + str(uchime_cmd))
             output[lane_key] = subprocess.check_output(uchime_cmd)
             #print 'outsplit',output[lane_key].split()[2]
             cluster_id_list.append(output[lane_key].split()[2])
             #print 'Have %d bytes in output' % len(output)
             #print 'ref',lane_key,output,len(output)
             if len(output[lane_key]) < 50 and len(output[lane_key]) > 40:
                 logger.debug(lane_key + " uchime ref seems to have been submitted successfully")                    
             else:
                 print >>sys.stderr, "uchime ref may be broke"
            
         except OSError, e:
             print >>sys.stderr, "Execution failed:", e 
 def projects(self, lane_keys):
     """
     fill vamps_projects_datasets table
     """
     logger.info("Starting vamps_upload: projects_datasets")
     date_trimmed = 'unknown'
     dataset_description = self.dataset
     dataset_count = str(self.dataset_count)
     has_tax = '1' # true
     fh = open(self.projects_datasets_file,'w')
     
     fh.write("\t".join(["HEADER","project","dataset","dataset_count","has_tax", "date_trimmed","dataset_info"] )+"\n")
     fh.write("\t".join(["0", self.project, self.dataset, dataset_count, has_tax, date_trimmed, dataset_description] )+"\n")
     
     fh.close()
     logger.info("Finishing VAMPS projects()")
 def info(self, key, file_collector):
     """
     fill vamps_project_info.txt file
     """
     logger.info("Starting vamps_upload: projects_info")
     print "Starting vamps_upload: projects_info"
     if self.runobj.vamps_user_upload:
         user = self.runobj.user
         project = self.runobj.project
         sample_source_id = self.runobj.env_source_id
     else:
         if self.runobj.platform == 'illumina':
             user = self.runobj.samples[key].data_owner
             project = self.runobj.samples[key].project
             sample_source_id = self.runobj.samples[key].env_sample_source_id
         elif self.runobj.platform == '454':
             pass
         else:
             pass
     project = project[0].capitalize() + project[1:]
     cursor = self.conn.cursor()
     
     query = "SELECT last_name,first_name,email,institution from vamps_auth where user='******'" % (user)
     #data = myconn.execute_fetch_select(query)
     cursor.execute(query)
     data = cursor.fetchone()
     
     fh = open(file_collector['project_info_file'],'w')
     title="title"
     description='description'
     contact= data[1]+' '+data[0]
     email= data[2]
     institution= data[3]
     
     fh.write("\t".join(["HEADER","project","title","description","contact", "email","institution","user","env_source_id"] )+"\n")
     fh.write("\t"+"\t".join([project, title, description, contact, email, institution, user, sample_source_id] )+"\n")
     # if this project already exists in the db???
     # the next step should update the table rather than add new to the db
     
     fh.close()
     self.conn.commit()
     cursor.close()
     
     logger.info("Finishing VAMPS info()")
    def chimera_denovo(self,lane_keys):
        
        chimera_region_found = False
        output = {}
        cluster_id_list = []
        for lane_key in lane_keys:
            
            dna_region  = self.run.samples[lane_key].dna_region
            if dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' +  dna_region)
                continue
            
            out_fileName = self.prefix[lane_key] + ".chimeras.txt"        
            #clusterize uchime454 -replace -r self.rundate -t chimeras_denovo
            
            uchime_cmd = ["clusterize"]
            uchime_cmd.append(self.usearch_cmd)
            uchime_cmd.append("--uchime")
            uchime_cmd.append(self.files[lane_key]['abund'])
            uchime_cmd.append("--uchimeout")
            uchime_cmd.append(out_fileName)
            uchime_cmd.append("--abskew")
            uchime_cmd.append(self.abskew)
            
            try:
                logger.info("chimera denovo command: " + str(uchime_cmd))
                output[lane_key] = subprocess.check_output(uchime_cmd)
                #print output[lane_key]
                #print output[lane_key].split()[2]
                cluster_id_list.append(output[lane_key].split()[2])
                #print 'Have %d bytes in output' % len(output)
                #print 'denovo',lane_key,output,len(output)
                # len(output) is normally = 47
                if len(output[lane_key]) < 50 and len(output[lane_key]) > 40:
                    logger.debug(lane_key + " uchime denovo seems to have been submitted successfully")
                else:
                    logger.debug("uchime denovo may have broken")                    

            except OSError, e:
                print >>sys.stderr, "Execution failed:", e               
Exemple #12
    def check_for_input_files(self, data_object):
    
        file_count = 0
        files_list = []
        imports_list = []
        lanes_list = []


        #input_dir = os.path.join(data_object['general']['input_dir'],"fasta")
        input_dir = data_object['general']['input_dir']
        if os.path.isdir(input_dir):
            p = data_object['general']['input_dir'], '*'+data_object['general']['input_file_suffix']

            
            for infile in glob.glob( os.path.join(input_dir, '*'+data_object['general']['input_file_suffix']) ):
                files_list.append(os.path.basename(infile))
                for x in data_object:
                    if 'file_prefix' in data_object[x]:
                        pass
                        #print data_object[x]['file_prefix']
                        
                        #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']:
                            #lanes_list.append(data_object[x]['lane'])
                        
                file_count += 1
        else:

            logger.info("No input directory or directory permissions problem: "+input_dir)
            print "No input directory or directory permissions problem: "+input_dir
        if not file_count:
            #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'")
            logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'")

        data_object['general']['files_list'] = files_list
        data_object['general']['file_count'] = file_count
        # all the files in an illumina directory should be the same type
        #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count
        #data_object['general']['lanes_list'] = lanes_list
        #print "Files LIST",data_object['general']['files_list']
        
        
        return data_object
    def check_for_input_files(self, data_object):

        file_count = 0
        files_list = []
        imports_list = []
        lanes_list = []


        #input_dir = os.path.join(data_object['general']['input_dir'],"fasta")
        input_dir = data_object['general']['input_dir']
        if os.path.isdir(input_dir):
            p = data_object['general']['input_dir'], '*'+data_object['general']['input_file_suffix']


            for infile in glob.glob( os.path.join(input_dir, '*'+data_object['general']['input_file_suffix']) ):
                files_list.append(os.path.basename(infile))
                for x in data_object:
                    if 'file_prefix' in data_object[x]:
                        pass
                        #print(data_object[x]['file_prefix'])

                        #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']:
                            #lanes_list.append(data_object[x]['lane'])

                file_count += 1
        else:

            logger.info("No input directory or directory permissions problem: "+input_dir)
            print("No input directory or directory permissions problem: "+input_dir)
        if not file_count:
            #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'")
            logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'")

        data_object['general']['files_list'] = files_list
        data_object['general']['file_count'] = file_count
        # all the files in an illumina directory should be the same type
        #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count
        #data_object['general']['lanes_list'] = lanes_list
        #print("Files LIST",data_object['general']['files_list'])


        return data_object
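        # Example (hypothetical values): with input_dir='/data/run01' and
        # input_file_suffix='.fastq.gz', the glob above expands to
        # '/data/run01/*.fastq.gz', and files_list collects the basename of
        # every match, e.g. ['lane1_ACGT.fastq.gz', 'lane2_TGCA.fastq.gz'].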
    def check_for_uniques_files(self,keys):
        logger.info("Checking for uniques file")
        if self.runobj.platform == 'vamps':
            # one fasta file or (one project and dataset from db)
            if os.path.exists(self.runobj.fasta_file):
                output_dir = os.path.join(self.basedir,keys[0])
                uniques_file = os.path.join(output_dir, keys[0]+'.unique.fa')
                names_file = os.path.join(output_dir, keys[0]+'.names')
                #import pipeline.fastaunique as fu
                #mothur_cmd = C.mothur_cmd+" \"#unique.seqs(fasta="+self.runobj.fasta_file+", outputdir="+os.path.join(self.basedir,keys[0])+"/);\""; 
                fastaunique_cmd = C.fastaunique_cmd +" -x -i "+self.runobj.fasta_file+" -o "+uniques_file+" -n "+names_file 
                print fastaunique_cmd
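                # For reference, the command printed above looks roughly like
                # (paths are hypothetical; the binary comes from C.fastaunique_cmd):
                #   fastaunique -x -i /path/upload.fa -o <basedir>/<key>/<key>.unique.fa -n <basedir>/<key>/<key>.names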
                #mothur_cmd = site_base+"/clusterize_vamps -site vampsdev -rd "+user+"_"+runcode+"_gast -rc "+runcode+" -u "+user+" /bioware/mothur/mothur \"#unique.seqs(fasta="+fasta_file+");\"";    
                subprocess.call(fastaunique_cmd, shell=True)
                
                #shutil.move('a.txt', 'b.kml')
                #os.rename(filename, filename[7:])
                #os.rename(filename, filename[7:])
            else:
                if self.runobj.project and self.runobj.dataset:
                    pass
                else:
                    pass
            #get from database
        else:
            pass
            
            
#         for key in keys:
#             fasta_file = ""
#             output_dir = os.path.join(self.basedir,key)
#             unique_file = os.path.join(output_dir, key+'.unique.fa')
#             if not os.path.exists(unique_file):
#                 mothur_cmd = C.mothur_cmd+" \"#unique.seqs(fasta="+fasta_file+", outputdir="+os.path.join(self.basedir,key)+"/);\""; 
#         
#                 #mothur_cmd = site_base+"/clusterize_vamps -site vampsdev -rd "+user+"_"+runcode+"_gast -rc "+runcode+" -u "+user+" /bioware/mothur/mothur \"#unique.seqs(fasta="+fasta_file+");\"";    
#                 subprocess.call(mothur_cmd, shell=True)
        return ("SUCCESS","check for uniques")        
Exemple #15
    def chimera_denovo(self):
        chimera_region_found = False
        output = {}
        cluster_id_list = []



        for idx_key in self.idx_keys:
            input_file_name  = os.path.join(self.indir,  idx_key +'.abund.fa')  
            if os.path.isfile(input_file_name):
                output_file_name = os.path.join(self.outdir, idx_key +'.chimera.denovo')
                #open(output_file_name, 'a').close()  # make sure file exists
                log_file = os.path.join(self.outdir,idx_key+".denovo.log")

                dna_region       = self.runobj.samples[idx_key].dna_region
                logger.debug("dna_region = %s" % dna_region)
                if self.runobj.vamps_user_upload:
                    # VAMPS users can chimera check regardless of region chosen
                    chimera_region_found = True
                else:
                    if dna_region in C.regions_to_chimera_check:
                        chimera_region_found = True
                    else:
                        logger.debug('region not checked: ' +  dna_region)
                        continue


                self.utils.print_both("input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name))

    #             uchime_cmd = C.clusterize_cmd
    #             uchime_cmd += " "
    #             uchime_cmd += self.usearch_cmd
    #             uchime_cmd += " --uchime "
    #             uchime_cmd += input_file_name
    #             uchime_cmd += " --uchimeout "
    #             uchime_cmd += output_file_name
    #             uchime_cmd += " --abskew "
    #             uchime_cmd += self.abskew
                uchime_cmd=''
                if self.use_cluster:
                    uchime_cmd += C.clusterize_cmd
                    uchime_cmd += " "
                    uchime_cmd += " -log "
                    uchime_cmd += log_file
                    uchime_cmd += " "
                uchime_cmd += self.usearch_cmd
                uchime_cmd += " -uchime_denovo "
                uchime_cmd += input_file_name
                uchime_cmd += " -uchimeout "
                uchime_cmd += output_file_name

                logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd))
                 
                try:
                    logger.info("chimera denovo command: " + str(uchime_cmd))
    #                 subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                     

                    #output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    output[idx_key] = subprocess.check_output(uchime_cmd, shell=True)
                    self.utils.print_both("output[idx_key] = %s" % output[idx_key])
                    self.utils.print_both(output[idx_key].split()[2])
                    cluster_id_list.append(output[idx_key].split()[2])



                except OSError, e:
                    self.utils.print_both("Problems with this command: %s" % (uchime_cmd))
                    if self.utils.is_local():
                        print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                    else:
                        print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                        self.utils.print_both("Execution of %s failed: %s" % (uchime_cmd, e))
                        raise                  
Exemple #16
def chimera(runobj):
    chimera_cluster_ids = [] 
    logger.debug("Starting Chimera Checker")
    # let's read the trim status file out here and keep those details out of the Chimera code
    idx_keys = get_keys(runobj)
    #new_lane_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"]
    # Open run STATUS File here.
    # open in append mode because we may start the run in the middle
    # say at the gast stage and don't want to overwrite.
    # if we re-run trimming we'll get two trim status reports
    runobj.run_status_file_h = open(runobj.run_status_file_name, "a")
    
    mychimera = Chimera(runobj)
    logger.debug("\nStarting DeNovo Chimera")
    c_den    = mychimera.chimera_denovo()
    logger.debug("Ending DeNovo Chimera")
    if c_den[0] == 'SUCCESS':
        chimera_cluster_ids += c_den[2]   # add a list to a list
        logger.debug("chimera_cluster_ids: "+' '.join(chimera_cluster_ids))
        chimera_code='PASS'
    elif c_den[0] == 'NOREGION':
        chimera_code='NOREGION'
    elif c_den[0] == 'FAIL':
        chimera_code = 'FAIL'
    else:
        chimera_code='FAIL'

    logger.debug("Chimera DeNovo Code: "+chimera_code)
    logger.debug("\nStarting Reference Chimera")
    c_ref    = mychimera.chimera_reference()
    
    if c_ref[0] == 'SUCCESS':
        chimera_cluster_ids += c_ref[2]
        chimera_code='PASS'
    elif c_ref[0] == 'NOREGION':
        chimera_code = 'NOREGION'
    elif c_ref[0] == 'FAIL':
        chimera_code='FAIL'
    else:
        chimera_code='FAIL'
    
    #print chimera_cluster_ids
    runobj.chimera_status_file_h = open(runobj.chimera_status_file_name,"w")
    if chimera_code == 'PASS':  
        
        if runobj.use_cluster:
            chimera_cluster_code = wait_for_cluster_to_finish(chimera_cluster_ids) 
            if chimera_cluster_code[0] == 'SUCCESS':
                logger.info("Chimera checking finished successfully")
                runobj.chimera_status_file_h.write("CHIMERA SUCCESS\n")
                runobj.run_status_file_h.write("CHIMERA SUCCESS\n")
                
            else:
                logger.info("3-Chimera checking Failed")
                runobj.chimera_status_file_h.write("3-CHIMERA ERROR: "+str(chimera_cluster_code[1])+" "+str(chimera_cluster_code[2])+"\n")
                runobj.run_status_file_h.write("3-CHIMERA ERROR: "+str(chimera_cluster_code[1])+" "+str(chimera_cluster_code[2])+"\n")
                sys.exit("3-Chimera checking Failed")
        else:
            chimera_cluster_code = ['SUCCESS','Not using cluster']
            logger.info("Chimera checking finished without using cluster")
            runobj.chimera_status_file_h.write("CHIMERA SUCCESS--no cluster\n")
            runobj.run_status_file_h.write("CHIMERA SUCCESS--no cluster\n")
    elif chimera_code == 'NOREGION':
        logger.info("No regions found that need chimera checking")
        runobj.chimera_status_file_h.write("CHIMERA CHECK NOT NEEDED\n")
        runobj.run_status_file_h.write("CHIMERA CHECK NOT NEEDED\n")
        
    elif chimera_code == 'FAIL':
        logger.info("1-Chimera checking Failed")
        runobj.chimera_status_file_h.write("1-CHIMERA ERROR: \n")
        runobj.run_status_file_h.write("1-CHIMERA ERROR: \n")
        sys.exit("1-Chimera Failed")
    else:
        logger.info("2-Chimera checking Failed")
        runobj.chimera_status_file_h.write("2-CHIMERA ERROR: \n")
        runobj.run_status_file_h.write("2-CHIMERA ERROR: \n")
        sys.exit("2-Chimera checking Failed")
    
    sleep(2) 
    
    if  chimera_code == 'PASS' and  chimera_cluster_code[0] == 'SUCCESS':
        logger.info("Writing Chimeras to deleted files")
        mychimera.write_chimeras_to_deleted_file()

        # should also recreate fasta
        # then read chimera files and place (or replace) any chimeric read_id
        # into the deleted file.
        
        mymblutils = MBLPipelineFastaUtils(idx_keys, runobj)
        
        # write new cleaned files that remove chimeras if appropriate
        # these are in fasta_mbl_pipeline.py
        # the cleaned files are renamed to the original names:
        # lane_key.unique.fa
        # lane_key.trimmed.fa
        # lane_key.names        -- 
        # lane_key.abund.fa     -- this file is for the uclust chimera script
        # lane_key.deleted.txt  -- no change in this file
        # THE ORDER IS IMPORTANT HERE:
        mymblutils.write_clean_fasta_file()
        mymblutils.write_clean_names_file()
        mymblutils.write_clean_uniques_file()
        mymblutils.write_clean_abundance_file()
 def exports(self, key, refid_collector, tax_collector, read_id_lookup, file_collector):
     """
     fill vamps_exports.txt file
 
     """
     logger.info("Starting vamps_upload: exports")
     print "Starting vamps_upload: exports"
     if self.runobj.vamps_user_upload:
         project = self.runobj.project
         dataset = key
     else:
         if self.runobj.platform == 'illumina':
             project = self.runobj.samples[key].project
             dataset = self.runobj.samples[key].dataset
         elif self.runobj.platform == '454':
             pass
         else:
             pass
     project = project[0].capitalize() + project[1:]
     project_dataset = project+'--'+dataset
     date_trimmed = 'unknown'
     dataset_description = dataset    
     
     fh = open(file_collector['export_file'],'w')
     # t.read_id, t.project, t.dataset, g.refhvr_ids, x.distance, x.taxonomy, t.sequence, x.rank," " t.entry_date
     fh.write("\t".join(["HEADER","read_id","project","dataset","refhvr_ids","distance","taxonomy","sequence", "rank","entry_date"] )+"\n")
     today     = str(datetime.date.today())
     # open original fasta file
     if os.path.exists(file_collector['original_fa_file']) and os.path.getsize(file_collector['original_fa_file']) > 0:
         f = FastaReader(file_collector['original_fa_file'])
         while f.next():
             datarow = ['']
             id = f.id.split('|')[0]
             seq = f.seq
             if id in read_id_lookup:
                 tax = read_id_lookup[id]
             else: 
                 tax = ''
             if tax in tax_collector:
                 rank = tax_collector[tax]['rank']
             else:
                 rank = 'NA'
             
             if id in refid_collector:
                 distance = refid_collector[id]['distance']
                 refhvr_ids = refid_collector[id]['refhvr_ids']
             else:
                 distance = '1.0'
                 refhvr_ids = '0'
             datarow.append(id)
             datarow.append(project)
             datarow.append(dataset)
             datarow.append(refhvr_ids)
             datarow.append(distance)
             datarow.append(tax)
             datarow.append(seq)
             
             datarow.append(rank)
             datarow.append(today)
                       
             
             
             w = "\t".join(datarow)
             #print 'w',w
             fh.write(w+"\n")
             
         fh.close()    
     logger.info("Finishing VAMPS exports()")
    def chimera_denovo(self):
        chimera_region_found = False
        output = {}
        cluster_id_list = []

        for idx_key in self.idx_keys:
            input_file_name = os.path.join(self.indir, idx_key + '.abund.fa')
            if os.path.isfile(input_file_name):
                output_file_name = os.path.join(self.outdir,
                                                idx_key + '.chimera.denovo')
                #open(output_file_name, 'a').close()  # make sure file exists
                log_file = os.path.join(self.outdir, idx_key + ".denovo.log")

                dna_region = self.runobj.samples[idx_key].dna_region
                logger.debug("dna_region = %s" % dna_region)
                if self.runobj.vamps_user_upload:
                    # VAMPS users can chimera check regardless of region chosen
                    chimera_region_found = True
                else:
                    if dna_region in C.regions_to_chimera_check:
                        chimera_region_found = True
                    else:
                        logger.debug('region not checked: ' + dna_region)
                        continue

                self.utils.print_both(
                    "input_file_name = %s \noutput_file_name = %s" %
                    (input_file_name, output_file_name))

                #             uchime_cmd = C.clusterize_cmd
                #             uchime_cmd += " "
                #             uchime_cmd += self.usearch_cmd
                #             uchime_cmd += " --uchime "
                #             uchime_cmd += input_file_name
                #             uchime_cmd += " --uchimeout "
                #             uchime_cmd += output_file_name
                #             uchime_cmd += " --abskew "
                #             uchime_cmd += self.abskew
                uchime_cmd = ''
                if self.use_cluster:
                    uchime_cmd += C.clusterize_cmd
                    uchime_cmd += " "
                    uchime_cmd += " -log "
                    uchime_cmd += log_file
                    uchime_cmd += " "
                uchime_cmd += self.usearch_cmd
                uchime_cmd += " -uchime_denovo "
                uchime_cmd += input_file_name
                uchime_cmd += " -uchimeout "
                uchime_cmd += output_file_name

                logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd))

                try:
                    logger.info("chimera denovo command: " + str(uchime_cmd))
                    #                 subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

                    self.utils.print_both("chimera denovo command: " +
                                          str(uchime_cmd))
                    #output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    output[idx_key] = subprocess.check_output(uchime_cmd,
                                                              shell=True)
                    self.utils.print_both("chimera denovo result: " +
                                          str(output[idx_key]))
                    #self.utils.print_both("output[idx_key] = %s" % output[idx_key])
                    #if idx_key in output and len(output[idx_key].split()) > 1:
                    #self.utils.print_both(output[idx_key].split()[2])
                    items = output[idx_key].split()
                    if len(items) > 2:
                        cluster_id_list.append(items[2])

                except OSError:
                    e = sys.exc_info()[1]
                    self.utils.print_both(
                        "Error: Problems with this command: %s" % (uchime_cmd))
                    if self.utils.is_local():
                        print >> sys.stderr, "Error: Execution of %s failed: %s" % (
                            uchime_cmd, e)
                    else:
                        print >> sys.stderr, "Error: Execution of %s failed: %s" % (
                            uchime_cmd, e)
                        self.utils.print_both(
                            "Error: Execution of %s failed: %s" %
                            (uchime_cmd, e))
                        raise

# ???
        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')

        # ???
#         for idx_key in output:
#             if len(output[idx_key]) > 50 or len(output[idx_key]) < 40:
#                 return ('ERROR','uchime ref may have broken or empty', idx_key)

# finally
        self.utils.print_both('Finished Chimera Denovo')
        if cluster_id_list:
            return ('SUCCESS',
                    'uchime denovo seems to have been submitted successfully',
                    cluster_id_list)
        else:
            return ('ERROR', 'uchime denovo returned no cluster IDs',
                    cluster_id_list)
    def assign_taxonomy(self, gast_dir, dna_region, names_file,  ref_taxa):
        from pipeline.taxonomy import Taxonomy,consensus
        #results = uc_results
        results = {}
        
        # open gast_file to get results
        tagtax_terse_filename     = os.path.join(gast_dir,"tagtax_terse")
        tagtax_long_filename     = os.path.join(gast_dir,"tagtax_long")
        tagtax_terse_fh = open(tagtax_terse_filename,'w')
        tagtax_long_fh = open(tagtax_long_filename,'w')
        tagtax_long_fh.write("\t".join(["read_id","taxonomy","distance","rank","refssu_count","vote","minrank","taxa_counts","max_pcts","na_pcts","refhvr_ids"])+"\n")
        gast_file          = os.path.join(gast_dir, "gast"+dna_region)
        if not os.path.exists(gast_file):
            logger.info("Could not find gast file: "+gast_file)
            sys.exit("Could not find gast file: "+gast_file)
        for line in  open(gast_file,'r'): 
            # must split on tab because last field may be empty and must be maintained as blank
            data=line.strip().split("\t")
            if len(data) == 3:
                data.append("")
            # 0=id, 1=ref, 2=dist, 3=align
            results[data[0]]=[data[1].split('|')[0],data[2],data[3]]
            
        for read in results:
            #print read, results[read]
            pass
        
        for line in  open(names_file,'r'):
            data=line.strip().split("\t")
            dupes = data[1].split(",")
            read  = data[0]
            taxObjects  = []
            distance    =0
            refs_for    ={}
            #print 'read',read
            if read not in results:
                results[read]=["Unknown", '1', "NA", '0', '0', "NA", "0;0;0;0;0;0;0;0", "0;0;0;0;0;0;0;0", "100;100;100;100;100;100;100;100"]
                refs_for[read] = [ "NA" ]
            else:
                #print 'read in res',read, results[read]
                for i in range( 0,len(results[read])):
                #for resultread in results[read]:
                    #print results[read]
                    ref = results[read][0]
                    if ref in ref_taxa:
                        for tax in ref_taxa[ref]:
                            for t in tax:
                                taxObjects.append(Taxonomy(t))
                    else:
                        pass
                    if read in refs_for:
                        if results[read][0] not in refs_for[read]:
                            refs_for[read].append(results[read][0])  
                    else:
                        refs_for[read] = [results[read][0]]                   
                     
                    # should all be the same distance
                    distance = results[read][1]
                #Lookup the consensus taxonomy for the array
                taxReturn = consensus(taxObjects, C.majority)
                
                # 0=taxObj, 1=winning vote, 2=minrank, 3=rankCounts, 4=maxPcts, 5=naPcts;
                taxon = taxReturn[0].taxstring()
                rank = taxReturn[0].depth()
                #print read,taxon,rank,taxReturn[0],taxReturn[1]
                if not taxon: taxon = "Unknown"
            
                # (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts)
                results[read] = [ taxon, str(distance), rank, str(len(taxObjects)), str(taxReturn[1]), taxReturn[2], taxReturn[3], taxReturn[4], taxReturn[5] ] 
                #print "\t".join([read,taxon, str(distance), rank, str(len(taxObjects)), str(taxReturn[1]), taxReturn[2], taxReturn[3], taxReturn[4], taxReturn[5]]) + "\n"
#read_id taxonomy        distance        rank    refssu_count    vote    minrank taxa_counts     max_pcts        na_pcts refhvr_ids
#D4ZHLFP1:25:B022DACXX:3:1101:12919:40734 1:N:0:TGACCA|frequency:162     Bacteria;Proteobacteria;Gammaproteobacteria     0.117   class   2       100     genus   1;1;1;2;2;2;0;0 100;100;100;50;50;50;0;0        0;0;0;0;0;0;100;100     v6_CI671
#D4ZHLFP1:25:B022DACXX:3:1101:10432:76870 1:N:0:TGACCA|frequency:105     Bacteria;Proteobacteria;Gammaproteobacteria     0.017   class   1       100     class   1;1;1;0;0;0;0;0 100;100;100;0;0;0;0;0   0;0;0;100;100;100;100;100       v6_BW306
                
            # Replace hash with final taxonomy results, for each copy of the sequence
            for d in dupes:
               # print OUT join("\t", $d, @{$results{$read}}, join(",", sort @{$refs_for{$read}})) . "\n";
                tagtax_long_fh.write( d+"\t"+"\t".join(results[read])+"\t"+','.join(sorted(refs_for[read]))  + "\n")
                tagtax_terse_fh.write(d+"\t"+results[read][0]+"\t"+results[read][2]+"\t"+results[read][3]+"\t"+','.join(sorted(refs_for[read]))+"\t"+results[read][1]+"\n")
               
        tagtax_terse_fh.close()
        tagtax_long_fh.close()
        return results
def chimera(runobj):
    chimera_cluster_ids = [] 
    logger.debug("Starting Chimera Checker")
    # let's read the trim status file out here and keep those details out of the Chimera code
    idx_keys = get_keys(runobj)
    #new_lane_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"]
    
    mychimera = Chimera(runobj)
    
    c_den    = mychimera.chimera_denovo(idx_keys)
    if c_den[0] == 'SUCCESS':
        chimera_cluster_ids += c_den[2]
        chimera_code='PASS'
    elif c_den[0] == 'NOREGION':
        chimera_code='NOREGION'
    elif c_den[0] == 'FAIL':
        chimera_code = 'FAIL'
    else:
        chimera_code='FAIL'
    
    c_ref    = mychimera.chimera_reference(idx_keys)
    
    if c_ref[0] == 'SUCCESS':
        chimera_cluster_ids += c_ref[2]
        chimera_code='PASS'
    elif c_ref[0] == 'NOREGION':
        chimera_code = 'NOREGION'
    elif c_ref[0] == 'FAIL':
        chimera_code='FAIL'
    else:
        chimera_code='FAIL'
    
    #print chimera_cluster_ids
    runobj.chimera_status_file_h = open(runobj.chimera_status_file_name,"w")
    if chimera_code == 'PASS':  
        
        chimera_cluster_code = wait_for_cluster_to_finish(chimera_cluster_ids) 
        if chimera_cluster_code[0] == 'SUCCESS':
            logger.info("Chimera checking finished successfully")
            runobj.chimera_status_file_h.write("CHIMERA SUCCESS\n")
            runobj.run_status_file_h.write("CHIMERA SUCCESS\n")
            
        else:
            logger.info("3-Chimera checking Failed")
            runobj.chimera_status_file_h.write("3-CHIMERA ERROR: "+str(chimera_cluster_code[1])+" "+str(chimera_cluster_code[2])+"\n")
            runobj.run_status_file_h.write("3-CHIMERA ERROR: "+str(chimera_cluster_code[1])+" "+str(chimera_cluster_code[2])+"\n")
            sys.exit("3-Chimera checking Failed")
            
    elif chimera_code == 'NOREGION':
        logger.info("No regions found that need chimera checking")
        runobj.chimera_status_file_h.write("CHIMERA CHECK NOT NEEDED\n")
        runobj.run_status_file_h.write("CHIMERA CHECK NOT NEEDED\n")
        
    elif chimera_code == 'FAIL':
        logger.info("1-Chimera checking Failed")
        runobj.chimera_status_file_h.write("1-CHIMERA ERROR: \n")
        runobj.run_status_file_h.write("1-CHIMERA ERROR: \n")
        sys.exit("1-Chimera Failed")
    else:
        logger.info("2-Chimera checking Failed")
        runobj.chimera_status_file_h.write("2-CHIMERA ERROR: \n")
        runobj.run_status_file_h.write("2-CHIMERA ERROR: \n")
        sys.exit("2-Chimera checking Failed")
    sleep(2)   
    if  chimera_code == 'PASS' and  chimera_cluster_code[0] == 'SUCCESS':
        mychimera.write_chimeras_to_deleted_file(idx_keys)
        # should also recreate fasta
        # then read chimera files and place (or replace) any chimeric read_id
        # into the deleted file.
        
        mymblutils = MBLPipelineFastaUtils(idx_keys, mychimera.outdir)
        
        # write new cleaned files that remove chimeras if appropriate
        # these are in fasta_mbl_pipeline.py
        # the cleaned files are renamed to the original names:
        # lane_key.unique.fa
        # lane_key.trimmed.fa
        # lane_key.names        -- 
        # lane_key.abund.fa     -- this file is for the uclust chimera script
        # lane_key.deleted.txt  -- no change in this file
        # THE ORDER IS IMPORTANT HERE:
        mymblutils.write_clean_fasta_file()
        mymblutils.write_clean_names_file()
        mymblutils.write_clean_uniques_file()
        mymblutils.write_clean_abundance_file()
        # write keys file for each lane_key - same fields as db table? for easy writing
        # write primers file for each lane_key
 
        
        # Write new clean files to the database
        # rawseq table not used
        # trimseq
        # runkeys
        # primers
        # run primers
        mymblutils.write_clean_files_to_database()
    # this will read the args and ini file and return a dictionary

    data_object = v.validate_args()
#    for attr in dir(data_object):
#        print("obj.%s = %s" % (attr, getattr(data_object, attr)))



    # set logging


    print("\nLog Level set to:", args.loglevel)
    logger.setLevel(args.loglevel.upper() )

    logger.info("Starting pipeline")
    ##############
    #
    #  Test cl parameters
    #
    ##############
    # CL RULES:
    # for ini file:  (no plurals)
    # 1) Only input_dir shall be supplied on the CL -- no input file names
    #
    # 2) All input files should be in the same directory AND of the same format
    #
    # 3) Supply an input_file_suffix on the CL if there are varying file types in the
    #       input_dir and you are only using some (by default all files are read)
    # 4)
    #
Exemple #22
    def chimera_reference(self):

        chimera_region_found = False
        output = {}
        cluster_id_list = []
        for idx_key in self.run_keys:
             
            dna_region  = self.runobj.samples[idx_key].dna_region
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            else:
                if dna_region in C.regions_to_chimera_check:
                    chimera_region_found = True
                else:
                    logger.debug('region not checked: ' + dna_region)                    
                    continue


            #out_file_name = self.prefix[idx_key] + ".chimeras.db"
            input_file_name  = os.path.join(self.indir,  idx_key +'.abund.fa')
            if os.path.isfile(input_file_name):
                output_file_name    = os.path.join(self.outdir,idx_key+".chimera.ref") 
                #open(output_file_name, 'a').close()  # make sure file exists
                log_file = os.path.join(self.outdir,idx_key+".ref.log") 
                logger.debug("OUT FILE NAME: " + output_file_name)
                # which ref db to use?
                ref_db = ''
                if dna_region.upper() == 'ITS':
                    logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
                    ref_db = self.its_refdb
                else:
                    logger.debug("using standard refdb: " + self.refdb)
                    ref_db = self.refdb
                     
                uchime_cmd=''
                if self.use_cluster:
                    uchime_cmd = C.clusterize_cmd
                    uchime_cmd += " "
                    uchime_cmd += " -log "
                    uchime_cmd += log_file
                    uchime_cmd += " "
                uchime_cmd += self.usearch_cmd
                uchime_cmd += " -uchime_ref "
                uchime_cmd += input_file_name
                uchime_cmd += " -uchimeout "
                uchime_cmd += output_file_name
                uchime_cmd += " -db "
                uchime_cmd += ref_db
                uchime_cmd += " -strand "
                uchime_cmd += "plus"

                logger.debug("uchime_ref_cmd = %s" % (uchime_cmd))  
                              
                try:
                    logger.info("chimera reference command: " + str(uchime_cmd))
                    output[idx_key] = subprocess.check_output(uchime_cmd, shell=True)
                    #print 'outsplit',output[idx_key].split()[2]
                    cluster_id_list.append(output[idx_key].split()[2])
                    #print 'Have %d bytes in output' % len(output)
                    #print 'ref',idx_key,output,len(output)
                    if len(output[idx_key]) < 50 and len(output[idx_key]) > 40:
                        logger.debug(idx_key + " uchime ref seems to have been submitted successfully")                    
                    else:
                        if self.use_cluster:
                            print >>sys.stderr, "uchime ref may be broke"
                            self.utils.print_both("uchime ref may be broke")
                    
                except OSError, e:
                    print >>sys.stderr, "Execution of chimera_reference failed: %s" % (uchime_cmd, e)
                    self.utils.print_both("Execution of chimera_reference failed: %s" % (uchime_cmd, e))
                    raise
    def chimera_denovo(self):
        chimera_region_found = False
        output = {}
        cluster_id_list = []


        
        for idx_key in self.idx_keys:
            input_file_name  = os.path.join(self.indir,  idx_key +'.abund.fa')  
            output_file_name = os.path.join(self.outdir, idx_key +'.chimera.denovo')
            #open(output_file_name, 'a').close()  # make sure file exists
            log_file = os.path.join(self.outdir,idx_key+".denovo.log")

            dna_region       = self.runobj.samples[idx_key].dna_region
            logger.debug("dna_region = %s" % dna_region)
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            else:
                if dna_region in C.regions_to_chimera_check:
                    chimera_region_found = True
                else:
                    logger.debug('region not checked: ' +  dna_region)
                    continue
             
 
            print "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name)
 
#             uchime_cmd = C.clusterize_cmd
#             uchime_cmd += " "
#             uchime_cmd += self.usearch_cmd
#             uchime_cmd += " --uchime "
#             uchime_cmd += input_file_name
#             uchime_cmd += " --uchimeout "
#             uchime_cmd += output_file_name
#             uchime_cmd += " --abskew "
#             uchime_cmd += self.abskew
            
            uchime_cmd = C.clusterize_cmd
            uchime_cmd += " "
            uchime_cmd += " -log "
            uchime_cmd += log_file
            uchime_cmd += " "
            uchime_cmd += self.usearch_cmd
            uchime_cmd += " -uchime_denovo "
            uchime_cmd += input_file_name
            uchime_cmd += " -uchimeout "
            uchime_cmd += output_file_name

            logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd))
             
            try:
                logger.info("chimera denovo command: " + str(uchime_cmd))
#                 subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                 

                #output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                output[idx_key] = subprocess.check_output(uchime_cmd, shell=True)
                print "output[idx_key] = %s" % output[idx_key]
                print output[idx_key].split()[2]
                cluster_id_list.append(output[idx_key].split()[2])

 
  
 
            except OSError, e:
                print "Problems with this command: %s" % (uchime_cmd)
                if self.utils.is_local():
                    print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                else:
                    print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                    raise                  
Exemple #24
    def trim_by_quality(self,
                        infile=None,
                        format='sanger',
                        wsize=1,
                        wstep=1,
                        trim_ends='53',
                        agg_action='min',
                        exc_count=0,
                        score_comp='>=',
                        qual_score=0,
                        filter_first50=False,
                        filter_Ns=False,
                        filter_Nx=0,
                        failed_fastq=False,
                        length=0,
                        trim=0,
                        clip=0,
                        keep_zero_length=False):
        #format
        window_size = wsize
        window_step = wstep
        #trim_ends
        aggregation_action = agg_action
        exclude_count = exc_count
        score_comparison = score_comp
        quality_score = qual_score
        filter_length = length
        trim_length = trim
        clip_length = clip
        if not infile:
            sys.exit("illumina_fastq_trimmer: Need to specify an input file")

        if window_size < 1:
            sys.exit(
                'illumina_fastq_trimmer: You must specify a strictly positive window size'
            )

        if window_step < 1:
            sys.exit(
                'illumina_fastq_trimmer: You must specify a strictly positive step size'
            )

        print("\nRunning illumina Filtering")

        in_filepath = os.path.join(self.indir, infile)
        try:
            filebase = infile.split('/')[1].split('.')[0]
        except:
            filebase = infile.split('.')[0]

        out_filename = filebase + ".filtered.fastq"
        out_filepath = os.path.join(self.outdir, out_filename)

        #determine an exhaustive list of window indexes that can be excluded from aggregation
        exclude_window_indexes = []
        last_exclude_indexes = []
        for exclude_count in range(min(exclude_count, window_size)):
            if last_exclude_indexes:
                new_exclude_indexes = []
                for exclude_list in last_exclude_indexes:
                    for window_index in range(window_size):
                        if window_index not in exclude_list:
                            new_exclude = sorted(exclude_list + [window_index])
                            if new_exclude not in exclude_window_indexes + new_exclude_indexes:
                                new_exclude_indexes.append(new_exclude)
                exclude_window_indexes += new_exclude_indexes
                last_exclude_indexes = new_exclude_indexes
            else:
                for window_index in range(window_size):
                    last_exclude_indexes.append([window_index])
                exclude_window_indexes = list(last_exclude_indexes)
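        # Worked example (derived from the loop above): with window_size=3 and
        # exc_count=2, exclude_window_indexes ends up as
        #   [[0], [1], [2], [0, 1], [0, 2], [1, 2]]
        # i.e. every way of dropping up to two positions from the window before
        # aggregating its quality scores.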
        out = fastqWriter(open(out_filepath, 'wb'), format=format)
        action = ACTION_METHODS[aggregation_action]
        if failed_fastq:
            fail = fastqWriter(open(out_filepath + '.failed', 'wb'),
                               format=format)
        num_reads = None
        num_reads_excluded = 0
        count_of_unchaste = 0
        count_of_trimmed = 0
        count_of_first50 = 0
        count_of_Ns = 0
        if self.runobj.compressed:
            import gzip
            try:
                logger.info("illumina_filtering: opening compressed file: " +
                            in_filepath)
                fp = gzip.open(in_filepath)
            except:
                logger.info("illumina_filtering: opening uncompressed file: " +
                            in_filepath)
                fp = open(in_filepath)
        else:
            logger.info("illumina_filtering: opening uncompressed file: " +
                        in_filepath)
            fp = open(in_filepath)
        for num_reads, fastq_read in enumerate(fastqReader(fp, format=format)):
            ############################################################################################
            # Put chastity code here
            #print(fastq_read.identifier)
            seq = fastq_read.get_sequence()

            desc_items = fastq_read.identifier.split(':')

            if desc_items[7] == 'Y':
                count_of_unchaste += 1
                #print('failed chastity')
                if failed_fastq:
                    fail.write(fastq_read)
                continue

            # Filter reads with ambiguous bases
            if filter_Ns:
                countN = seq.count('N')
                if countN > 1 or (countN == 1
                                  and seq[filter_Nx - 1:filter_Nx] != 'N'):
                    #print('failed Ns',infile)
                    count_of_Ns += 1
                    if failed_fastq:
                        fail.write(fastq_read)
                    continue

            # Filter reads below first 50 base quality
            if filter_first50:
                first50 = 50
                first50_maxQ = 30
                first50_maxQ_count = 34

                quals = fastq_read.get_decimal_quality_scores()[:first50]
                count_lt30 = 0

                for q in quals:
                    if q < first50_maxQ:
                        count_lt30 += 1
                if count_lt30 >= first50_maxQ_count:
                    #print('failed first50')
                    if failed_fastq:
                        fail.write(fastq_read)
                    count_of_first50 += 1
                    continue

            ##### END CHASTITY #####################
            ############################################################################################
            ##### START Btails CODE ################
            quality_list = fastq_read.get_decimal_quality_scores()
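            # Slide a window of window_size across the quality scores from the 5' and/or
            # 3' end (trim_ends is a string such as '53'); the read is cut at the first
            # window whose aggregated score satisfies score_comparison against
            # quality_score, as decided by exclude_and_compare().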

            for trim_end in trim_ends:

                if trim_end == '5':
                    lwindow_position = 0  #left position of window
                    while True:
                        if lwindow_position >= len(quality_list):
                            fastq_read.sequence = ''
                            fastq_read.quality = ''
                            break
                        if self.exclude_and_compare(
                                action, quality_list[
                                    lwindow_position:lwindow_position +
                                    window_size], score_comparison,
                                quality_score, exclude_window_indexes):
                            fastq_read = fastq_read.slice(
                                lwindow_position, None)
                            break
                        lwindow_position += window_step
                else:
                    rwindow_position = len(
                        quality_list)  #right position of window
                    while True:
                        lwindow_position = rwindow_position - window_size  #left position of window
                        if rwindow_position <= 0 or lwindow_position < 0:
                            fastq_read.sequence = ''
                            fastq_read.quality = ''
                            break
                        if self.exclude_and_compare(
                                action, quality_list[
                                    lwindow_position:rwindow_position],
                                score_comparison, quality_score,
                                exclude_window_indexes):
                            fastq_read = fastq_read.slice(
                                None, rwindow_position)
                            break
                        rwindow_position -= window_step

            ######## END Btails CODE ###############################
            ############################################################################################
            # put  length/trim/clip code here
            quality_list = fastq_read.get_decimal_quality_scores()

            if filter_length:
                if len(quality_list) < filter_length:
                    print('failed length')
                    if failed_fastq:
                        fail.write(fastq_read)
                    continue

            # Trim initial bases -- remove first 10 bases from read 2
            if clip_length:
                # remove from the front:
                fastq_read = fastq_read.slice(clip_length, None)
                count_of_trimmed += 1

            # Trim to max length -- read 2 trim to 90.
            if trim_length:
                if len(quality_list) > trim_length:
                    # remove from the end:
                    fastq_read = fastq_read.slice(
                        None,
                        len(fastq_read.get_sequence()) - trim_length)
                    count_of_trimmed += 1

            if keep_zero_length or len(fastq_read):
                out.write(fastq_read)
            else:
                num_reads_excluded += 1
        out.close()
        if failed_fastq:
            fail.close()
        print("file:", infile)
        print('count_of_trimmed             (for length):', count_of_trimmed)
        print('count_of_first50 (avg first50 quals < 34):', count_of_first50)
        print("count_of_unchaste             ('Y' in id):", count_of_unchaste)
        print('count_of_Ns                (reads with N):', count_of_Ns)
        if num_reads is None:
            print("No valid FASTQ reads could be processed.")
        else:
            print("%i FASTQ reads were processed." % (num_reads + 1))
        if num_reads_excluded:
            print("%i reads of zero length were excluded from the output." %
                  num_reads_excluded)

        return out_filename
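    # A minimal usage sketch for trim_by_quality() (file name and thresholds are
    # illustrative; the real values come from the run configuration):
    #
    #   filtered_name = self.trim_by_quality(infile='lane_1.fastq', format='sanger',
    #                                        wsize=10, wstep=1, trim_ends='53',
    #                                        agg_action='min', qual_score=20,
    #                                        filter_Ns=True, filter_first50=True,
    #                                        length=75, clip=10, failed_fastq=True)
    #
    # which would write 'lane_1.filtered.fastq' (plus a '.failed' file) to self.outdir.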
    def convert_csv_to_ini(self, new_ini_file):
        #print(self.args)
        from pipeline.get_ini import readCSV

        print('CSV path', self.general_config_dict['csvPath'])
        my_csv = readCSV(file_path = self.general_config_dict['csvPath'])

        content     = my_csv.read_csv()
        headers     = content[1].keys()
        headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers]
        projects = {}
        #print
        #print(content[1])
        #print
        # get list of keys
        keys_list = []
        if self.check_headers(headers_clean):
            logger.info("CSV headers okay")
            for k,values in content.items():
                keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane'])

        fh = open(new_ini_file,'w')
        # general section
        fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n")
        fh.write("[general]\n")
        fh.write("run = "+self.general_config_dict['run']+"\n")
        fh.write("configPath = "+new_ini_file+"\n")

        fh.write("configPath_orig = " + self.general_config_dict['configPath']+"\n")
        fh.write("platform = " + self.general_config_dict['platform']+"\n")
        fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n")
        #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n")
        if self.general_config_dict['platform'] in C.illumina_list:
            #fh.write("input_file_suffix = "  + self.general_config_dict['input_file_suffix']+"\n")
            fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n")
            fh.write("anchor_file = "        + self.general_config_dict['anchor_file']+"\n")
            fh.write("primer_file = "        + self.general_config_dict['primer_file']+"\n")
            fh.write("compressed = "          + str(self.general_config_dict['compressed'])+"\n")
            fh.write("do_perfect = "          + str(self.general_config_dict['do_perfect'])+"\n")
            fh.write("lane_name = "          + str(self.general_config_dict['lane_name'])+"\n")
            fh.write("database_host = "          + self.general_config_dict['database_host']+"\n")
            fh.write("database_name = "          + self.general_config_dict['database_name']+"\n")

        fh.write("input_dir = "          + self.general_config_dict['input_dir']+"\n")
        fh.write("require_distal = "     + str(self.general_config_dict['require_distal'])+"\n")
        fh.write("use_cluster = "              + str(self.general_config_dict['use_cluster'])+"\n")
        fh.write("date = "              + str(datetime.date.today())+"\n")
        fh.write("site = "              + self.general_config_dict['site']+"\n")
        fh.write("load_vamps_database = " + str(self.general_config_dict['load_vamps_database'])+"\n")
        fh.write("idx_keys = "           +','.join(keys_list)+"\n")
        if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '':
            file_list = self.get_input_files()
            fh.write("input_files = "     + ','.join(file_list)+"\n")
        else:
            fh.write("input_files = \n")
        #fh.write(getattr(args,'force_runkey', ""))

        for k, values in content.items():
            fh.write("\n")
            if self.general_config_dict['platform'] in C.illumina_list:
                fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n")
            elif self.general_config_dict['platform'] == '454':
                fh.write("["+values['lane']+"_"+values['run_key']+"]\n")

            for v in values:
                if v == "env_sample_source":
                    try:
                        new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0]
                    except:
                        text = """There was an error in env_sample_source. Please check your metadata.
Possible values:
-----------
air
extreme habitat
host associated
human associated
human-amniotic-fluid
human-blood
human-gut
human-oral
human-skin
human-urine
human-vaginal
indoor
microbial mat/biofilm
miscellaneous_natural_or_artificial_environment
plant associated
sediment
soil/sand
unknown
wastewater/sludge
water-freshwater
water-marine
-----------
"""
                        print(text)
                        raise
                    fh.write("env_sample_source_id = "+new_val+"\n")
                else:
                    fh.write(v+" = "+values[v]+"\n")

        fh.close()

        return new_ini_file
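    # The generated INI has a [general] section followed by one section per sample,
    # named <barcode_index>_<run_key>_<lane> for illumina runs (or <lane>_<run_key>
    # for 454). For example (values illustrative):
    #
    #   [general]
    #   run = 20130601
    #   platform = illumina
    #   idx_keys = ATCACG_NNNNACGCA_1,TTAGGC_NNNNACGCA_1
    #
    #   [ATCACG_NNNNACGCA_1]
    #   barcode_index = ATCACG
    #   run_key = NNNNACGCA
    #   lane = 1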
    def convert_csv_to_ini(self, new_ini_file):
        #print self.args
        from pipeline.get_ini import readCSV
        
        print 'CSV path', self.general_config_dict['csvPath']
        my_csv = readCSV(file_path = self.general_config_dict['csvPath'])
        
        content     = my_csv.read_csv()
        headers     = content[1].keys()
        headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers]
        projects = {}
        #print
        #print content[1]
        #print
        # get list of keys
        keys_list = []
        if self.check_headers(headers_clean):
            logger.info("CSV headers okay")
            for k,values in content.iteritems():
                keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane'])
        
        fh = open(new_ini_file,'w')
        # general section
        fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n")  
        fh.write("[general]\n") 
        fh.write("run = "+self.general_config_dict['run']+"\n")
        fh.write("configPath = "+new_ini_file+"\n")
        
        fh.write("configPath_orig = " + self.general_config_dict['configPath']+"\n")
        fh.write("platform = " + self.general_config_dict['platform']+"\n")
        fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n")
        #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n")
        if self.general_config_dict['platform'] == 'illumina':
            #fh.write("input_file_suffix = "  + self.general_config_dict['input_file_suffix']+"\n")
            fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n")
            fh.write("anchor_file = "        + self.general_config_dict['anchor_file']+"\n")
            fh.write("primer_file = "        + self.general_config_dict['primer_file']+"\n")
            fh.write("compressed = "          + str(self.general_config_dict['compressed'])+"\n")
            fh.write("do_perfect = "          + str(self.general_config_dict['do_perfect'])+"\n")
            fh.write("lane_name = "          + str(self.general_config_dict['lane_name'])+"\n")            
            fh.write("database_host = "          + self.general_config_dict['database_host']+"\n")
            fh.write("database_name = "          + self.general_config_dict['database_name']+"\n")
            
        fh.write("input_dir = "          + self.general_config_dict['input_dir']+"\n")
        fh.write("require_distal = "     + str(self.general_config_dict['require_distal'])+"\n")
        fh.write("use_cluster = "              + str(self.general_config_dict['use_cluster'])+"\n")
        fh.write("date = "              + str(datetime.date.today())+"\n")
        fh.write("site = "              + self.general_config_dict['site']+"\n")
        fh.write("load_vamps_database = " + str(self.general_config_dict['load_vamps_database'])+"\n")
        fh.write("idx_keys = "           +','.join(keys_list)+"\n")
        if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '':
            file_list = self.get_input_files()
            fh.write("input_files = "     + ','.join(file_list)+"\n") 
        else:
            fh.write("input_files = \n") 
        #fh.write(getattr(args,'force_runkey', ""))        
 
        for k, values in content.iteritems():
            fh.write("\n")
            if self.general_config_dict['platform'] == 'illumina':
                fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n")
            elif self.general_config_dict['platform'] == '454':
                fh.write("["+values['lane']+"_"+values['run_key']+"]\n")
            
            for v in values:
                if v == "env_sample_source":
                    try:
                        new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0]
                    except:
                        print """There was an error in env_sample_source. Please check your metadata. 
Possible values:  
-----------
air
extreme habitat
host associated
human associated
human-amniotic-fluid
human-blood
human-gut
human-oral
human-skin
human-urine
human-vaginal
indoor
microbial mat/biofilm
miscellaneous_natural_or_artificial_environment
plant associated
sediment
soil/sand
unknown
wastewater/sludge
water-freshwater
water-marine
-----------
"""
                        raise
                    fh.write("env_sample_source_id = "+new_val+"\n")
                else:
                    fh.write(v+" = "+values[v]+"\n")
                
        fh.close()
        
        return new_ini_file 
def chimera(runobj):
    chimera_cluster_ids = []
    logger.debug("Starting Chimera Checker")
    # let's read the trim status file out here and keep those details out of the Chimera code
    idx_keys = get_keys(runobj)
    # new_lane_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"]
    # Open run STATUS File here.
    # open in append mode because we may start the run in the middle
    # say at the gast stage and don't want to overwrite.
    # if we re-run trimming we'll get two trim status reports
    runobj.run_status_file_h = open(runobj.run_status_file_name, "a")

    mychimera = Chimera(runobj)
    logger.debug("\nStarting DeNovo Chimera")
    c_den = mychimera.chimera_denovo()
    logger.debug("Ending DeNovo Chimera")
    if c_den[0] == 'SUCCESS':
        chimera_cluster_ids += c_den[2]  # add a list to a list
        logger.debug("chimera_cluster_ids: " + ' '.join(chimera_cluster_ids))
        chimera_code = 'PASS'
    elif c_den[0] == 'NOREGION':
        chimera_code = 'NOREGION'
    elif c_den[0] == 'FAIL':
        chimera_code = 'FAIL'
    else:
        chimera_code = 'FAIL'

    logger.debug("Chimera DeNovo Code: " + chimera_code)
    logger.debug("\nStarting Reference Chimera")
    c_ref = mychimera.chimera_reference()

    if c_ref[0] == 'SUCCESS':
        chimera_cluster_ids += c_ref[2]
        chimera_code = 'PASS'
    elif c_ref[0] == 'NOREGION':
        chimera_code = 'NOREGION'
    elif c_ref[0] == 'FAIL':
        chimera_code = 'FAIL'
    else:
        chimera_code = 'FAIL'

    # logger.debug(chimera_cluster_ids)
    runobj.chimera_status_file_h = open(runobj.chimera_status_file_name, "w")
    if chimera_code == 'PASS':

        if runobj.use_cluster:
            chimera_cluster_code = wait_for_cluster_to_finish(
                chimera_cluster_ids)
            if chimera_cluster_code[0] == 'SUCCESS':
                logger.info("Chimera checking finished successfully")
                runobj.chimera_status_file_h.write("CHIMERA SUCCESS\n")
                runobj.run_status_file_h.write("CHIMERA SUCCESS\n")

            else:
                logger.info("3-Chimera checking Failed")
                runobj.chimera_status_file_h.write(
                    "3-CHIMERA ERROR: " + str(chimera_cluster_code[1]) + " " +
                    str(chimera_cluster_code[2]) + "\n")
                runobj.run_status_file_h.write("3-CHIMERA ERROR: " +
                                               str(chimera_cluster_code[1]) +
                                               " " +
                                               str(chimera_cluster_code[2]) +
                                               "\n")
                sys.exit("3-Chimera checking Failed")
        else:
            chimera_cluster_code = ['SUCCESS', 'Not using cluster']
            logger.info("Chimera checking finished without using cluster")
            runobj.chimera_status_file_h.write("CHIMERA SUCCESS--no cluster\n")
            runobj.run_status_file_h.write("CHIMERA SUCCESS--no cluster\n")
    elif chimera_code == 'NOREGION':
        logger.info("No regions found that need chimera checking")
        runobj.chimera_status_file_h.write("CHIMERA CHECK NOT NEEDED\n")
        runobj.run_status_file_h.write("CHIMERA CHECK NOT NEEDED\n")

    elif chimera_code == 'FAIL':
        logger.info("1-Chimera checking Failed")
        runobj.chimera_status_file_h.write("1-CHIMERA ERROR: \n")
        runobj.run_status_file_h.write("1-CHIMERA ERROR: \n")
        sys.exit("1-Chimera Failed")
    else:
        logger.info("2-Chimera checking Failed")
        runobj.chimera_status_file_h.write("2-CHIMERA ERROR: \n")
        runobj.run_status_file_h.write("2-CHIMERA ERROR: \n")
        sys.exit("2-Chimera checking Failed")

    sleep(2)

    if chimera_code == 'PASS' and chimera_cluster_code[0] == 'SUCCESS':
        logger.info("Writing Chimeras to deleted files")
        mychimera.write_chimeras_to_deleted_file()

        # should also recreate fasta
        # then read chimera files and place (or replace) any chimeric read_id
        # into the deleted file.

        mymblutils = MBLPipelineFastaUtils(idx_keys, runobj)

        # write new cleaned files that remove chimeras if appropriate
        # these are in fasta_mbl_pipeline.py
        # the cleaned files are renamed to the original names:
        # lane_key.unique.fa
        # lane_key.trimmed.fa
        # lane_key.names        --
        # lane_key.abund.fa     -- this file is for the uclust chimera script
        # lane_key.deleted.txt  -- no change in this file
        # THE ORDER IS IMPORTANT HERE:
        mymblutils.write_clean_fasta_file()
        mymblutils.write_clean_names_file()
        mymblutils.write_clean_uniques_file()
        mymblutils.write_clean_abundance_file()
def gast(runobj):
    logger.info("STARTING GAST()")
    #     logger.info("vsearch version: " % utils.get_vsearch_version)
    # for vamps 'new_lane_keys' will be prefix
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    # Should return a list not a string
    idx_keys = get_keys(runobj)

    # get GAST object
    mygast = Gast(runobj, idx_keys)

    # Check for unique files and create them if not there
    result_code = mygast.check_for_unique_files(idx_keys)
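    # Each Gast step below is assumed to return a dict with at least 'status'
    # ('SUCCESS' or 'ERROR') and 'message' keys; the dict is appended to the run
    # status file as a JSON line.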
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("uniques not found failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR",
                                     "uniques file not found - failed")
        sys.exit("uniques not found failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])

    sleep(5)

    # CLUSTERGAST
    result_code = mygast.clustergast()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("clutergast failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR",
                                     "clustergast failed")
        sys.exit("clustergast failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])

    sleep(5)

    # GAST_CLEANUP
    result_code = mygast.gast_cleanup()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast_cleanup failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR",
                                     "gast_cleanup failed")
        sys.exit("gast_cleanup failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])

    sleep(5)

    # GAST2TAX
    result_code = mygast.gast2tax()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast2tax failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR",
                                     "gast2tax failed")
        sys.exit("gast2tax failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])
    parser.add_argument('-file_base',               required=True, action="store",   dest = "file_base", 
                                                    help = 'where the files are located')
                                                    
## optional       
    parser.add_argument("-dna_region",       required=False,  action="store",   dest = "dna_region", default='unknown',
                                                    help="") 
    parser.add_argument("-domain",       required=False,  action="store",   dest = "domain", default='unknown',
                                                    help="")                                                 
    parser.add_argument('-d', '--dataset',      required=False, action="store",   dest = "dataset",  
                                                    help = '')                                                 
    parser.add_argument("-p", "--project",      required=False,  action="store",   dest = "project", 
                                                    help="")                                                   
                                                              
                                                        
    
    logger.info("Starting vamps_load.py")
    args = parser.parse_args()
    
    data_object['infile'] = args.infile
    data_object['datetime'] = str(datetime.date.today())
    data_object['type'] = args.type
    data_object['runcode'] = args.runcode
    data_object['site'] =  args.site
    data_object['user'] =  args.user
    data_object['file_base'] =  args.file_base
    data_object['file_type'] =  args.file_type
    
    if args.dna_region:
        dna_region = args.dna_region
    data_object['dna_region'] = dna_region
    
    def chimera_reference(self):
     
        chimera_region_found = False
        output = {}
        cluster_id_list = []
        for idx_key in self.run_keys:
             
            dna_region  = self.runobj.samples[idx_key].dna_region
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            else:
                if dna_region in C.regions_to_chimera_check:
                    chimera_region_found = True
                else:
                    logger.debug('region not checked: ' + dna_region)                    
                    continue

             
            input_file_name  = os.path.join(self.indir,  idx_key +'.abund.fa') 
            output_file_name    = os.path.join(self.outdir,idx_key+".chimera.ref") 
            #open(output_file_name, 'a').close()  # make sure file exists
            log_file = os.path.join(self.outdir,idx_key+".ref.log") 
            logger.debug("OUT FILE NAME: " + output_file_name)     
             
            #out_file_name = self.prefix[idx_key] + ".chimeras.db"      
            # which ref db to use?
            ref_db = ''
            if dna_region.upper() == 'ITS':
                logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
                ref_db = self.its_refdb
            else:
                logger.debug("using standard refdb: " + self.refdb)
                ref_db = self.refdb
                 
            uchime_cmd = C.clusterize_cmd
            uchime_cmd += " "
            uchime_cmd += " -log "
            uchime_cmd += log_file
            uchime_cmd += " "
            uchime_cmd += self.usearch_cmd
            uchime_cmd += " -uchime_ref "
            uchime_cmd += input_file_name
            uchime_cmd += " -uchimeout "
            uchime_cmd += output_file_name
            uchime_cmd += " -db "
            uchime_cmd += ref_db
            uchime_cmd += " -strand "
            uchime_cmd += "plus"

            logger.debug("uchime_ref_cmd = %s" % (uchime_cmd))  
                          
            try:
                logger.info("chimera reference command: " + str(uchime_cmd))
                output[idx_key] = subprocess.check_output(uchime_cmd, shell=True)
                #print 'outsplit',output[idx_key].split()[2]
                cluster_id_list.append(output[idx_key].split()[2])
                #print 'Have %d bytes in output' % len(output)
                #print 'ref',idx_key,output,len(output)
                if len(output[idx_key]) < 50 and len(output[idx_key]) > 40:
                    logger.debug(idx_key + " uchime ref seems to have been submitted successfully")                    
                else:
                    print >>sys.stderr, "uchime ref may be broke"
                
            except OSError, e:
                print >>sys.stderr, "Execution of chimera_reference failed: %s" % (uchime_cmd, e)
                raise
    def load_database(self,key,out_gast_dir, file_collector):
        """
        
        """
        logger.info("Starting load VAMPS data")
        print "Starting load VAMPS data"

        # USER: vamps_db_tables
        if self.runobj.vamps_user_upload:
            user = self.runobj.user
            project = self.runobj.project
            data_cube_table     = C.database_tables['vamps_user_uploads']['tax_dc_tbl']
            summed_cube_table   = C.database_tables['vamps_user_uploads']['tax_summed_tbl']
            taxonomy_table      = C.database_tables['vamps_user_uploads']['tax_tbl']
            sequences_table     = C.database_tables['vamps_user_uploads']['sequences_tbl']
            export_table        = C.database_tables['vamps_user_uploads']['export_tbl']
            datasets_table      = C.database_tables['vamps_user_uploads']['datasets_tbl']
            users_table         = C.database_tables['vamps_user_uploads']['users_tbl']
        else:    
            if self.runobj.platform == 'illumina':
                user = self.runobj.samples[key].data_owner
                project = self.runobj.samples[key].project
                data_cube_table     = C.database_tables['vamps_mbl_origin']['tax_dc_tbl']
                summed_cube_table   = C.database_tables['vamps_mbl_origin']['tax_summed_tbl']
                taxonomy_table      = C.database_tables['vamps_mbl_origin']['tax_tbl']
                sequences_table     = C.database_tables['vamps_mbl_origin']['sequences_tbl']
                export_table        = C.database_tables['vamps_mbl_origin']['export_tbl']
                datasets_table      = C.database_tables['vamps_mbl_origin']['datasets_tbl']
                users_table         = C.database_tables['vamps_mbl_origin']['users_tbl']
            elif self.runobj.platform == '454':
                pass
            else:
                pass
        info_table          = C.database_tables['vamps_mbl_origin']['info_tbl']
        users_info_table    = C.database_tables['vamps_user_uploads']['info_tbl']    
        
        
        cursor = self.conn.cursor()
        
        
 
        #
        #  DATA_CUBE
        #
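        # Each non-HEADER row of taxes_file is tab separated and is expected to follow
        # the column order written by taxonomy(): project, dataset, taxon_string,
        # superkingdom, phylum, class, orderx, family, genus, species, strain,
        # rank, knt, frequency, dataset_count, classifier.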
        print "loading "+key+": data_cube"
        if os.path.exists(file_collector['taxes_file']):
            for line in open(file_collector['taxes_file'],'r'):
                line = line.strip().split("\t")
                if line[0]=='HEADER':
                    continue
                qDataCube = "insert ignore into %s (project, dataset, taxon_string,superkingdom,phylum,class, orderx,family,genus,species,strain,\
                            rank,knt,frequency,dataset_count,classifier)\
                            VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                            % (data_cube_table,
                            line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7],
                            line[8],line[9],line[10],line[11],line[12],line[13],line[14],line[15])
                #myconn.execute_no_fetch(qDataCube)
                #print qDataCube
                rows_affected = cursor.execute(qDataCube)
        else:
            print "taxes file not found for dataset "+key
            
            
        #
        # SUMMED (JUNK) DATA_CUBE
        #
        print "loading "+key+": junk_data_cube"
        if os.path.exists(file_collector['summed_taxes_file']):
            for line in open(file_collector['summed_taxes_file'],'r'):
                line = line.strip().split("\t")
                if line[0]=='HEADER':
                    continue
                #line = line[1:] # remove leading empty tab
                #taxonomy        sum_tax_counts  frequency	dataset_count   rank    project dataset project--dataset        classifier
                qSummedCube = "insert ignore into %s (taxon_string,knt, frequency, dataset_count, rank, project, dataset, project_dataset, classifier)\
                            VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                            % (summed_cube_table,
                            line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7], line[8])
                #myconn.execute_no_fetch(qSummedCube) 
                #print qSummedCube 
                cursor.execute(qSummedCube)
        else:
            print "summed taxes file not found for dataset "+key
                
        #
        #  TAXONOMY
        #
        print "loading "+key+": taxonomy"
        if os.path.exists(file_collector['distinct_taxes_file']):
            for line in open(file_collector['distinct_taxes_file'],'r'):
                line = line.strip().split("\t")
                if line[0]=='HEADER':
                    continue
                #line = line[1:] # remove leading empty tab    
                qTaxonomy = "insert ignore into %s (taxon_string,rank,num_kids)\
                            VALUES('%s','%s','%s')" \
                            % (taxonomy_table, line[0],line[1],line[2])
                #myconn.execute_no_fetch(qTaxonomy)
                cursor.execute(qTaxonomy)
        else:
            print "distinct taxes file not found for dataset "+key
        #
        #  SEQUENCES
        #
        print "loading "+key+": sequences"
        if os.path.exists(file_collector['sequences_file']):
            for line in open(file_collector['sequences_file'],'r'):
                line = line.strip().split("\t")
                if line[0]=='HEADER':
                    continue
                #line = line[1:] # remove leading empty tab
                # project dataset taxonomy        refhvr_ids	rank    seq_count frequency  distance  read_id project_dataset    
                qSequences = "insert ignore into %s (sequence,project, dataset, taxonomy,refhvr_ids,rank,seq_count,frequency,distance,rep_id, project_dataset)\
                            VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                            % (sequences_table,
                            line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7], line[8],line[9],line[10])
                #myconn.execute_no_fetch(qSequences) 
                cursor.execute(qSequences)
        else:
            print "sequences file not found for dataset "+key    
        #
        #  EXPORT
        #
        print "loading "+key+": export"
        if os.path.exists(file_collector['export_file']):
            for line in open(file_collector['export_file'],'r'):
                line = line.strip().split("\t")
                if line[0]=='HEADER':
                    continue
                #line = line[1:] # remove leading empty tab
                # t.read_id, t.project, t.dataset, g.refhvr_ids, x.distance, x.taxonomy, t.sequence, x.rank," " t.entry_date
                # project dataset taxonomy        refhvr_ids	rank    seq_count frequency  distance  read_id project_dataset    
                qSequences = "insert ignore into %s (read_id, project, dataset, refhvr_ids, distance, taxonomy, sequence, rank, date_trimmed)\
                            VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                            % (export_table,
                            line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7], line[8])
                #myconn.execute_no_fetch(qSequences) 
                cursor.execute(qSequences)
        else:
            print "export file not found for dataset "+key      
        #
        #  PROJECTS_DATASETS
        #
        print "loading "+key+": projects_datasets"
        if os.path.exists(file_collector['projects_datasets_file']):
            for line in open(file_collector['projects_datasets_file'],'r'):
                line = line.strip().split("\t")
                # [1:]  # split and remove the leading 'zero'
                if line[0]=='HEADER':
                    continue
                
                qDatasets = "insert ignore into %s (project, dataset, dataset_count,has_tax,date_trimmed,dataset_info)\
                            VALUES('%s','%s','%s','%s','%s','%s')" \
                            % (datasets_table,
                            line[0],line[1],line[2],line[3],line[4],line[5])
                #myconn.execute_no_fetch(qDatasets) 
                cursor.execute(qDatasets)
                
                qDatasets = "update %s set has_tax='1' where project='%s'" \
                            % (datasets_table, line[0])
                #myconn.execute_no_fetch(qDatasets)
                cursor.execute(qDatasets)
        else:
            print "project_datasets file not found for dataset "+key      
        #
        # INFO
        #
        print "loading "+key+": info"
        if os.path.exists(file_collector['project_info_file']):
            for line in open(file_collector['project_info_file'],'r'):
                line = line.strip().split("\t")
                #[1:]  # split on tab and remove the leading 'zero'
                if line[0]=='HEADER':
                    continue
                
                qInfo = "insert ignore into %s (project_name, title, description, contact, email, institution, user, env_source_id)\
                            VALUES('%s','%s','%s','%s','%s','%s','%s','%s')" \
                            % (users_info_table,
                            line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7])
                #myconn.execute_no_fetch(qInfo) 
                cursor.execute(qInfo)
                
                qInfo = "update %s set has_tax='1' where project_name='%s'" \
                            % (users_info_table, line[0])
                #myconn.execute_no_fetch(qInfo) 
                cursor.execute(qInfo)
        else:
            print "upload_info file not found for dataset "+key          
        #
        # USERS
        #
        print "loading users:"+key
        
        qUser = "******" \
                    % (users_table, project, user)
        #myconn.execute_no_fetch(qUser) 
        cursor.execute(qUser)
             
            
        logger.info("Finished load VAMPS data")
        
        self.conn.commit()
        cursor.close()
        
    def trim_by_quality(self, infile=None,
                        format='sanger',        wsize=1,        wstep=1,            trim_ends='53',
                        agg_action='min',       exc_count=0,    score_comp='>=',    qual_score=0,
                        filter_first50=False,   filter_Ns=False,filter_Nx=0,        failed_fastq=False,
                        length=0,               trim=0,         clip=0,             keep_zero_length=False):
        #format
        window_size         = wsize
        window_step         = wstep
        #trim_ends
        aggregation_action  = agg_action
        exclude_count       = exc_count
        score_comparison    = score_comp
        quality_score       = qual_score
        filter_length       = length
        trim_length         = trim
        clip_length         = clip
        if not infile:
            sys.exit( "illumina_fastq_trimmer: Need to specify an input file" )

        if window_size < 1:
            sys.exit( 'illumina_fastq_trimmer: You must specify a strictly positive window size' )

        if window_step < 1:
            sys.exit( 'illumina_fastq_trimmer: You must specify a strictly positive step size' )

        print("\nRunning illumina Filtering")

        in_filepath = os.path.join(self.indir,infile)
        try:
            filebase    = infile.split('/')[1].split('.')[0]
        except:
            filebase    = infile.split('.')[0]

        out_filename    = filebase+".filtered.fastq"
        out_filepath    = os.path.join(self.outdir, out_filename)




        #determine an exhaustive list of window indexes that can be excluded from aggregation
        exclude_window_indexes = []
        last_exclude_indexes = []
        for exclude_count in range( min( exclude_count, window_size ) ):
            if last_exclude_indexes:
                new_exclude_indexes = []
                for exclude_list in last_exclude_indexes:
                    for window_index in range( window_size ):
                        if window_index not in exclude_list:
                            new_exclude = sorted( exclude_list + [ window_index ] )
                            if new_exclude not in exclude_window_indexes + new_exclude_indexes:
                                new_exclude_indexes.append( new_exclude )
                exclude_window_indexes += new_exclude_indexes
                last_exclude_indexes = new_exclude_indexes
            else:
                for window_index in range( window_size ):
                    last_exclude_indexes.append( [ window_index ] )
                exclude_window_indexes = list( last_exclude_indexes )
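        # For example, with window_size=3 and exc_count=2 the loop above yields
        # exclude_window_indexes = [[0], [1], [2], [0, 1], [0, 2], [1, 2]]:
        # every combination of up to two window positions that may be ignored
        # when a window's quality scores are aggregated.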
        out = fastqWriter( open( out_filepath, 'wb' ), format = format )
        action = ACTION_METHODS[ aggregation_action ]
        if failed_fastq:
            fail = fastqWriter( open( out_filepath+'.failed', 'wb' ), format = format )
        num_reads = None
        num_reads_excluded = 0
        count_of_unchaste = 0
        count_of_trimmed  = 0
        count_of_first50  = 0
        count_of_Ns  = 0
        if self.runobj.compressed:
            import gzip
            try:
                logger.info( "illumina_filtering: opening compressed file: "+in_filepath)
                fp = gzip.open( in_filepath )
            except:
                logger.info( "illumina_filtering: opening uncompressed file: "+in_filepath)
                fp = open( in_filepath )
        else:
            logger.info(  "illumina_filtering: opening uncompressed file: "+in_filepath)
            fp = open( in_filepath )
        for num_reads, fastq_read in enumerate( fastqReader( fp, format = format ) ):
            ############################################################################################
            # Put chastity code here
            #print(fastq_read.identifier)
            seq = fastq_read.get_sequence()

            desc_items = fastq_read.identifier.split(':')

            if desc_items[7] == 'Y':
                count_of_unchaste += 1
                #print('failed chastity')
                if failed_fastq:
                    fail.write( fastq_read )
                continue

            # Filter reads with ambiguous bases
            if filter_Ns:
                countN = seq.count('N')
                if countN > 1 or (countN == 1 and seq[filter_Nx-1:filter_Nx] != 'N'):
                    #print('failed Ns',infile)
                    count_of_Ns += 1
                    if failed_fastq:
                        fail.write( fastq_read )
                    continue



            # Filter reads below first 50 base quality
            if filter_first50:
                first50 = 50
                first50_maxQ = 30
                first50_maxQ_count = 34

                quals = fastq_read.get_decimal_quality_scores()[:first50]
                count_lt30 = 0

                for q in quals:
                    if q < first50_maxQ:
                        count_lt30 += 1
                if count_lt30 >= first50_maxQ_count:
                    #print('failed first50')
                    if failed_fastq:
                        fail.write( fastq_read )
                    count_of_first50 += 1
                    continue

            ##### END CHASTITY #####################
            ############################################################################################
            ##### START Btails CODE ################
            quality_list = fastq_read.get_decimal_quality_scores()

            for trim_end in trim_ends:


                if trim_end == '5':
                    lwindow_position = 0 #left position of window
                    while True:
                        if lwindow_position >= len( quality_list ):
                            fastq_read.sequence = ''
                            fastq_read.quality = ''
                            break
                        if self.exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + window_size ], score_comparison, quality_score, exclude_window_indexes ):
                            fastq_read = fastq_read.slice( lwindow_position, None )
                            break
                        lwindow_position += window_step
                else:
                    rwindow_position = len( quality_list ) #right position of window
                    while True:
                        lwindow_position = rwindow_position - window_size #left position of window
                        if rwindow_position <= 0 or lwindow_position < 0:
                            fastq_read.sequence = ''
                            fastq_read.quality = ''
                            break
                        if self.exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position ], score_comparison, quality_score, exclude_window_indexes ):
                            fastq_read = fastq_read.slice( None, rwindow_position )
                            break
                        rwindow_position -= window_step

            ######## END Btails CODE ###############################
            ############################################################################################
            # put  length/trim/clip code here
            quality_list = fastq_read.get_decimal_quality_scores()

            if filter_length:
                if len(quality_list) < filter_length:
                    print('failed length')
                    if failed_fastq:
                        fail.write( fastq_read )
                    continue

            # Trim initial bases -- remove first 10 bases from read 2
            if clip_length:
                # remove from the front:
                fastq_read = fastq_read.slice( clip_length, None )
                count_of_trimmed += 1

            # Trim to max length -- read 2 trim to 90.
            if trim_length:
                if len(quality_list) > trim_length:
                    # remove from the end:
                    fastq_read = fastq_read.slice( None, len(fastq_read.get_sequence()) - trim_length )
                    count_of_trimmed += 1


            if keep_zero_length or len( fastq_read ):
                out.write( fastq_read )
            else:
                num_reads_excluded += 1
        out.close()
        if failed_fastq:
            fail.close()
        print("file:",infile)
        print('count_of_trimmed             (for length):', count_of_trimmed)
        print('count_of_first50 (avg first50 quals < 34):', count_of_first50)
        print("count_of_unchaste             ('Y' in id):", count_of_unchaste)
        print('count_of_Ns                (reads with N):', count_of_Ns)
        if num_reads is None:
            print("No valid FASTQ reads could be processed.")
        else:
            print("%i FASTQ reads were processed." % ( num_reads + 1 ))
        if num_reads_excluded:
            print("%i reads of zero length were excluded from the output." % num_reads_excluded)

        return out_filename
                                                 help="user name")  
 
 ## optional       
 parser.add_argument("-nd", "--no_distal",       required=False,  action='store_false', dest = "require_distal", 
                                                 default=True,    help="") 
 parser.add_argument('-min',"--minlength",       required=False, action="store",   dest = "minlength", 
                                                 help = '')                                                 
 parser.add_argument("-max","--maxlength",         required=False,  action="store",   dest = "maxlength", 
                                                 help="")             
 parser.add_argument("-file_type",               required=True,  action="store",   dest = "file_type", default='fasta',
                                                 help="sff, fasta or fastq")                                                     
 parser.add_argument('-file_base',               required=True, action="store",   dest = "file_base", 
                                                 help = '') 
 parser.add_argument("-cl", "--use_cluster",     required=False,  action="store",   dest = "use_cluster", default=True,
                                                     help = '')                                                     
 logger.info("Starting vamps_trim.py")
 args = parser.parse_args()
 
 data_object['datetime'] = str(datetime.date.today())
 
 data_object['runcode'] = args.runcode
 data_object['site'] =   args.site
 data_object['user'] =   args.user
 data_object['require_distal'] = args.require_distal
 data_object['use_cluster']      = args.use_cluster
 if data_object['use_cluster'] == 'True' or data_object['use_cluster'] ==  'true':
     data_object['use_cluster'] = True
 else:
     data_object['use_cluster'] = False
 if args.minlength:
     minlength = args.minlength
        sys.exit("unknown platform - Exiting")

    v = MetadataUtils(command_line_args=args)

    # this will read the args and ini file and return a dictionary

    data_object = v.validate_args()
    #    for attr in dir(data_object):
    #        print("obj.%s = %s" % (attr, getattr(data_object, attr)))

    # set logging

    print("\nLog Level set to:", args.loglevel)
    logger.setLevel(args.loglevel.upper())

    logger.info("Starting pipeline")
    ##############
    #
    #  Test cl parameters
    #
    ##############
    # CL RULES:
    # for ini file:  (no plurals)
    # 1) CL: input_dir ONLY shall be supplied on CL - no input filenames
    #
    # 2) All input files should be in the same directory AND of the same format
    #
    # 3) Supply a input_file_suffix on the CL if there are varying file types in the
    #       input_dir and you only are using some (default will read all files)
    # 4)
    #
 def gast_cleanup(self):
     """
     gast_cleanup - follows clustergast, explodes the data and copies to gast_concat and gast files
     """
     logger.info("Starting GAST Cleanup")
     self.runobj.run_status_file_h.write("Starting gast_cleanup\n")
     for key in self.idx_keys:
         output_dir = os.path.join(self.basedir,key)
         gast_dir = os.path.join(output_dir,'gast')
         if key in self.runobj.samples:
             dna_region = self.runobj.samples[key].dna_region
         else:            
             dna_region = self.runobj.dna_region
         if not dna_region:
             logger.error("gast_cleanup: We have no DNA Region: Setting dna_region to 'unknown'")
             self.runobj.run_status_file_h.write("gast_cleanup: We have no DNA Region: Setting dna_region to 'unknown'\n")
             dna_region = 'unknown'
         # find gast_dir
         
         
         # for vamps user upload
         # basedir is like avoorhis_3453211
         # and outdir is like avoorhis_3453211/2012-06-25
         # for MBL pipeline
         # basedir is like 1_AGTCG
         # and outdir is like 1_AGTCG/2012-06-25
         unique_file = 'Not Found'
         names_file  = 'Not Found'
         if self.runobj.platform == 'vamps':    
             unique_file = os.path.join(output_dir, key+'.unique.fa')
             names_file = os.path.join(output_dir,key+'.names')
         elif self.runobj.platform == 'illumina':
             file_prefix = self.runobj.samples[key].file_prefix
             unique_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique")
             names_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique.names")
         else:
             pass
         print 'UNIQUE FILE',unique_file
         
         
         #print 'names file',names_file
         
         if not os.path.exists(gast_dir):
             logger.error("Could not find gast directory: "+gast_dir+" Exiting")
             sys.exit()
         clustergast_filename_single   = os.path.join(gast_dir, "gast"+dna_region)
         
         logger.debug('gast filesize:'+str(os.path.getsize(clustergast_filename_single)))
         
         gast_filename          = os.path.join(gast_dir, "gast")
         gastconcat_filename    = os.path.join(gast_dir, "gast_concat")  
         #dupes_filename    = os.path.join(gast_dir, "dupes") 
         #nonhits_filename    = os.path.join(gast_dir, "nonhits")   
         copies = {}
         nonhits = {}
         # open and read names file
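          # Each names-file line is expected to look like
          #   <representative_read_id><TAB><comma,separated,duplicate,ids>
          # so copies maps a representative read to all of its duplicates.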
         names_fh = open(names_file,'r')
         for line in names_fh:
             s = line.strip().split("\t")
             
             index_read = s[0]                
             copies[index_read] = s[1].split(',')
             
             if index_read in nonhits:
                 nonhits[index_read] += 1
             else:
                 nonhits[index_read] = 1
                 
             
             
         names_fh.close()            
         #print nonhits
         #print copies
         
         #######################################
         # 
         #  Insert records with valid gast hits into gast_file
         # 
         #######################################   
         # read the .gast file from clustergast            
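          # Each clustergast line parsed below is expected to hold four whitespace
          # separated fields: read_id, refhvr_id (possibly 'id|extra'), distance, alignment.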
         concat = {}
         gast_fh     = open(gast_filename,'w')
         if(os.path.exists(clustergast_filename_single)):
             in_gast_fh  = open(clustergast_filename_single,'r')
         else:
             print "No clustergast file found:",clustergast_filename_single,"\nExiting"
             self.runobj.run_status_file_h.write("No clustergast file found:",clustergast_filename_single," Exiting\n")
             sys.exit()
         for line in in_gast_fh:
             
             s = line.strip().split()
             if len(s) == 4:
                 read_id     = s[0]
                 refhvr_id   = s[1].split('|')[0]
                 distance    = s[2]
                 alignment   = s[3]
             #print read_id,refhvr_id
             # if this was in the gast table zero it out because it had a valid hit
             # so we don't insert them as non-hits later
             if read_id in nonhits:
                 del nonhits[read_id]
                 #print 'deleling',read_id
             #print 'nonhits',nonhits
             if read_id not in copies:
                 logger.info(read_id+' not in names file: Skipping')
                 continue
                 
             # give the same ref and dist for each duplicate
             for id in copies[read_id]:
                 
                 if id != read_id:
                     #print id,read_id,distance,refhvr_id  
                     gast_fh.write( id + "\t" + refhvr_id + "\t" + distance + "\t" + alignment + "\n" )
                     
                                            
         in_gast_fh.close()
          
         #######################################
         # 
         #  Insert a record for any valid sequence that had no blast hit and therefore no gast result
         #       into gast_filename
         # 
         #######################################   
         for read in sorted(nonhits.iterkeys()):                
             for d in copies[read]: 
                 gast_fh.write( d+"\t0\t1\t\n")
                 
                 
         gast_fh.close()
         
         # concatenate the two gast files
         clustergast_fh = open(clustergast_filename_single,'a')            
         shutil.copyfileobj(open(gast_filename,'rb'), clustergast_fh)
         clustergast_fh.close()
         #the open again and get data for gast concat
         concat = {}
         print clustergast_filename_single
         for line in open(clustergast_filename_single,'r'):
             data = line.strip().split("\t")
             id = data[0]
             refhvr_id = data[1].split('|')[0]
             distance = data[2]
             #print 'data',data
             if id in concat:
                 concat[id]['refhvrs'].append(refhvr_id)                        
             else:
                 concat[id] = {}
                 concat[id]['refhvrs'] = [refhvr_id]
             concat[id]['distance'] = distance     
             
         
         
         #######################################
         #
         # Insert records into gast_concat_filename
         #
         #######################################             
         # first we need to open the gast_filename
         gastconcat_fh     = open(gastconcat_filename,'w')
         for id, value in concat.iteritems():
             #print 'trying gastconcat', id,value
             gastconcat_fh.write( id + "\t" + concat[id]['distance'] + "\t" + ' '.join(concat[id]['refhvrs']) + "\n" )
         gastconcat_fh.close()
         
         
     print "Finished gast_cleanup"   
     logger.info("Finished gast_cleanup")
     return ("SUCCESS","gast_cleanup")
    def taxonomy(self,key, dataset_count, file_collector):
        """
        fill vamps_data_cube, vamps_junk_data_cube and vamps_taxonomy files
        """
        logger.info("Starting vamps_upload: taxonomy")
        print "Starting vamps_upload: taxonomy"
        # SUMMED create a look-up
        if self.runobj.vamps_user_upload:
            project = self.runobj.project
            dataset = key
        else:
            if self.runobj.platform == 'illumina':
                project = self.runobj.samples[key].project
                dataset = self.runobj.samples[key].dataset
            elif self.runobj.platform == '454':
                pass
            else:
                pass
            
        project = project[0].capitalize() + project[1:]
        project_dataset = project+'--'+dataset
        taxa_lookup = {}
        read_id_lookup={}
        if os.path.exists(file_collector['tagtax_file']):
            for line in  open(file_collector['tagtax_file'],'r'):
                line = line.strip()
                items = line.split("\t")
                taxa = items[1]
                if taxa[-3:] == ';NA':
                    taxa = taxa[:-3]
                read_id=items[0]
                read_id_lookup[read_id]=taxa
                
                # the count here is the number of reads in this dataset assigned to the taxon
                if taxa in taxa_lookup:                
                    taxa_lookup[taxa] += 1 
                else:
                    taxa_lookup[taxa] = 1 
                      
        #  DATA CUBE TABLE
        # taxa_lookup, e.g.: {'Unknown': 146, 'Bacteria': 11888, 'Bacteria;Chloroflexi': 101}
        # dataset_count is the total number of reads in the dataset
        # frequency = taxon count (knt) / dataset_count
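        # e.g. (hypothetical numbers) with taxa_lookup['Bacteria;Chloroflexi'] == 101 and a
        # dataset_count of 12135 reads, freq = 101 / 12135.0 (about 0.0083) and the rank
        # written is C.ranks[1] (presumably 'phylum'), because the taxon string has two levels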
        fh1 = open(file_collector['taxes_file'],'w')
        
        
        fh1.write("\t".join( ["HEADER","project", "dataset", "taxonomy", "superkingdom", 
                            "phylum", "class", "orderx", "family", "genus", "species", 
                            "strain", "rank", "knt", "frequency", "dataset_count", "classifier"]) + "\n")
        tax_collector={}
        summer=0
        for tax,knt in taxa_lookup.iteritems():
            #print tax,cnt
            summer += knt
            datarow = ['',project,dataset]
            
            taxa = tax.split(';')
            #if taxa[0] in C.domains:
            freq = float(knt) / int(dataset_count)
            rank = C.ranks[len(taxa)-1]
            for i in range(len(C.ranks)):                
                if len(taxa) <= i:
                    taxa.append(C.ranks[i] + "_NA")
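            # e.g. a two-level taxon such as 'Bacteria;Chloroflexi' is padded with
            # '<rank>_NA' entries (class_NA, orderx_NA, ... strain_NA) so every row
            # carries one column per rank in C.ranks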

            tax_collector[tax] = {}


            datarow.append(tax)
            datarow.append("\t".join(taxa))
            datarow.append(rank)
            datarow.append(str(knt))
            datarow.append(str(freq))
            datarow.append(dataset_count)
            datarow.append(self.runobj.classifier)
            
            w = "\t".join(datarow)
            #print w
            fh1.write(w+"\n")
           
            tax_collector[tax]['rank'] = rank
            tax_collector[tax]['knt'] = knt
            tax_collector[tax]['freq'] = freq
        
        fh1.close()
        
        #
        # SUMMED DATA CUBE TABLE
        #
        fh2 = open(file_collector['summed_taxes_file'],'w')
        
        fh2.write("\t".join(["HEADER","taxonomy", "sum_tax_counts", "frequency", "dataset_count","rank", 
                            "project","dataset","project--dataset","classifier"] )+"\n")
        ranks_subarray = []
        rank_list_lookup = {}
        for i in range(0, len(C.ranks)): 
            ranks_subarray.append(C.ranks[i])
            ranks_list = ";".join(ranks_subarray) # i.e., superkingdom, phylum, class
            # open data_cube file again
            # taxes_file: data_cube_uploads
            for line in  open(file_collector['taxes_file'],'r'):
                line = line.strip().split("\t")
                knt = line[12]
                taxon = line[2]
                if line[0] == 'HEADER':
                    continue
                if taxon in tax_collector:
                    knt = tax_collector[taxon]['knt']
                else:
                    print 'ERROR tax not found in tax_collector: assigning zero'
                    knt = 0
                idx = len(ranks_subarray)
                l=[]
                for k in range(3,idx+3):                    
                    l.append(line[k])
                tax = ';'.join(l)
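                # e.g. when ranks_subarray holds ['superkingdom', 'phylum'], this collapses the
                # row's per-rank columns down to a two-level prefix such as 'Bacteria;Chloroflexi'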
                #print 'rl tax',ranks_list,tax
                
                
                if tax in rank_list_lookup:
                    rank_list_lookup[tax] += knt
                else:
                    rank_list_lookup[tax] = knt
                    
                
          
        for tax,knt in rank_list_lookup.iteritems():
            
           
            
            #print 'tax2',tax
            taxa = tax.split(';')
            #if taxa[0] in C.domains:
            rank = len( taxa ) -1
            
            frequency = float(knt) / int(dataset_count)
            
            if len(tax) - len(''.join(taxa)) >= rank:
            
                datarow = ['']
                datarow.append(tax)
                datarow.append(str(knt))
                datarow.append(str(frequency))
                datarow.append(str(dataset_count))
                datarow.append(str(rank))
                datarow.append(project)
                datarow.append(dataset)
                datarow.append(project_dataset)
                datarow.append(self.runobj.classifier)
            
                w = "\t".join(datarow)
                #print w
                fh2.write(w+"\n")
                

        fh2.close()
        
        
                
        #
        # DISTINCT TAXONOMY
        #
        fh3 = open(file_collector['distinct_taxes_file'],'w')
        fh3.write("\t".join(["HEADER","taxon_string", "rank", "num_kids"] )+"\n")
        taxon_string_lookup={}
        for line in  open(file_collector['summed_taxes_file'],'r'):
            if line.split()[0] == 'HEADER':
                continue
            items = line.strip().split()            
            taxon_string = items[0]
            #print taxon_string
            if taxon_string in taxon_string_lookup:
                taxon_string_lookup[taxon_string] += 1
            else:
                taxon_string_lookup[taxon_string] = 1
        
        for taxon_string,v in taxon_string_lookup.iteritems():
            datarow = ['']
            datarow.append(taxon_string)
            taxa = taxon_string.split(';')
            if taxa[0] in C.domains:
                rank = str(len(taxa)-1)
                datarow.append(rank)
                # a taxon at the deepest rank (strain) or one ending in '_NA' has no children
                if rank == '7' or taxon_string[-3:]=='_NA':
                    num_kids = '0'
                else:
                    num_kids = '1'
                datarow.append(num_kids)
                w = "\t".join(datarow)
                #print 'w',w
                fh3.write(w+"\n")
        fh3.close()
        
        return (tax_collector,read_id_lookup)
    def clustergast(self):
        """
        clustergast - runs the GAST pipeline on the cluster.
               GAST uses UClust to identify the best matches of a read sequence
               to reference sequences in a reference database.
               VAMPS: The uniques and names files have previously been created in trim_run.py.
               Illumina :
        """
        logger.info("Starting Clustergast")
        self.runobj.run_status_file_h.write("Starting clustergast\n")
        # Step1: create empty gast table in database: gast_<rundate>
        # Step2: Count the number of sequences so the job can be split for nodes
        # $facount = `grep -c \">\" $fasta_uniqs_filename`;
        # $calcs = `/bioware/seqinfo/bin/calcnodes -t $facount -n $nodes -f 1`;

        #   /bioware/seqinfo/bin/fastasampler -n $start,$end ${gastDir}/${fasta_uniqs_filename} $tmp_fasta_filename
        #   $usearch_binary --global --query $tmp_fasta_filename --iddef 3 --gapopen 6I/1E --db $refhvr_fa --uc $tmp_usearch_filename --maxaccepts $max_accepts --maxrejects $max_rejects --id $pctid_threshold
        #   # sort the results for valid hits, saving only the ids and pct identity
        #   grep -P \"^H\\t\" $tmp_usearch_filename | sed -e 's/|.*\$//' | awk '{print \$9 \"\\t\" \$4 \"\\t\" \$10 \"\\t\" \$8}' | sort -k1,1b -k2,2gr | clustergast_tophit > $gast_filename
        #   Submit the script
        #   /usr/local/sge/bin/lx24-amd64/qsub $qsub_priority $script_filename
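        # The helper methods used below (get_fastasampler_cmd, get_usearch_cmd, get_grep_cmd)
        # are assumed to build command strings equivalent to the fastasampler, usearch and
        # grep/sort steps sketched above, with each cluster node handling one slice of the
        # uniques fasta file.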
 
        
        
        calcnodes = C.calcnodes_cmd
        sqlImportCommand = C.mysqlimport_cmd
        #qsub = '/usr/local/sge/bin/lx24-amd64/qsub'
        clusterize = C.clusterize_cmd
        


        ###################################################################
        # use fasta.uniques file
        # split into smaller files
        # usearch --cluster each
        #######################################
        #
        # Split the uniques fasta and run UClust per node
        #
        #######################################
        qsub_prefix = 'clustergast_sub_'
        gast_prefix = 'gast_'
        if self.use_cluster:
            logger.info("Using cluster for clustergast")
        else:
            logger.info("Not using cluster")
        counter=0
        for key in self.idx_keys:
            print key
            counter +=1
            print "\nFile:",str(counter)
            if counter >= self.limit:
                pass
            
            cluster_nodes = C.cluster_nodes
            logger.info("Cluster nodes set to: "+str(cluster_nodes))
            output_dir = os.path.join(self.basedir,key)
            gast_dir = os.path.join(output_dir,'gast')
            # example idx_key: SMPL1_3_NNNNCGCTC_3
            #print 'samples',key,self.runobj.samples
            if key in self.runobj.samples:
                dna_region = self.runobj.samples[key].dna_region
            else:            
                dna_region = self.runobj.dna_region
            if not dna_region:
                logger.error("clustergast: We have no DNA Region: Setting dna_region to 'unknown'")
                dna_region = 'unknown'
                
            (refdb,taxdb) = self.get_reference_databases(dna_region)
            #print 'DBs',refdb,taxdb
            
            # if no dna_region OR no refdb can be found then use refssu;
            # if refdb contains refssu,
            # then add this to the grep command
            # and change usearch to usearch64
            unique_file = 'Not Found'
            names_file  = 'Not Found'
            if self.runobj.platform == 'vamps':    
                unique_file = os.path.join(output_dir, key+'.unique.fa')
            elif self.runobj.platform == 'illumina':
                file_prefix = self.runobj.samples[key].file_prefix
                unique_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique")
                names_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique.names")
                
            else:
                pass
                
            print 'UNIQUE FILE',unique_file

            #print gast_dir
            #sys.exit("EXIT")
            
            
            i = 0
            if cluster_nodes:
                grep_cmd = ['grep','-c','>',unique_file]
                logger.debug( ' '.join(grep_cmd) )
                facount = subprocess.check_output(grep_cmd).strip()
                logger.debug( key+' count '+facount)
                calcnode_cmd = [calcnodes,'-t',str(facount),'-n',str(cluster_nodes),'-f','1']
                
                calcout = subprocess.check_output(calcnode_cmd).strip()
                logger.debug("calcout:\n"+calcout)
                #calcout:
                # node=1 start=1 end=1 rows=1
                # node=2 start=2 end=2 rows=1
                # node=3 start=3 end=3 rows=1           
                lines = calcout.split("\n")
                gast_file_list = []
                for line in lines:
                    i += 1
                    if i >= cluster_nodes:
                        continue
                    script_filename = os.path.join(gast_dir,qsub_prefix + str(i))
                    gast_filename   = os.path.join(gast_dir, gast_prefix + str(i))
                    fastasamp_filename = os.path.join(gast_dir, 'samp_' + str(i))
                    clustergast_filename   = os.path.join(gast_dir, key+".gast_" + str(i))
                    gast_file_list.append(clustergast_filename)
                    usearch_filename= os.path.join(gast_dir, "uc_" + str(i))
                    log_file = os.path.join(gast_dir, 'clustergast.log_' + str(i))
                    
                    data = line.split()
                    
                    if len(data) < 2:
                        continue
                    start = data[1].split('=')[1]
                    end  = data[2].split('=')[1]
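                    # e.g. from the calcnodes line 'node=2 start=2 end=2 rows=1' shown above,
                    # start == '2' and end == '2' (both kept as strings for the command line)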
                    
                    if self.use_cluster:
                        fh = open(script_filename,'w')
                        qstat_name = "gast" + key + '_' + self.runobj.run + "_" + str(i)
                        fh.write("#!/bin/csh\n")
                        fh.write("#$ -j y\n" )
                        fh.write("#$ -o " + log_file + "\n")
                        fh.write("#$ -N " + qstat_name + "\n\n")
    
                        # setup environment
                        fh.write("source /xraid/bioware/Modules/etc/profile.modules\n")
                        fh.write("module load bioware\n\n")
                    
                    cmd1 = self.get_fastasampler_cmd(unique_file, fastasamp_filename,start,end)
                    

                    logger.debug("fastasampler command: "+cmd1)
                    
                    if self.use_cluster:
                        fh.write(cmd1 + "\n")
                    else:
                        subprocess.call(cmd1,shell=True)
                    
                    cmd2 = self.get_usearch_cmd(fastasamp_filename, refdb, usearch_filename)

                    logger.debug("usearch command: "+cmd2)
                    print 'usearch',cmd2
                    if self.use_cluster:
                        fh.write(cmd2 + "\n")
                    else:
                        subprocess.call(cmd2,shell=True)
                    
                    cmd3 = self.get_grep_cmd(usearch_filename, clustergast_filename)

                    logger.debug("grep command: "+cmd3)
                    if self.use_cluster:                
                        fh.write(cmd3 + "\n")
                        fh.close()
                        
                        # make script executable and run it
                        os.chmod(script_filename, stat.S_IRWXU)
                        # on vamps and vampsdev qsub cannot be run directly - unless it is called from the
                        # cluster-aware directories /xraid2-2/vampsweb/vamps and /xraid2-2/vampsweb/vampsdev,
                        # so the submission goes through C.qsub_cmd rather than the clusterize wrapper
                        qsub_cmd = C.qsub_cmd + " " + script_filename
                        logger.debug("qsub command: "+qsub_cmd)
                        
                        #subprocess.call(qsub_cmd, shell=True)
                        proc = subprocess.Popen(qsub_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        # proc.communicate will block - probably not what we want
                        #(stdout, stderr) = proc.communicate() # this would block here
                        #print stderr,stdout
    
                    else:
                        subprocess.call(cmd3,shell=True)
                        print cmd3
            
            else:
                #fastasamp_filename = os.path.join(gast_dir, 'samp')
                usearch_filename= os.path.join(gast_dir, "uc")
                clustergast_filename_single   = os.path.join(gast_dir, "gast"+dna_region)
                print usearch_filename,clustergast_filename_single
                cmd1 = self.get_usearch_cmd(unique_file,refdb,usearch_filename)
                print cmd1
                subprocess.call(cmd1,shell=True)
                cmd2 = self.get_grep_cmd(usearch_filename, clustergast_filename_single)
                print cmd2
                subprocess.call(cmd2,shell=True)
                
            if self.use_cluster:
                # wait here for all the clustergast scripts to finish
                temp_file_list = gast_file_list
            
                c = False
                maxwaittime = C.maxwaittime  # seconds
                sleeptime   = C.sleeptime    # seconds
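                # e.g. if C.maxwaittime were 36000 s and C.sleeptime 60 s (hypothetical values),
                # the loop below would give up after 600 polling passes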
                counter = 0
                while c == False:
                    counter += 1
                    if counter >= maxwaittime / sleeptime:
                        raise Exception("Max wait time exceeded in gast.py")
                    # rebuild the list each pass, keeping only the files that have not appeared yet
                    # (slicing the list out from under enumerate() could skip entries within a pass)
                    still_waiting = []
                    for file in temp_file_list:
                        if os.path.exists(file) and os.path.getsize(file) > 0:
                            logger.debug("Found file now removing from list: "+file)
                        else:
                            still_waiting.append(file)
                    temp_file_list = still_waiting
                    
                    if temp_file_list:
                        logger.info("waiting for clustergast files to fill...")
                        logger.debug(' '.join(temp_file_list))
                        logger.info("\ttime: "+str(counter * sleeptime)+" | files left: "+str(len(temp_file_list)))
                        time.sleep(sleeptime)
                    else:
                        c = True
                    
            # now concatenate all the clustergast_files into one file (if they were split)
            if cluster_nodes:
                # gast file
                clustergast_filename_single   = os.path.join(gast_dir, "gast"+dna_region)
                clustergast_fh = open(clustergast_filename_single,'w')
                # have to turn off cluster above to be able to 'find' these files for concatenation
                for n in range(1,i-1):
                    #cmd = "cat "+ gast_dir + key+".gast_" + str(n) + " >> " + gast_dir + key+".gast"
                    file = os.path.join(gast_dir, key+".gast_" + str(n))
                    if(os.path.exists(file)):                    
                        shutil.copyfileobj(open(file,'rb'), clustergast_fh)
                    else:
                        logger.info( "Could not find file: "+os.path.basename(file)+" Skipping")

                clustergast_fh.flush()
                clustergast_fh.close()
            
        if not self.test:    
            # remove tmp files
            for n in range(i+1):
                #print "Trying to remove "+os.path.join(gast_dir,"uc_"+str(n))
                if os.path.exists(os.path.join(gast_dir,"uc_"+str(n))):
                    os.remove(os.path.join(gast_dir,"uc_"+str(n)))
                    pass
                #print "Trying to remove "+os.path.join(gast_dir,"samp_"+str(n))
                if os.path.exists(os.path.join(gast_dir,"samp_"+str(n))):    
                    os.remove(os.path.join(gast_dir,"samp_"+str(n)))
                    pass
                #print "Trying to remove "+os.path.join(self.gast_dir,key+".gast_"+str(n))
                if os.path.exists(os.path.join(gast_dir,key+".gast_"+str(n))):    
                    os.remove(os.path.join(gast_dir,key+".gast_"+str(n)))
                    pass
                    
                    
        
        print "Finished clustergast"
        logger.info("Finished clustergast")
        return ("SUCCESS","Clustergast")
 def sequences(self,key,tax_collector, read_id_lookup, file_collector):
     """
     fill vamps_sequences.txt file
     
     """
     
     logger.info("Starting vamps_upload: sequences")
     print "Starting vamps_upload: sequences"
     if self.runobj.vamps_user_upload:
         project = self.runobj.project
         dataset = key
     else:
         if self.runobj.platform == 'illumina':
             project = self.runobj.samples[key].project
             dataset = self.runobj.samples[key].dataset
         elif self.runobj.platform == '454':
             pass
         else:
             pass
     
     project = project[0].capitalize() + project[1:]
     project_dataset = project+'--'+dataset
      # open the gast_concat file to get the distances and the refhvr ids
     refid_collector={}
     #if os.path.exists(gast_concat_file):
     for line in  open(file_collector['gast_concat_file'],'r'):
         line = line.strip()
         items=line.split()
         id=items[0]
         distance=items[1]
         refhvr_ids=items[2]
         refid_collector[id]={}
         refid_collector[id]['distance']=distance
         refid_collector[id]['refhvr_ids']=refhvr_ids
             
         
     
     
     fh = open(file_collector['sequences_file'],'w')
     fh.write("\t".join(["HEADER","project","dataset","taxonomy","refhvr_ids", "rank",
                         "seq_count","frequency","distance","read_id","project_dataset"] )+"\n")
     
     
     
     # open uniques fa file
     if os.path.exists(file_collector['unique_file']) and os.path.getsize(file_collector['unique_file']) > 0:
         f = FastaReader(file_collector['unique_file'])
         
         while f.next():
             datarow = ['']
             id = f.id.split('|')[0]
             seq = f.seq
             if id in read_id_lookup:
                 tax = read_id_lookup[id]
             else: 
                 tax = ''
             if tax in tax_collector:
                 rank = tax_collector[tax]['rank']
                 cnt = tax_collector[tax]['knt']
                 freq = tax_collector[tax]['freq']
             else:
                 rank = 'NA'
                 cnt  = 0
                 freq = 0
             if id in refid_collector:
                 distance = refid_collector[id]['distance']
                 refhvr_ids = refid_collector[id]['refhvr_ids']
             else:
                 distance = '1.0'
                 refhvr_ids = '0'
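              # reads with no gast_concat entry default to distance '1.0' and refhvr_ids '0',
              # mirroring the nonhit rows written out in gast_cleanup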
             
             datarow.append(seq)
             datarow.append(project)
             datarow.append(dataset)
             datarow.append(tax)
             datarow.append(refhvr_ids)
             datarow.append(rank)
             datarow.append(str(cnt))
             datarow.append(str(freq))
             datarow.append(distance)
             datarow.append(id)
             datarow.append(project_dataset)
             w = "\t".join(datarow)
             #print 'w',w
             fh.write(w+"\n")
             
             
     fh.close()
     logger.info("Finishing VAMPS sequences()")
     return refid_collector
 
 parser.add_argument('-l', '--loglevel',         required=False,   action="store",          dest = "loglevel",          default='ERROR',       
                                                     help = 'Sets logging level... DEBUG, INFO, WARNING, ERROR, CRITICAL')                                             
                                             
                                             
 steps ='gast,vampsupload'
 #steps ='gast'
 #steps ='vampsupload'
 args = parser.parse_args()
 
 # set logging
 loggerlevel = args.loglevel.upper()
 print "\nLog Level set to:",loggerlevel    
 logger.setLevel(loggerlevel)
 
 logger.info("Starting vamps_gast.py")
 
 # fill command line object
 data_object['datetime'] = str(datetime.date.today())
 data_object['runcode']    = args.runcode
 data_object['site']       = args.site
 data_object['user']       = args.user
 data_object['project']    = args.project[:1].capitalize() + args.project[1:]
 data_object['dataset']    = args.dataset
 data_object['dna_region'] = args.dna_region
 data_object['domain']     = args.domain
 data_object['from_fasta']     = args.from_fasta
 data_object['fasta_file']     = args.fasta_file
 data_object['baseoutputdir'] = args.baseoutputdir
 data_object['output_dir'] = args.output_dir
 data_object['load_db']      = args.load_db
Example #40
def gast(runobj):  
    
    logger.info("STARTING GAST()")
#     logger.info("vsearch version: " % utils.get_vsearch_version)
    # for vamps 'new_lane_keys' will be prefix 
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    # Should return a list not a string
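    # e.g. an illumina idx_key might look like 'SMPL1_3_NNNNCGCTC_3' (barcode index + run key)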
    idx_keys = get_keys(runobj)
    
    # get GAST object
    mygast = Gast(runobj, idx_keys)
    
    
    # Check for unique files and create them if not there
    result_code = mygast.check_for_unique_files(idx_keys)
    runobj.run_status_file_h.write(json.dumps(result_code)+"\n")
    if result_code['status'] == 'ERROR':
        logger.error("uniques not found failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db( runobj.site, runobj.run, "GAST ERROR", "uniques file not found - failed" )
        sys.exit("uniques not found failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db( runobj.site, runobj.run, result_code['status'], result_code['message'] )
        
    sleep(5)
    
    # CLUSTERGAST
    result_code = mygast.clustergast()
    runobj.run_status_file_h.write(json.dumps(result_code)+"\n")
    if result_code['status'] == 'ERROR':
        logger.error("clustergast failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db( runobj.site, runobj.run, "GAST ERROR", "clustergast failed" )
        sys.exit("clustergast failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db( runobj.site, runobj.run, result_code['status'], result_code['message'] )
        
    sleep(5)
    
    # GAST_CLEANUP
    result_code = mygast.gast_cleanup()
    runobj.run_status_file_h.write(json.dumps(result_code)+"\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast_cleanup failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db( runobj.site, runobj.run, "GAST ERROR", "gast_cleanup failed" )
        sys.exit("gast_cleanup failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db( runobj.site, runobj.run, result_code['status'], result_code['message'] )
        
    sleep(5)
    
    # GAST2TAX
    result_code = mygast.gast2tax()
    runobj.run_status_file_h.write(json.dumps(result_code)+"\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast2tax failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db( runobj.site, runobj.run, "GAST ERROR", "gast2tax failed" )
        sys.exit("gast2tax failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db( runobj.site, runobj.run, result_code['status'], result_code['message'] )