def main():
    """Load split-library sequence data for one analysis into the database.

    All inputs come from command-line options; prints a completion message
    when loading finishes.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)
    submit_to_test_db=opts.submit_to_test_db
    fasta_file_paths=opts.fasta_file_paths  # NOTE(review): unused below — confirm it can be dropped
    study_id=opts.study_id                  # NOTE(review): unused below
    output_dir=opts.output_dir
    analysis_id=opts.analysis_id
    seq_run_id=opts.seq_run_id
    user_id=opts.user_id                    # NOTE(review): unused below
    split_lib_md5=opts.split_lib_md5

    # NOTE(review): compares against the *string* 'False', so production is
    # selected only when the option is literally the text 'False' — confirm
    # the option is declared as a string, not a bool.
    if submit_to_test_db == 'False':
        # Load the data into the production database
        data_access = data_access_factory(ServerConfig.data_access_type)
    else:
        # Load the data into the test database
        # NOTE(review): DataAccessType is assumed to be imported at module
        # level — not visible in this chunk.
        data_access = data_access_factory(DataAccessType.qiime_test)

    split_library_id=load_split_lib_sequences(data_access,output_dir,
                                              analysis_id, seq_run_id,
                                              split_lib_md5)

    print 'Completed database loading.'
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) fasta_file = dict(MinimalFastaParser(open(opts.input_file,'U'))) fname=split(opts.input_file)[-1].split('_') ref_dataset=fname[0] if ref_dataset=='gg': reference_dataset='GREENGENES_REFERENCE' threshold=fname[1] print threshold try: from data_access_connections import data_access_factory from enums import ServerConfig import cx_Oracle if opts.submit_to_test_db: data_access = data_access_factory(DataAccessType.qiime_test) else: data_access = data_access_factory(ServerConfig.data_access_type) except ImportError: print "NOT IMPORTING QIIMEDATAACCESS" pass prokmsas=[] for prok_id in fasta_file: prokmsas.append('%s\t%s\t%s' % (str(prok_id),str(threshold), reference_dataset)) con = data_access.getSFFDatabaseConnection() cur = con.cursor() data_types=['s','i','s'] for input_set in input_set_generator(prokmsas, cur,data_types): valid=data_access.loadSeqToSourceMap(True,input_set) if not valid: raise ValueError, 'Error: Unable to load Sequence to Source Map!'
def export_full_db_to_fasta(output_fasta_name, distinct_list): """ Exports the entire sequence collection to fasta This function exports the entire database to fasta format. It does not care about public/private nor does it depend on any linkages to other metadata. """ output_fasta = open(output_fasta_name, 'w') data_access = data_access_factory(ServerConfig.data_access_type) seqs = data_access.getSequencesFullDatabase() md5s = [] for seq in seqs: sequence_name, sequence_string, md5_checksum = seq[0], seq[1], seq[2] if distinct_list: if md5_checksum not in md5s: md5s.append(md5_checksum) output_fasta.write('>%s\n%s\n' % (sequence_name, sequence_string)) print 'Exporting sequence: %s' % sequence_name else: print 'Duplicate checksum found for sequence name: %s. Skipping...' % sequence_name else: #output_fasta.write('>%s\n%s\n' % (sequence_name, sequence_string)) print 'Exporting sequence: %s' % sequence_name
def __init__(self, *args, **kwargs):
    """Set up database access, user environment info and an empty job table."""
    environ = os.environ
    self.data_access = data_access_factory(ServerConfig.data_access_type)
    self.username = environ['USER']
    self.home = environ['HOME']
    # Maps a pbs job id to its job object.
    self.Jobs = {}
    self.interval = 0
    super(Poller, self).__init__(*args, **kwargs)
def export_fasta_from_sample(study_id, sample_id, output_fasta):
    """ Exports all sequences for the supplied sample_id

    Writes every sequence directly mapped to sample_id within study_id to
    output_fasta, which may be either a path string or an already-open
    writable file object.
    """
    # A string means we were handed a path and must manage the file ourselves
    opened_locally = isinstance(output_fasta, str)
    if opened_locally:
        output_fasta = open(output_fasta, "w")

    # Get our copy of data_access
    data_access = data_access_factory(ServerConfig.data_access_type)
    seqs = data_access.getSequencesFromSample(study_id, sample_id)
    for seq_name in seqs:
        output_fasta.write(">%s\n%s\n" % (seq_name, seqs[seq_name]))

    # Only close what this function opened
    if opened_locally:
        output_fasta.close()
def export_fasta_from_study(study_id, output_fasta):
    """ Exports a fasta file for all sequences found in the supplied study

    Iterates every sample of the study and delegates the actual writing to
    export_fasta_from_sample(). output_fasta may be a path string or an
    already-open writable file object.
    """
    # A string means we were handed a path and must manage the file ourselves
    opened_locally = isinstance(output_fasta, str)
    if opened_locally:
        output_fasta = open(output_fasta, "w")

    # Get our copy of data access
    data_access = data_access_factory(ServerConfig.data_access_type)

    # Export each sample of this study in turn
    for sample_id in data_access.getSampleIDsFromStudy(study_id):
        export_fasta_from_sample(study_id, sample_id, output_fasta)

    # Only close what this function opened
    if opened_locally:
        output_fasta.close()
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) try: from data_access_connections import data_access_factory from enums import ServerConfig import cx_Oracle data_access = data_access_factory(ServerConfig.data_access_type) except ImportError: print "NOT IMPORTING QIIMEDATAACCESS" pass query_dict=eval(open(opts.query).read()) table_col_value={} for i in query_dict: if i not in ['otu_table','mapping_file','pcoa_plot']: table_col_value[i]=query_dict[i] fs_fp=opts.fs_fp web_fp=opts.web_fp file_name_prefix=opts.fname_prefix user_id=int(opts.user_id) meta_id=int(opts.meta_id) beta_metric=opts.beta_metric rarefied_at=int(opts.rarefied_at) write_mapping_and_pcoa_plots(data_access, table_col_value, fs_fp, web_fp, file_name_prefix,user_id,meta_id,beta_metric,rarefied_at)
def get_ag_metadata_bulk(barcodes, participant_type):
    """Yield (success, payload) for each barcode's American Gut metadata.

    barcodes: iterable of barcode strings (e.g. an open file with one
        barcode per line)
    participant_type: either 'human' or 'animal'

    This is a generator: yields (True, metadata_row) when exactly one
    metadata record is found for a barcode, otherwise (False, barcode).

    Raises ValueError (on first iteration) if participant_type is invalid.
    """
    if participant_type not in ('human', 'animal'):
        raise ValueError("participant_type must be either 'human' or 'animal'")

    ag_data_access = data_access_factory(ServerConfig.data_access_type,
                                         'american_gut')

    # Removed an unused `results = []` accumulator — this function only
    # ever yielded, it never returned the list.
    for line in barcodes:
        bc = line.strip()
        if participant_type == 'human':
            metadata = ag_data_access.AGGetBarcodeMetadata(bc)
        else:  # participant_type == 'animal'
            metadata = ag_data_access.AGGetBarcodeMetadataAnimal(bc)
        # Exactly one record means success; anything else flags the barcode.
        if len(metadata) != 1:
            yield False, bc
        else:
            yield True, metadata[0]
def __init__(self, metadataFile, study_id):
    """Remember the metadata file and study, grab a data-access handle,
    and start with empty result/log collections."""
    self._invalid_rows = []
    self._columns = []
    self._log = []
    self._metadataFile = metadataFile
    self._data_access = data_access_factory(ServerConfig.data_access_type)
    self._study_id = study_id
def __init__(self, study_id, web_app_user_id, logger):
    """Store identifiers and logger plus the validation constants."""
    self.data_access = data_access_factory(ServerConfig.data_access_type)
    self.study_id = study_id
    self.web_app_user_id = web_app_user_id
    # Values treated as "missing" during validation
    self.invalid_values = set(['', ' ', None, 'None'])
    # Columns every study is required to provide
    self.required_columns = set(['library_construction_protocol',
                                 'experiment_design_description',
                                 'taxon_id',
                                 'description'])
    # Filled in lazily once the study is looked up
    self.study_info = None
    self.logger = logger
def main():
    """Parse options and load the Greengenes sequences into the database."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Pull out the options we care about
    otu_seqs_file = opts.otu_seqs_file
    debug = opts.debug  # currently unused
    data_access = data_access_factory(ServerConfig.data_access_type)

    # Load the GG sequences
    load_gg_seqs(otu_seqs_file, data_access)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) try: from data_access_connections import data_access_factory from enums import ServerConfig import cx_Oracle if opts.submit_to_test_db: data_access = data_access_factory(DataAccessType.qiime_test) else: data_access = data_access_factory(ServerConfig.data_access_type) except ImportError: print "NOT IMPORTING QIIMEDATAACCESS" pass input_dir=opts.input_otu_dir load_otu_mapping(data_access,input_dir)
def main():
    """Parse options, then load GG sequences for this study's processed data."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Options used below
    otu_seqs_file = opts.otu_seqs_file
    debug = opts.debug  # currently unused
    data_access = data_access_factory(ServerConfig.data_access_type)

    # Get results for all processed_data_ folders in this study's directory
    load_gg_seqs(otu_seqs_file, data_access)
def run_process_illumina_through_split_lib(study_id,run_prefix,input_fp, mapping_fp, output_dir, command_handler, params, qiime_config, write_to_all_fasta=False, status_update_callback=print_to_stdout): """ NOTE: Parts of this function are a directly copied from the run_qiime_data_preparation function from the workflow.py library file in QIIME. The steps performed by this function are: 1) De-multiplex sequences. (split_libraries_fastq.py) """ # Prepare some variables for the later steps filenames=input_fp.split(',') commands = [] create_dir(output_dir) python_exe_fp = qiime_config['python_exe_fp'] script_dir = get_qiime_scripts_dir() logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) # copy the mapping file copied_mapping=split(mapping_fp)[-1] mapping_input_fp_copy=join(output_dir, copied_mapping) copy_mapping_cmd='cp %s %s' % (mapping_fp,mapping_input_fp_copy) commands.append([('CopyMapping', copy_mapping_cmd)]) # sort the filenames filenames.sort() # determine which file is seq-file and which is barcode-file and associate # to mapping file if len(filenames) == 1: try: # Format of sample_id needs to be seqs_<sample_name>.<sequence_prep_id>.fastq data_access = data_access_factory(ServerConfig.data_access_type) sql = """ select s.sample_name || '.' || sp.sequence_prep_id from sample s inner join sequence_prep sp on s.sample_id = sp.sample_id where s.study_id = {0} and sp.run_prefix = '{1}' """.format(study_id, run_prefix[:-1]) sample_and_prep = data_access.dynamicMetadataSelect(sql).fetchone()[0] input_str = '-i {0} --sample_id {1}'.format(filenames[0], sample_and_prep) except Exception, e: error = 'Failed to obtain sample and sequence prep info for study_id {0} and run_prefix {1}\n'.format(study_id, run_prefix) error += 'SQL was: \n {0} \n'.format(sql) error += 'Original exception was: \n {0}'.format(str(e)) raise Exception(error)
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) try: from data_access_connections import data_access_factory from enums import ServerConfig import cx_Oracle if opts.submit_to_test_db: data_access = data_access_factory(DataAccessType.qiime_test) else: data_access = data_access_factory(ServerConfig.data_access_type) except ImportError: print "NOT IMPORTING QIIMEDATAACCESS" pass fasta_files=opts.processed_fasta_fnames study_id=opts.study_id analysis_id=submit_processed_data_to_db(data_access,fasta_files=fasta_files,metadata_study_id=study_id)
def submitJobsToQiime(study_id, user_id, mapping_file_dir,process_only,submit_to_test_db):
    """Pair each study mapping file with its SFF files and submit QIIME jobs.

    For every generated mapping file named '<run_prefix>__...', find the SFF
    files whose names match the run prefix, pick the parameter file that
    matches the run's barcode length, and submit one job per mapping file.
    """
    # Instantiate one copy of data access for this process
    data_access = data_access_factory(ServerConfig.data_access_type)

    # Get the SFF files associated to this study
    sff_files = data_access.getSFFFiles(study_id)

    # Get the sequencing platform associated to this study
    sequencing_platform = data_access.getStudyPlatform(study_id)

    # Generate the mapping files
    mapping_files = writeMappingFiles(study_id, data_access, mapping_file_dir)

    # Figure out which mapping file pairs with each SFF file
    file_map = {}
    param_map={}
    for mapping_file in mapping_files:
        # Skip the mapping file if it's not of the correct naming format
        # ('<run_prefix>__<rest>')
        if len(mapping_file.split('__')) != 2:
            continue
        run_prefix = os.path.basename(mapping_file).split('__')[0]
        matching_sff_files = []

        # Find the proper params file for this run's barcode length
        barcode_length = data_access.checkRunPrefixBarcodeLengths(study_id,
                                                                  run_prefix)
        param_file = '%s/git/qiime_web_app/python_code/parameter_files/%s__custom_parameters_uclust_ref_gg97.txt' % (ServerConfig.home,str(barcode_length))
        param_map[mapping_file]=param_file

        for sff_file in sff_files:
            sff_file_basename = os.path.splitext(os.path.basename(sff_file))[0].upper()
            # If the run_prefix matches the SFF file name exactly, assume only
            # one SFF for this run.
            # NOTE(review): sff_file_basename already has its extension
            # stripped, so this second splitext is a no-op unless the name
            # contains extra dots — confirm intent.
            if run_prefix.upper() == os.path.splitext(sff_file_basename)[0].upper():
                matching_sff_files.append(sff_file)
                file_map[mapping_file] = matching_sff_files
                continue
            # If the run_prefix is contained in the file name, find all that
            # match and submit them together with the current mapping file
            elif sff_file_basename.startswith(run_prefix.upper()):
                # If it's the first item for this mapping file name, assign
                # the list
                if not file_map.get(mapping_file):
                    file_map[mapping_file] = matching_sff_files
                file_map[mapping_file].append(sff_file)
            # If we get here, there are extra SFF files with no matching
            # mapping file. For now, do nothing... may need to add some
            # handling code at a later date.
            else:
                pass

    # Submit jobs to the queue
    for mapping_file in file_map:
        submitJob(study_id, user_id, param_map[mapping_file], mapping_file,
                  sequencing_platform, file_map[mapping_file],process_only,
                  submit_to_test_db, data_access)
def main():
    """Rewrite a rep-set fasta so each header's first two fields are swapped,
    and emit a matching OTU-id mapping file for topiary explorer."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # Command-line parameters
    rep_set_fp = opts.rep_set_fp
    otu_map_fp = opts.otu_map_fp
    sequence_source = opts.sequence_source
    output_dir = opts.output_dir

    # Make sure the output directory exists
    create_dir(output_dir)

    # Database connection
    data_access = data_access_factory(ServerConfig.data_access_type)

    # PL/SQL query: ssu_sequence_id -> reference_id for this source
    statement = ("select rf.ssu_sequence_id,rf.reference_id from gg_plus_denovo_reference rf "
                 "inner join sequence_source ss on rf.sequence_source_id=ss.sequence_source_id "
                 "where ss.source_name='%s'" % (sequence_source))

    # Run the statement
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    results = cur.execute(statement)

    # Lookup dictionary built from the query results
    ssu_id_to_ref_id = {}
    for row in results:
        ssu_id_to_ref_id[str(row[0])] = str(row[1])

    # New fasta with updated assignments
    new_fasta_fp = join(output_dir, 'rep_set_reassigned_otu_ids.fasta')
    openfasta = open(new_fasta_fp, 'w')

    # Mapping file for topiary explorer
    new_map_fp = join(output_dir, 'new_otu_id_mapping.txt')
    openmap = open(new_map_fp, 'w')

    # Parse the rep set and write both outputs, swapping the first two
    # whitespace-separated header fields of every record
    for seq_name, seq in MinimalFastaParser(open(rep_set_fp, 'U')):
        fields = seq_name.split()
        fields[0], fields[1] = fields[1], fields[0]
        openfasta.write('>%s\n%s\n' % (' '.join(fields), seq))
        openmap.write('%s\t%s\n' % (fields[0], sequence_source))

    openfasta.close()
    openmap.close()
def test_print_study_info_and_values_table(self):
    """ test_print_study_info_and_values_table: This function write the
        Study summary information below the select-box
    """
    data_access = data_access_factory(ServerConfig.data_access_type)
    results = data_access.getQiimeSffDbSummary(0)
    analysis_data = [row for row in results]
    self.assertEqual(print_study_info_and_values_table(analysis_data,
                                                       data_access),
                     exp_output)
def main():
    """Submit a study's sequence data (and, unless it is a metagenome, its
    OTU mapping) to the database, dispatching on sequencing platform."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)
    submit_to_test_db=opts.submit_to_test_db
    fasta_file_paths=opts.fasta_file_paths
    study_id=opts.study_id
    output_dir=opts.output_dir
    platform=opts.platform
    user_id=opts.user_id

    # NOTE(review): compares against the *string* 'False' — production is
    # used only when the option is literally the text 'False'; confirm the
    # option is declared as a string, not a bool.
    if submit_to_test_db == 'False':
        # Load the data into the production database
        data_access = data_access_factory(ServerConfig.data_access_type)
    else:
        # Load the data into the test database
        data_access = data_access_factory(DataAccessType.qiime_test)

    # Get all of the fasta files and submit per sequencing platform.
    # NOTE(review): an unrecognized platform leaves analysis_id unbound and
    # crashes at load_otu_mapping below — confirm platforms are validated
    # upstream.
    if (platform=='FLX' or platform=='TITANIUM'):
        print 'Submitting SFF data to database...'
        analysis_id = submit_sff_and_split_lib(data_access, fasta_file_paths,
                                               study_id)
    elif platform=='ILLUMINA':
        print 'Submitting Illumina data to database...'
        analysis_id = submit_illumina_and_split_lib(data_access,
                                                    fasta_file_paths,
                                                    study_id,output_dir)
    elif platform=='FASTA':
        print 'Submitting FASTA data to database...'
        analysis_id = submit_fasta_and_split_lib(data_access,
                                                 fasta_file_paths, study_id,
                                                 output_dir)

    study_info=data_access.getStudyInfo(study_id,user_id)
    if study_info['investigation_type'].lower() == 'metagenome':
        # skip OTU loading for metagenomes
        pass
    else:
        print 'Submitting OTU data to database...'
        load_otu_mapping(data_access, output_dir, analysis_id)

    print 'Completed database loading.'
def validateFileExistence(study_id, study_dir):
    ''' check the existence of sequence files in the filesystem
        for each sequence filename in the database.

        study_id: study whose SFF filenames are read from the database
        study_dir: directory expected to contain those files

        Returns the list of database filenames missing from study_dir.
    '''
    data_access = data_access_factory(ServerConfig.data_access_type)

    # PERF FIX: the directory used to be re-listed on every loop iteration
    # and membership-tested against a list; list it once into a set.
    files_present = set(os.listdir(study_dir))

    absence_list = []
    for filename in data_access.getSFFFiles(study_id):
        filename = os.path.basename(filename)
        if filename not in files_present:
            absence_list.append(filename)
    return absence_list
def main():
    """Send kit-verification e-mails to American Gut participants whose
    verification message has not been sent yet, marking each kit as
    notified on success. With --really unset this is a dry run."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    really = opts.really

    # if level is set to DEBUG log messages will be written
    logging.basicConfig(filename=opts.output_log_fp, level=logging.DEBUG, \
        format='[%(asctime)s].%(levelname)s: %(message)s')

    ag_data_access = data_access_factory(ServerConfig.data_access_type,
                                         'american_gut')

    # cursor to update the sent e-mails
    con = ag_data_access.getMetadataDatabaseConnection()

    # All kits still awaiting a verification e-mail
    cursor = ag_data_access.dynamicMetadataSelect("""
        select al.name, al.email, ak.kit_verification_code,
        ak.supplied_kit_id, ak.kit_password, ak.swabs_per_kit
        from ag_login al
        inner join ag_kit ak
        on al.ag_login_id = ak.ag_login_id
        where ak.verification_email_sent = 'n'
        order by al.email""")

    for entry in cursor:
        recipient_name, target_email, verification_code, supplied_kit_id,\
            kit_password, swabs_per_kit = entry
        logging.debug('\n+++++++++++++++++++++++++++++++++++++++++++++++++++\n')
        logging.debug("""recipient_name {0}, target_email {1}, """
                      """verification_code {2}, supplied_kit_id {3}, kit_password {4}, """
                      """swabs_per_kit {5}\n""".format(recipient_name,
                      target_email, verification_code, supplied_kit_id,
                      kit_password, swabs_per_kit))

        buffer_message = BODY_MESSAGE.format(recipient_name, supplied_kit_id,
                                             verification_code)
        try:
            logging.debug('Message is %s\n' % buffer_message)
            logging.debug('Sent to %s\n' % target_email)
            if really == True:
                send_email(buffer_message, SUBJECT, target_email)
                # Mark the kit as notified.
                # NOTE(review): SQL built by interpolation — a kit id
                # containing a quote breaks this; parameterize if kit ids
                # are not guaranteed safe.
                query_string = "update ag_kit set verification_email_sent = 'y' where supplied_kit_id = '{0}'".format(supplied_kit_id)
                # NOTE(review): each call opens a fresh cursor that is never
                # closed; both run on the same connection so the commit still
                # covers the update.
                con.cursor().execute(query_string)
                con.cursor().execute('commit')
            else:
                logging.debug('DRY RUNNING, NOT SENDING A MESSAGE\n')
        except Exception, e:
            # Log and continue with the next recipient
            logging.debug('Exception value is %s\n' % str(e))
            logging.debug('ERROR SENDING TO: %s' % target_email)
        logging.debug('+++++++++++++++++++++++++++++++++++++++++++++++++++\n\n')
def exportStudyToEBISRA(study_id, user_id):
    """Queue a torque job that exports the given study to EBI SRA.

    Raises an Exception when the job row cannot be created.
    """
    # One data-access handle for the lifetime of this call
    data_access = None
    try:
        data_access = data_access_factory(ServerConfig.data_access_type)
        # Submit the export job
        job_id = data_access.createTorqueJob('ExportToEBISRAHandler',
                                             'StudyID=%s' % study_id,
                                             user_id, study_id)
        # A negative id signals the job row could not be created
        if job_id < 0:
            raise Exception('There was an error creating the job. Please contact the system administrator.')
    finally:
        data_access = None
def main():
    """Submit a study's SFF data and OTU mapping to the database."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)
    submit_to_test_db=opts.submit_to_test_db
    fasta_file_paths=opts.fasta_file_paths
    study_id=opts.study_id
    output_dir=opts.output_dir

    # NOTE(review): compares against the *string* 'False' — production is
    # used only when the option is literally the text 'False'; confirm the
    # option is declared as a string, not a bool.
    if submit_to_test_db == 'False':
        # Load the data into the production database
        data_access = data_access_factory(ServerConfig.data_access_type)
    else:
        # Load the data into the test database
        data_access = data_access_factory(DataAccessType.qiime_test)

    # Get all of the fasta files
    print 'Submitting SFF data to database...'
    analysis_id = submit_sff_and_split_lib(data_access, fasta_file_paths,
                                           study_id)

    print 'Submitting OTU data to database...'
    load_otu_mapping(data_access, output_dir, analysis_id)

    print 'Completed database loading.'
def run(self):
    """Write each form item of this worker's item list to the database.

    Items are keyed as 'field_type:row_num:<?>:field_name'; the value comes
    from self.form. Progress and errors are reported through
    self.updateCallback() / self.errorCallback().
    """
    da = data_access_factory(ServerConfig.data_access_type)
    item_count = len(self.item_list)  # NOTE(review): unused — confirm it can go
    for item in self.item_list:
        # Reset the key_field
        key_field = None

        # Put the parts into more meaningful variables.
        # NOTE(review): parts[2] is deliberately skipped — confirm what it
        # carries in the form key.
        parts = item.split(':')
        field_type = parts[0]
        row_num = parts[1]
        field_name = parts[3]
        field_value = self.form[item]

        # Figure out what the key field is going to be
        if field_type == 'sample':
            key_field = self.sample_key_fields[row_num]
        elif field_type == 'prep':
            key_field = self.prep_key_fields[row_num]
        elif field_type == 'study':
            key_field = self.study_name

        # NOTE(review): if this lookup fails, host_key_field keeps the value
        # from the previous iteration (or is unbound on the first one) —
        # verify that is intended.
        if len(self.host_key_fields) > 0 and field_type == 'sample':
            try:
                host_key_field = self.host_key_fields[row_num]
            except:
                # Do nothing if not found
                pass
        else:
            host_key_field = None

        # Just in case...
        if key_field == None:
            continue

        # For oracle, clean up single quotes
        field_value = field_value.replace('\'', '\'\'')

        try:
            result = da.writeMetadataValue(field_type, key_field,
                                           field_name, field_value, \
                self.study_id, host_key_field, row_num, self.lock)
            # Notify parent that an item was inserted
            self.updateCallback()
        except Exception, e:
            self.errorCallback(e)
def test_process_items(self): """ test_process_items: This fxn processes the md5's and checks against the DB """ #make sure files gets cleaned up self.files_to_remove.append(self.leftovers) self.files_to_remove.append(self.input_fasta) otu_map={} data_access = data_access_factory(ServerConfig.data_access_type) process_items(md5_list, md5_sequence_map, md5_seq_id_map, otu_map, data_access, open(self.leftovers,'w')) #check the outputs are correct self.assertEqual(open(self.leftovers).read(),exp_failures2) self.assertEqual(len(otu_map.keys()),1)
def submit_mapping_to_database(processed_results, debug=True): data_access = data_access_factory(ServerConfig.data_access_type) # Iterate over each folder's data - can be many processed_data_ folders for a single study for directory in processed_results: # Unpack the values for each processed_data_ directory mapping, seq_header_lines, otu_header_lines = processed_results[directory] # Unpack and iterate over each mapping for sample_name, sequence_count, otu_count, percent_assignment in mapping: sequence_prep_id = sample_name.split('.')[-1] # Write values to database for this sequence_prep_id data_access.updateSeqOtuCounts(sequence_prep_id, sequence_count, otu_count, percent_assignment) if debug: print 'added to database: prep: {0}, seq_count: {1}, otu_count: {2}'.format(\ str(sequence_prep_id), str(sequence_count), str(otu_count))
def __init__(self, study_id, web_app_user_id, debug = False):
    """Prepare an EBI SRA export session for one study: URLs (filled in
    later), paths, a file logger, a REST helper and a data-access handle."""
    # Endpoint URLs; populated later in the export workflow
    self.hostname = None
    self.study_url = None
    self.sample_url = None
    self.library_url = None
    self.sequence_url = None
    self.study_id = study_id
    # NOTE(review): hard-coded filesystem layout — confirm it matches the
    # deployment environment.
    self.base_study_path = '/home/wwwuser/user_data/studies/study_{0}'.format(study_id)
    self.web_app_user_id = web_app_user_id

    # Set up a logger so we can see what's going on
    log_file_path = join(self.base_study_path, 'ebi_export_log.txt')
    self.logger = DataLogger(log_file_path, debug)
    self.rest_data_helper = RestDataHelper(study_id, web_app_user_id,
                                           self.logger)
    self.data_access = data_access_factory(ServerConfig.data_access_type)
    #self.errors = []
    self.debug = debug
def export_db_to_fasta(output_fasta_name): """ Exports sequences to fasta that have corresponding metadata This function exports all sequences to fasta which have corresponding metadata in the metadata schema. It will skip the rest. It DOES export private samples. """ output_fasta = open(output_fasta_name, 'w') data_access = data_access_factory(ServerConfig.data_access_type) # Get all studies from the database results = data_access.getUserStudyNames(12161, 1,'qiime') for study_id, study_name,t,s in results: print '------------------------ Exporting data from study ID: %s' % study_id print study_name print '\n\n' export_fasta_from_study(study_id, output_fasta)
def export_fasta_from_sample(study_id, sample_id, output_fasta):
    """Append every sequence of one sample to an output fasta.

    output_fasta may be a path string (opened and closed here) or an
    already-open writable file object (left open for the caller).
    """
    # A string means we were given a path and must manage the file ourselves
    close_when_done = False
    if isinstance(output_fasta, str):
        output_fasta = open(output_fasta, 'w')
        close_when_done = True

    # Get our copy of data_access
    data_access = data_access_factory(ServerConfig.data_access_type)
    sample_seqs = data_access.getSequencesFromSample(study_id, sample_id)
    for seq_id in sample_seqs:
        output_fasta.write('>%s\n%s\n' % (seq_id, sample_seqs[seq_id]))

    # Close the file only if this function opened it
    if close_when_done:
        output_fasta.close()
def export_fasta_from_study(study_id, output_fasta):
    """Append every sample's sequences for one study to an output fasta.

    output_fasta may be a path string (opened and closed here) or an
    already-open writable file object (left open for the caller).
    """
    # A string means we were given a path and must manage the file ourselves
    close_when_done = False
    if isinstance(output_fasta, str):
        output_fasta = open(output_fasta, 'w')
        close_when_done = True

    # Get our copy of data access and delegate per sample
    data_access = data_access_factory(ServerConfig.data_access_type)
    for sample_id in data_access.getSampleIDsFromStudy(study_id):
        export_fasta_from_sample(study_id, sample_id, output_fasta)

    # Close the file only if this function opened it
    if close_when_done:
        output_fasta.close()