def store_atomic_data_for_contigs_and_splits(self, sample_id, contigs, db):
    self.progress.new('Storing atomic_data')

    num_contigs = pp(len(contigs))
    cur_contig = 1

    # this loop will get atomic_data information from Contig instances and
    # store them into the db at once. this was broken down into about 10
    # functions, but this structure seems to be the most efficient although
    # it looks crappy:
    for contig_name in contigs:
        self.progress.update("Processing contig %s of %s" % (pp(cur_contig), num_contigs))
        cur_contig += 1

        contig = contigs[contig_name]
        contig_atomic_data = contig.get_atomic_data_dict()

        self.atomic_data_contigs[contig.name] = {'contig': contig.name}
        for atomic_data_field in t.atomic_data_table_structure[1:]:
            self.atomic_data_contigs[contig.name][atomic_data_field] = contig_atomic_data[atomic_data_field]

        # contig is done, deal with splits in it:
        for split in contig.splits:
            split_atomic_data = split.get_atomic_data_dict()
            self.atomic_data_splits[split.name] = {'contig': split.name}
            for atomic_data_field in t.atomic_data_table_structure[1:]:
                self.atomic_data_splits[split.name][atomic_data_field] = split_atomic_data[atomic_data_field]

    self.progress.update("Generating tables ...")
    gen_atomic_data_tables_for_contigs_and_splits(self.atomic_data_splits, self.atomic_data_contigs, db)
    self.progress.end()
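# A minimal sketch (not from the source) of what the two dicts built above look
# like by the time they reach gen_atomic_data_tables_for_contigs_and_splits().
# Field names other than 'contig' are illustrative assumptions; the
# authoritative column list is t.atomic_data_table_structure[1:]:
#
#   self.atomic_data_contigs = {'contig_0001': {'contig': 'contig_0001',
#                                               'mean_coverage': 4.2, ...}}
#   self.atomic_data_splits  = {'contig_0001_split_0001': {'contig': 'contig_0001_split_0001',
#                                                          'mean_coverage': 4.0, ...}}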
def clusterize(self, parts):
    # create a random 10-character identifier for cluster jobs:
    identifier = ''.join(random.choice(string.ascii_uppercase) for x in range(10))

    for part in parts:
        command = self.command % {'binary': self.binary, 'part': part}

        # create the shell script for this part
        shell_script = part + '.sh'
        with open(shell_script, 'w') as f:
            f.write(QSUB_SCRIPT % {'log': part + '.log',
                                   'identifier': identifier,
                                   'command': command})

        # submit script to cluster
        utils.run_command('qsub %s' % shell_script)

    while True:
        qstat_info = self.get_qstat_info(identifier)
        total_processes = sum(qstat_info.values())
        if total_processes == 0:
            break

        self.progress.update('Qstat Info :: Total Jobs: %s, %s'
                             % (pp(total_processes),
                                ', '.join(['%s: %s' % (x, pp(qstat_info[x])) for x in qstat_info])))

        time.sleep(5)

    return True
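# Hedged usage sketch for clusterize() above. It assumes the surrounding class
# provides self.command (a template with %(binary)s and %(part)s placeholders),
# self.binary, self.progress, and get_qstat_info(); none of that setup appears
# in this file, so treat this as illustrative only:
#
#   c.command = '%(binary)s --input %(part)s'   # hypothetical template
#   parts = c.split_input_file()
#   c.clusterize(parts)   # submits one qsub job per part, then polls qstat
#                         # every 5 seconds until no jobs remain for the id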
def split_input_file(self):
    parts = []
    next_part = 1
    part_obj = None

    if self.input_is_fasta:
        fasta = u.SequenceSource(self.input_file_path)

        while fasta.next():
            if (fasta.pos - 1) % self.num_entries_per_file == 0:
                self.progress.update('Creating part: ~ %s' % (pp(next_part)))

                # close the previous part (if any) and start a new one
                if part_obj:
                    part_obj.close()

                file_path = os.path.join(self.tmp_dir, 'part-%08d.fa' % next_part)
                parts.append(file_path)
                next_part += 1
                part_obj = open(file_path, 'w')

            part_obj.write('>%s\n' % fasta.id)
            part_obj.write('%s\n' % fasta.seq)

        if part_obj:
            part_obj.close()

    return parts
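# Behavior sketch for split_input_file() above, derived from the loop logic:
# with num_entries_per_file = 2 and a FASTA file holding four sequences, two
# part files are created in self.tmp_dir and returned in order:
#
#   part-00000001.fa   (entries 1-2)
#   part-00000002.fa   (entries 3-4)
#
# fasta.pos is assumed to be 1-based here, so (fasta.pos - 1) % N == 0 opens a
# new part exactly every N entries.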