Example 1
    def store_atomic_data_for_contigs_and_splits(self, sample_id, contigs, db):
        self.progress.new('Storing atomic_data')

        num_contigs = pp(len(contigs))
        cur_contig = 1

        # this loop will get atomic_data information from Contig instances and store it in the db
        # at once. this was broken down into about 10 functions, but this structure seems to be the most efficient
        # although it looks crappy:
        for contig_name in contigs:
            self.progress.update("Processing contig %s of %s" % (pp(cur_contig), num_contigs))
            contig = contigs[contig_name]
            contig_atomic_data = contig.get_atomic_data_dict()

            self.atomic_data_contigs[contig.name] = {'contig': contig.name}
            for atomic_data_field in t.atomic_data_table_structure[1:]:
                self.atomic_data_contigs[contig.name][atomic_data_field] = contig_atomic_data[atomic_data_field]

            # contig is done, deal with splits in it:
            for split in contig.splits:
                split_atomic_data = split.get_atomic_data_dict()
                self.atomic_data_splits[split.name] = {'contig': split.name}
                for atomic_data_field in t.atomic_data_table_structure[1:]:
                    self.atomic_data_splits[split.name][atomic_data_field] = split_atomic_data[atomic_data_field]

            cur_contig += 1

        self.progress.update("Generating tables ...")
        gen_atomic_data_tables_for_contigs_and_splits(self.atomic_data_splits, self.atomic_data_contigs, db)
        self.progress.end()
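
The helpers this method leans on (t.atomic_data_table_structure and the get_atomic_data_dict() calls on Contig and Split instances) are defined elsewhere in the codebase and are not shown above. As a minimal, hypothetical sketch, the shapes the loops assume might look like this; the field names here are invented for illustration, not taken from the real table definition:

atomic_data_table_structure = ['contig', 'std_coverage', 'mean_coverage', 'gc_content']

class Split:
    def __init__(self, name, metrics):
        self.name = name
        self._metrics = metrics  # one value per non-key column, e.g. {'std_coverage': 4.2, ...}

    def get_atomic_data_dict(self):
        # returns one value for every non-key column of atomic_data_table_structure
        return dict(self._metrics)

class Contig(Split):
    def __init__(self, name, metrics, splits=()):
        super().__init__(name, metrics)
        self.splits = list(splits)  # Split instances walked by the inner loop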
Example 2
    def clusterize(self, parts):
        # create a 10-character random identifier for cluster jobs:
        identifier = ''.join(
            random.choice(string.ascii_uppercase) for x in range(10))

        for part in parts:
            command = self.command % {'binary': self.binary, 'part': part}

            # create sh file
            shell_script = part + '.sh'
            with open(shell_script, 'w') as f:
                f.write(QSUB_SCRIPT % {
                    'log': part + '.log',
                    'identifier': identifier,
                    'command': command
                })

            # submit script to cluster
            utils.run_command('qsub %s' % shell_script)

        while True:
            qstat_info = self.get_qstat_info(identifier)
            total_processes = sum(qstat_info.values())
            if total_processes == 0:
                break

            self.progress.update(
                'Qstat Info :: Total Jobs: %s, %s' %
                (pp(total_processes), ', '.join(
                    ['%s: %s' % (x, pp(qstat_info[x])) for x in qstat_info])))

            time.sleep(5)

        return True
Example 3
    def clusterize(self, parts):
        # create a 10-character random identifier for cluster jobs:
        identifier = ''.join(random.choice(string.ascii_uppercase) for x in range(10))

        for part in parts:
            command = self.command % {'binary': self.binary, 'part': part}

            # create sh file
            shell_script = part + '.sh'
            with open(shell_script, 'w') as f:
                f.write(QSUB_SCRIPT % {'log': part + '.log',
                                       'identifier': identifier,
                                       'command': command})

            # submit script to cluster
            utils.run_command('qsub %s' % shell_script)

        while True:
            qstat_info = self.get_qstat_info(identifier)
            total_processes = sum(qstat_info.values())
            if total_processes == 0:
                break

            self.progress.update('Qstat Info :: Total Jobs: %s, %s' % (pp(total_processes),
                       ', '.join(['%s: %s' % (x, pp(qstat_info[x])) for x in qstat_info])))

            time.sleep(5)

        return True
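
Neither variant shows get_qstat_info(). Assuming QSUB_SCRIPT submits each job under the random identifier (for instance as the job name), a minimal, hypothetical sketch of the polling side could look like the following; the real method's qstat parsing may well differ:

import subprocess
from collections import Counter

def get_qstat_info(identifier):
    # Hypothetical stand-in for self.get_qstat_info(identifier): maps qstat
    # job state to the number of jobs tagged with `identifier`, so that
    # sum(result.values()) == 0 once every submitted job has finished.
    output = subprocess.run(['qstat'], capture_output=True, text=True).stdout
    states = Counter()
    for line in output.splitlines():
        if identifier in line:
            fields = line.split()
            # a typical SGE qstat row: job-ID  prior  name  user  state  ...
            if len(fields) >= 5:
                states[fields[4]] += 1
    return states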
Example 4
    def store_atomic_data_for_contigs_and_splits(self, sample_id, contigs, db):
        self.progress.new('Storing atomic_data')

        num_contigs = pp(len(contigs))
        cur_contig = 1

        # this loop will get atomic_data information from Contig instances and store it in the db
        # at once. this was broken down into about 10 functions, but this structure seems to be the most efficient
        # although it looks crappy:
        for contig_name in contigs:
            self.progress.update("Processing contig %s of %s" %
                                 (pp(cur_contig), num_contigs))
            contig = contigs[contig_name]
            contig_atomic_data = contig.get_atomic_data_dict()

            self.atomic_data_contigs[contig.name] = {'contig': contig.name}
            for atomic_data_field in t.atomic_data_table_structure[1:]:
                self.atomic_data_contigs[contig.name][
                    atomic_data_field] = contig_atomic_data[atomic_data_field]

            # contig is done, deal with splits in it:
            for split in contig.splits:
                split_atomic_data = split.get_atomic_data_dict()
                self.atomic_data_splits[split.name] = {'contig': split.name}
                for atomic_data_field in t.atomic_data_table_structure[1:]:
                    self.atomic_data_splits[
                        split.name][atomic_data_field] = split_atomic_data[
                            atomic_data_field]

            cur_contig += 1

        self.progress.update("Generating tables ...")
        gen_atomic_data_tables_for_contigs_and_splits(self.atomic_data_splits,
                                                      self.atomic_data_contigs,
                                                      db)
        self.progress.end()
Example 5
    def split_input_file(self):
        parts = []
        next_part = 1
        part_obj = None

        if self.input_is_fasta:
            fasta = u.SequenceSource(self.input_file_path)

            while fasta.next():
                if (fasta.pos - 1) % self.num_entries_per_file == 0:
                    self.progress.update('Creating part: ~ %s' %
                                         (pp(next_part)))

                    if part_obj:
                        part_obj.close()

                    file_path = os.path.join(self.tmp_dir,
                                             'part-%08d.fa' % next_part)
                    parts.append(file_path)
                    next_part += 1
                    part_obj = open(file_path, 'w')

                part_obj.write('>%s\n' % fasta.id)
                part_obj.write('%s\n' % fasta.seq)

            if part_obj:
                part_obj.close()

        return parts
Example 6
    def split_input_file(self):
        parts = []
        next_part = 1
        part_obj = None

        if self.input_is_fasta:
            fasta = u.SequenceSource(self.input_file_path)

            while next(fasta):
                if (fasta.pos - 1) % self.num_entries_per_file == 0:
                    self.progress.update('Creating part: ~ %s' % (pp(next_part)))

                    if part_obj:
                        part_obj.close()

                    file_path = os.path.join(self.tmp_dir, 'part-%08d.fa' % next_part)
                    parts.append(file_path)
                    next_part += 1
                    part_obj = open(file_path, 'w')

                part_obj.write('>%s\n' % fasta.id)
                part_obj.write('%s\n' % fasta.seq)

            if part_obj:
                part_obj.close()

        return parts
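
Both variants depend on u.SequenceSource, which is not shown. A minimal, hypothetical stand-in that satisfies the interface split_input_file() reads (pos, id, seq, and a next() that returns False at the end of the file) could look like this; the real reader presumably streams the file rather than loading it whole, so treat this only as an interface sketch:

class SequenceSource:
    """Minimal, hypothetical stand-in for u.SequenceSource. Loads a FASTA
    file into memory and exposes the pos/id/seq attributes and next()
    protocol that split_input_file() relies on."""

    def __init__(self, path):
        entries, name, seq = [], None, []
        with open(path) as fp:
            for line in fp:
                line = line.strip()
                if line.startswith('>'):
                    if name is not None:
                        entries.append((name, ''.join(seq)))
                    name, seq = line[1:], []
                elif line:
                    seq.append(line)
        if name is not None:
            entries.append((name, ''.join(seq)))
        self._entries = entries
        self.pos = 0   # 1-based index of the current entry after next()
        self.id = None
        self.seq = None

    def next(self):
        # advance to the next entry; return False at the end of the file
        if self.pos >= len(self._entries):
            return False
        self.id, self.seq = self._entries[self.pos]
        self.pos += 1
        return True

    # alias so both fasta.next() (Example 5) and next(fasta) (Example 6) work
    __next__ = next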