Example #1
0
    def sample_specific(self, done_files=None):
        """Write fragments sampled from user-provided sample-specific FASTA files.

        Scans every file in ``sample_specific_dir``, maps each file's organism
        (taken from the filename up to the first dot) onto the reference tree,
        and writes fragments for every configured fragment length to
        ``<project_dir>/sampled_fasta/<fragment_len>/<filename>``.

        Args:
            done_files: optional collection of already-finished output paths;
                kept for interface compatibility with generate_seq
                (currently not consulted in this method).
        """
        # Mutable-default fix: [] as a default is shared across calls;
        # use None and materialize a fresh list instead.
        if done_files is None:
            done_files = []
        my_log = logging.getLogger('train:sample_specific')
        # Nothing to do when no sample-specific directory is configured,
        # or the configured directory is empty.
        if self.config.settings["sample_specific_dir"] == "":
            my_log.debug('no sample specific data')
            return
        if utils.dir_is_empty(self.config.settings["sample_specific_dir"]):
            my_log.debug('no sample specific data')
            return

        genomes_exclude_ss = []
        n_sequences_ss = 0
        my_log.info("Processing sample specific data (all data will be used)...")

        for e in os.listdir(self.config.settings["sample_specific_dir"]):
            # Skip files whose extension is not in the configured whitelist
            # (an empty whitelist accepts every file).
            if len(self.config.settings["extensions"]) != 0:
                ext = e.split(".")[-1]
                if ext == "" or ext not in self.config.settings["extensions"]:
                    continue
            organism = e.split(".", 1)[0]
            if organism == "":
                my_log.warning("Invalid sample specific file: {} skipping..".format(e))
                continue

            # Map the organism onto a tree node: directly if it is already a
            # known node, otherwise through the organism->node mapping.
            if organism in self.nodes:
                node = organism
            else:
                node = self.get_mapped_organism(organism)

            if node is None:
                my_log.info("Could not map {} on the tree".format(organism))
                continue
            elif str(node) == "1":  # mapped only to the root -> useless mapping
                my_log.info("Skipping {} due to lack of mapping".format(organism))
                continue

            seq_concat, definition = utils.get_sequence_infos(os.path.join(self.config.settings["sample_specific_dir"], e))

            for i in range(len(self.config.settings["fragment_len"])):
                fl = self.config.settings["fragment_len"][i]
                try:
                    step = self.config.settings["sample_specific_step"][i]
                except IndexError:  # should not happen at all
                    step = fl

                # A missing/zero step means: non-overlapping fragments.
                if step == 0 or step is None:
                    step = fl

                if len(seq_concat) < fl:
                    my_log.debug("No sample specific data for organism {o} at frag_len {fl}".format(o=organism, fl=fl))
                    continue

                # BUGFIX: floor division keeps number_frags an int under
                # Python 3 as well ("/" would produce a float and break
                # range()); on Python 2 "//" is identical to the old "/".
                number_frags = (len(seq_concat) - fl) // step
                sample_dir = os.path.join(self.config.settings["project_dir"], "sampled_fasta")
                fl_dir = os.path.join(sample_dir, str(fl))
                fastafile = os.path.join(fl_dir, e)

                utils.write_fragments(fl, step, seq_concat, definition, node, fastafile, range(number_frags))

            n_sequences_ss += 1

        if n_sequences_ss == 0:
            my_log.error("no data processed in SAMPLE_SPECIFIC_DIR")
        if len(genomes_exclude_ss) != 0:
            my_log.info("(excluded {} genomes from SS)".format(str(len(genomes_exclude_ss))))
        my_log.info("{} SS sequences done.".format(str(n_sequences_ss)))
Example #2
0
    def generate_seq(self, done_files=None):
        """Sample FASTA fragments for every node of the organism tree.

        For each node, collects all genome files of its (valid) organisms,
        decides how many fragments to take per file so that roughly
        ``n_frags_per_node`` fragments are produced, and writes them to
        ``<project_dir>/sampled_fasta/<fragment_len>/<filename>``.

        Fragment selection is deterministic: the RNG is seeded with the
        sequence length before shuffling candidate fragment indices.

        Args:
            done_files: optional collection of output fasta paths that were
                already written in a previous run; those are skipped.
        """
        # Mutable-default fix: [] as a default is shared across calls;
        # use None and materialize a fresh list instead.
        if done_files is None:
            done_files = []
        my_log = logging.getLogger('train:generate_seq')
        # tree_organism_map: node -> list of organisms mapped onto that node
        nodesdone = 0
        for node in self.tree_organism_map:
            orgs = self.tree_organism_map[node]
            files = []
            pairs = []  # (file size, path) -- for size-based selection below
            for o in orgs:
                if o in self.organisms_invalid:
                    continue  # skip invalid organism
                for f in self.organism_file_map[o]:
                    files.append(f)
                    pairs.append((os.path.getsize(f), f))

            if len(files) == 0:
                my_log.debug("no files for this node")
                continue

            # BUGFIX: force float division so math.ceil actually rounds up;
            # with two int operands Python 2's "/" floored first, making the
            # ceil a no-op.  max(..., 1) still guarantees at least one
            # fragment per file.  (The old unreachable "== 0" branch after
            # max(..., 1) was removed.)
            n_frag_per_file = max(int(math.ceil(float(self.n_frags_per_node) / len(files))), 1)

            nodesdone += 1
            my_log.info("processing node: {lab} ({current}/{of})".format(current=str(nodesdone),
                                                                         lab=node,
                                                                         of=str(len(self.tree_organism_map))))
            my_log.debug("files for this node:{nr}\tfragments per file:{n_frag}".format(nr=str(len(files)),
                                                                                        n_frag=str(n_frag_per_file)))

            if n_frag_per_file == 1:
                # More files than requested fragments: take one fragment each
                # from the n_frags_per_node largest files, so organisms with
                # the most data are sampled first.
                pairs.sort(key=lambda s: s[0], reverse=True)
                files = [p[1] for p in pairs[:self.n_frags_per_node]]

            for f in files:
                filename = os.path.basename(f)
                # read concatenated sequence and definitions from the file
                seq_concat, definition = utils.get_sequence_infos(f)
                # remove non-ACGT characters to get a better normalized
                # k-mer vector
                seq_concat = utils.filter_sequence(max(self.config.settings["kmer"]), seq_concat)

                for fl in self.config.settings["fragment_len"]:
                    if len(seq_concat) < fl:
                        continue

                    # number of non-overlapping fragments available
                    n_frag = len(seq_concat) // fl
                    if n_frag == 0:
                        continue

                    # BUGFIX: materialize the range as a list so that
                    # random.shuffle works on Python 3 as well (shuffling a
                    # range object raises TypeError there).
                    sampled_frag = list(range(n_frag))
                    if n_frag > n_frag_per_file:
                        # choose a deterministic pseudo-random subset: seed
                        # on the sequence length, shuffle, take the head
                        random.seed(len(seq_concat))
                        random.shuffle(sampled_frag)
                        sampled_frag = sampled_frag[:n_frag_per_file]

                    sample_dir = os.path.join(self.config.settings["project_dir"], "sampled_fasta")
                    fl_dir = os.path.join(sample_dir, str(fl))
                    fastafile = os.path.join(fl_dir, filename)

                    if fastafile in done_files:
                        continue  # already produced in a previous run

                    utils.write_fragments(fl, fl, seq_concat, definition, node, fastafile, sampled_frag)