def sample_specific(self, done_files=None):
    """Fragment every sample-specific FASTA file and write the fragments
    under <project_dir>/sampled_fasta/<frag_len>/.

    Each file in SAMPLE_SPECIFIC_DIR (filtered by the configured
    extensions) is mapped to a tree node and cut into fragments for every
    configured fragment length, using the matching sample-specific step.
    All sample-specific data is used (no sub-sampling).

    Args:
        done_files: optional list of already-written output files; kept
            for interface symmetry with generate_seq (currently unused
            in this method).
    """
    if done_files is None:  # avoid the shared-mutable-default pitfall
        done_files = []
    my_log = logging.getLogger('train:sample_specific')
    # check if sample specific data is provided:
    if self.config.settings["sample_specific_dir"] == "":
        my_log.debug('no sample specific data')
        return
    if utils.dir_is_empty(self.config.settings["sample_specific_dir"]):
        my_log.debug('no sample specific data')
        return
    # NOTE(review): genomes_exclude_ss is never appended to in this view,
    # so the "excluded" log below can never fire — confirm against callers.
    genomes_exclude_ss = []
    n_sequences_ss = 0
    my_log.info("Processing sample specific data (all data will be used)...")
    for e in os.listdir(self.config.settings["sample_specific_dir"]):
        # valid extension?
        if len(self.config.settings["extensions"]) != 0:
            ext = e.split(".")[-1]
            if ext == "" or ext not in self.config.settings["extensions"]:
                continue
        organism = e.split(".", 1)[0]
        if organism == "":
            my_log.warning("Invalid sample specific file: {} skipping..".format(e))
            continue
        # map organism onto the tree: either it already is a node id,
        # or it must be resolved via get_mapped_organism
        if organism in self.nodes:
            node = organism
        else:
            node = self.get_mapped_organism(organism)
            if node is None:
                my_log.info("Could not map {} on the tree".format(organism))
                continue
            elif str(node) == "1":
                # mapped to the root node: no useful taxonomic signal
                my_log.info("Skipping {} due to lack of mapping".format(organism))
                continue
        seq_concat, definition = utils.get_sequence_infos(
            os.path.join(self.config.settings["sample_specific_dir"], e))
        for i in range(len(self.config.settings["fragment_len"])):
            fl = self.config.settings["fragment_len"][i]
            try:
                step = self.config.settings["sample_specific_step"][i]
            except IndexError:
                # should not happen at all
                step = fl
            if step == 0 or step is None:
                step = fl
            if len(seq_concat) < fl:
                my_log.debug("No sample specific data for organism {o} at frag_len {fl}".format(o=organism, fl=fl))
                continue
            # floor division: write_fragments receives range(number_frags),
            # so the count must be an int ('/' yields a float on Python 3)
            number_frags = (len(seq_concat) - fl) // step
            sample_dir = os.path.join(self.config.settings["project_dir"], "sampled_fasta")
            fl_dir = os.path.join(sample_dir, str(fl))
            fastafile = os.path.join(fl_dir, e)
            utils.write_fragments(fl, step, seq_concat, definition, node, fastafile, range(number_frags))
            # TODO: re-enable the self.stat bookkeeping
            # (succesfully_written / write_backup) once it is efficient.
        n_sequences_ss += 1
    if n_sequences_ss == 0:
        my_log.error("no data processed in SAMPLE_SPECIFIC_DIR")
    if len(genomes_exclude_ss) != 0:
        my_log.info("(excluded {} genomes from SS)".format(str(len(genomes_exclude_ss))))
    my_log.info("{} SS sequences done.".format(str(n_sequences_ss)))
def generate_seq(self, done_files=None):
    """Sample fragments from the reference genome files of every tree node
    and write them as FASTA files under <project_dir>/sampled_fasta/<frag_len>/.

    Roughly n_frags_per_node fragments are distributed over each node's
    files; when there are more files than fragments, the largest files are
    preferred. Fragment selection within a file is a deterministic shuffle
    seeded by the sequence length.

    Args:
        done_files: optional list of output FASTA paths written in a
            previous run; matching outputs are skipped.
    """
    if done_files is None:  # avoid the shared-mutable-default pitfall
        done_files = []
    my_log = logging.getLogger('train:generate_seq')
    # tree_organism_map contains node as keys and organism array as values
    nodesdone = 0
    for node in self.tree_organism_map:
        orgs = self.tree_organism_map[node]
        files = []
        pairs = []  # (file size, path) — used to prefer data-rich organisms
        for o in orgs:
            if o in self.organisms_invalid:
                # invalid organism
                continue
            for f in self.organism_file_map[o]:
                files.append(f)
                pairs.append((os.path.getsize(f), f))
        if len(files) == 0:
            my_log.debug("no files for this node")
            continue
        # float() forces true division: on Python 2, int/int floors before
        # math.ceil runs, which silently under-samples each file
        n_frag_per_file = max(int(math.ceil(float(self.n_frags_per_node) / len(files))), 1)
        if n_frag_per_file == 0:
            continue  # unreachable after max(..., 1); kept as a safety net
        nodesdone += 1
        my_log.info("processing node: {lab} ({current}/{of})".format(current=str(nodesdone), lab=node, of=str(len(self.tree_organism_map.keys()))))
        my_log.debug("files for this node:{nr}\tfragments per file:{n_frag}".format(nr=str(len(files)), n_frag=str(n_frag_per_file)))
        if n_frag_per_file == 1:
            # more files than requested fragments: sort by file size so
            # organisms with the most data are sampled first, then keep
            # only n_frags_per_node files (one fragment each)
            pairs.sort(key=lambda s: s[0], reverse=True)
            pairs = pairs[:self.n_frags_per_node]
            files = [p[1] for p in pairs]
        for f in files:
            filename = os.path.basename(f)
            # read in fasta sequences and definitions from ncbi_dir/.....
            seq_concat, definition = utils.get_sequence_infos(f)
            # remove non-ACGT to get a better normalized k-mer vector
            seq_concat = utils.filter_sequence(max(self.config.settings["kmer"]), seq_concat)
            for fl in self.config.settings["fragment_len"]:
                if len(seq_concat) < fl:
                    continue
                # how many whole fragments fit into this sequence?
                n_frag = len(seq_concat) // fl
                if n_frag == 0:
                    continue
                # list() so random.shuffle works on Python 3, where range()
                # is an immutable lazy sequence
                sampled_frag = list(range(n_frag))
                if n_frag > n_frag_per_file:
                    # choose n_frag_per_file random fragments; seeding with
                    # the sequence length keeps the selection deterministic
                    random.seed(len(seq_concat))
                    random.shuffle(sampled_frag)
                    sampled_frag = sampled_frag[0:n_frag_per_file]
                sample_dir = os.path.join(self.config.settings["project_dir"], "sampled_fasta")
                fl_dir = os.path.join(sample_dir, str(fl))
                fastafile = os.path.join(fl_dir, filename)
                if fastafile in done_files:
                    continue
                utils.write_fragments(fl, fl, seq_concat, definition, node, fastafile, sampled_frag)