def register_rpchandler(cmd, rpchandler):
    """ register_rpchandler(cmd, rpchandler):

    Register an RPC handler that receives messages from the network.

    If an incoming TCP connection begins with a line containing the 'cmd'
    string, the handler is called as rpchandler(data, eof, socket, address),
    where 'data' is the data received so far on the socket and 'eof'
    indicates whether all data has arrived (if 'eof' == False, 'data' may be
    partial). 'socket' is the TCP socket to the remote end and 'address' is
    the network address of the remote end (ip address string, remote port).

    rpchandler() may be called several times, each time more data arrives,
    until 'eof' is True. This allows incremental processing of an incoming
    message (e.g. terminating invalid messages early).

    rpchandler() returns one of:
      RPC_MORE_DATA: the handler wants more data
      RPC_CLOSE:     the handler asks the main handler to terminate the connection
      RPC_RELEASE:   the handler takes care of the socket from now on
    """
    if rpc_commands.has_key(cmd):
        warning('Can not install RPC handler: %s already exists\n' % cmd)
        return False
    rpc_commands[cmd] = rpchandler
    return True
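# Usage sketch for register_rpchandler(). The command name 'PING' and the
# handler below are hypothetical, not part of the original code; they only
# illustrate the handler contract described in the docstring above.
def ping_handler(data, eof, sock, address):
    if not eof:
        # The message may still be partial; ask the dispatcher for more data
        return RPC_MORE_DATA
    if not data.startswith('PING'):
        # Malformed message: ask the main handler to drop the connection
        return RPC_CLOSE
    sock.send('PONG\n')
    return RPC_CLOSE

register_rpchandler('PING', ping_handler)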
def get_expiring_file(self, dt=None, rel=None):
    """ Create a temp file that expires (is deleted) at a given time.

    The temp file is stored under the user's proximate directory. The actual
    deletion time is not very accurate.

    dt is a point in time, an instance of datetime.datetime. If dt == None,
    it is assumed to be now.

    rel is an instance of datetime.timedelta, a relative delay added to dt.
    If rel == None, it is assumed to be zero.

    Hint: use scheduler.DAY and scheduler.SECOND to specify relative times.
    """
    assert(dt == None or isinstance(dt, datetime))
    assert(rel == None or isinstance(rel, timedelta))
    if dt == None:
        dt = datetime.now()
    if rel != None:
        dt = dt + rel

    # ISO date: YYYY-MM-DD-s, where s is the number of seconds into the day
    isodate = str(dt.date())
    seconds = str(dt.hour * 3600 + dt.minute * 60 + dt.second)
    prefix = '%s-%s-%s-' % (self.EXPIRE_PREFIX, isodate, seconds)
    directory = self.community.get_user_dir()
    try:
        (fd, fname) = mkstemp(prefix=prefix, dir=directory)
    except OSError:
        warning('expiring_file: mkstemp() failed\n')
        return None
    xclose(fd)
    return fname
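# Usage sketch for get_expiring_file(), assuming an object named 'scheduler'
# that provides the method above and the scheduler.DAY timedelta constant
# mentioned in the docstring: create a scratch file that is deleted roughly
# one day from now.
fname = scheduler.get_expiring_file(rel=scheduler.DAY)
if fname != None:
    f = open(fname, 'w')
    f.write('temporary payload\n')
    f.close()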
def rna_k_values(support, dataset_data, log):
    rna_rl = support.get_reads_length(dataset_data, log, ["merged reads"])
    upper_k = int(rna_rl / 2) - 1
    if upper_k % 2 == 0:
        upper_k -= 1
    lower_k = min(max(int(rna_rl / 3), options_storage.RNA_MIN_K), options_storage.RNA_MAX_LOWER_K)
    if lower_k % 2 == 0:
        lower_k -= 1

    use_iterative = True
    if upper_k <= lower_k:
        use_iterative = False

    if upper_k < options_storage.RNA_MIN_K:
        support.warning("\nauto K value (%d) is too small, recommended to be at least %d.\n"
                        % (upper_k, options_storage.RNA_MIN_K))
        if rna_rl <= options_storage.RNA_MIN_K:
            support.warning("read length is too small (%d), but keeping current K value anyway. "
                            "Consider setting K manually.\n" % (rna_rl))
        else:
            upper_k = options_storage.RNA_MIN_K
            log.info("Upper K value is set to %d.\n" % (upper_k))

    if upper_k > options_storage.MAX_K:
        log.info("\nAuto K value (%d) is too large, all K values should not exceed %d. Setting k=%d.\n"
                 % (upper_k, options_storage.MAX_K, options_storage.MAX_K))
        upper_k = options_storage.MAX_K

    if not use_iterative:
        return [upper_k]
    return [lower_k, upper_k]
def gen_key_pair_priv_cb(data, ctx):
    if not data:
        warning('keymanagement: Could not generate a key pair\n')
        cb(None, None)
    else:
        xrun([self.sslname, 'rsa', '-pubout'], gen_key_pair_pub_cb, data, data)
def send_lowlevel(self, user, data):
    ip = user.get('ip')
    port = user.get('port')
    if ip == None or port == None:
        warning('fetcher: No ip/port to open %s\n' % (user.tag()))
        return
    send_broadcast(ip, port, data)
def add_msg(self, msg, set_head=False): parentid = msg.get_parentid() msgid = msg.get_msgid() if msgid in self.msgdict: warning('add_msg(): Attempted to add same message twice\n') return # add to msgdict self.msgdict[msgid] = msg # update children-list of parent if parentid != '': # Create a new list of children if it does not exist parent_children = self.childdict.setdefault(parentid, []) parent_children.append(msgid) has_parent = self.msgdict.has_key(parentid) children = self.childdict.get(msgid, []) # if parent of this node is not in msgdict, this is new root node if not has_parent: self.roots.append(msgid) # join trees by removing roots of child trees for childid in children: self.roots.remove(childid) if set_head: self.headid = msgid
def got_community_profiles(self, user, reply, ctx):
    if reply == None:
        return
    validator = {'cname': [ZERO_OR_MORE, str],
                 'profile': [ZERO_OR_MORE, {}],
                }
    if not validate(validator, reply):
        warning('Invalid community profiles reply: %s\n' % str(reply))
        return
    communities = self.get_user_communities(user)
    for (cname, profile) in zip(reply['cname'], reply['profile']):
        if cname == DEFAULT_COMMUNITY_NAME:
            continue
        com = self.get_ordinary_community(cname)
        if com in communities:
            self.update_community_profile(com, user, profile)
            communities.remove(com)
    # Do icon requests for the rest of the communities
    for com in communities:
        if com.get('name') != DEFAULT_COMMUNITY_NAME:
            self.request_com_icon(user, com)
def save_community_icon(com, icon):
    # Personal communities can have arbitrarily large icons because the
    # picture is not sent over the network
    if com.get('peer') and len(icon) > TP_MAX_FACE_SIZE:
        warning('Community %s has too large icon picture: %d\n' % (com.get('name'), len(icon)))
        return False
    return save_image(get_community_icon_name(com, legacyname=False), icon)
def xmkdir(dirname, mode = 0700):
    try:
        mkdir(dirname, mode)
    except OSError, (errno, strerror):
        if errno != EEXIST:
            warning('Can not create a directory: %s\n' % (dirname))
            return False
    # Directory exists or was created
    return True
def check_dir_is_empty(dir_name):
    if dir_name is not None and \
            os.path.exists(dir_name) and \
            os.listdir(dir_name):
        support.warning("output dir is not empty! Please clean the output directory before running.")
def init(): """ Bind a default and a random port. The random port is used for local network communication. The default port is used to establish remote connections. """ global community community = get_plugin_by_type(PLUGIN_TYPE_COMMUNITY) create_tcp_listener(DEFAULT_PROXIMATE_PORT, tcp_listener_accept, reuse=True) success = False for i in xrange(PORT_RETRIES): port = community.get_rpc_port() if port == DEFAULT_PROXIMATE_PORT: continue (rfd, tag) = create_tcp_listener(port, tcp_listener_accept, reuse=True) if rfd != None: info('Listening to TCP connections on port %d\n' %(port)) success = True break warning('Can not bind to TCP port %d\n' %(port)) # Generate a new port number so that next iteration will not fail if not community.gen_port(): break if not success: warning('Can not listen to TCP connections\n')
def handle_rpc_message(self, data, eof):
    if len(data) == 0:
        self.close()
        return False

    cmd = data[0:TP_MAX_CMD_NAME_LEN].split('\n')[0]
    rpchandler = rpc_commands.get(cmd)
    if rpchandler == None:
        self.close()
        return False

    payload = data[(len(cmd) + 1):]
    status = rpchandler(cmd, payload, eof, self.sock, self.address)

    ret = False
    if status == RPC_MORE_DATA:
        ret = True
    elif status == RPC_CLOSE:
        self.close()
    elif status == RPC_RELEASE:
        # We are not interested in gobject events anymore
        self.remove_io_notifications()
    else:
        self.close()
        warning('Unknown RPC value: %s\n' % (str(status)))
    return ret
def chat_cb(self, widget):
    uid = self.msg.get('src')
    user = community.get_user(uid)
    if user == community.get_myself():
        warning('Trying to chat with yourself\n')
        return None
    chat.messaging_gui.start_messaging(user, False)
def read_communities(): global communities if proximatedir == None: warning('No Proximate directory\n') return # Read community meta datas for dentry in os.listdir(proximatedir): if not dentry.startswith('c_'): continue if str_to_int(dentry[2:], None) == None: continue cdir = '%s/%s' %(proximatedir, dentry) if not os.path.isdir(cdir): continue cfile = '%s/profile' %(cdir) community = Community() try: f = open(cfile, 'r') except IOError: continue profile = f.read() f.close() if community.read_profile(profile): communities[community.get('cid')] = community defcom = get_ordinary_community(DEFAULT_COMMUNITY_NAME) if defcom == None: create_community(DEFAULT_COMMUNITY_NAME)
def add_or_update_user(self, uid, updatelist, profileversion, ip, port, profile=None): user = get_user(uid) newuser = (user == None) if newuser: user = create_user(uid) if not user: warning('community: Unable to create a new user %s\n' % uid) return if ip != None: user.set('ip', ip) user.set('port', port) if newuser or user.get('v') != profileversion: user.update_attributes(updatelist, user.get('v')) if profile != None: self.got_user_profile(user, profile, None) elif not user.inprogress: debug('Fetching new profile from user %s\n' % user.tag()) request = {'t': 'uprofile'} if self.fetcher.fetch(user, PLUGIN_TYPE_COMMUNITY, request, self.got_user_profile): user.inprogress = True elif not user.present and not user.inprogress: # User appears and user profile is already up-to-date self.request_user_icon(user) self.fetch_community_profiles(user) if user.update_presence(True): self.announce_user(user)
def connect(self):
    ip = self.user.get('ip')
    port = self.user.get('port')
    if not community.get_network_state(community.IP_NETWORK):
        # Act as if we were missing the IP network
        warning('fetcher: IP network disabled\n')
        ip = None
    if ip == None or port == None:
        warning('fetcher: No ip/port to open %s\n' % (self.user.tag()))
        return False
    debug('fetcher: open from %s: %s:%s\n' % (self.user.tag(), ip, port))
    if self.openingconnection == False or self.q.connect((ip, port), TP_CONNECT_TIMEOUT) == False:
        return False
    # The first write is seen by the opposite side's RPC handler, not TCP_Queue
    prefix = '%s\n' % (TP_FETCH_RECORDS)
    self.q.write(prefix, writelength=False)
    self.q.write(fetcher.encode(firstmsg, -1, ''))
    # Close a queue that is idle for a period of time. This is also the
    # maximum processing time for pending requests. Requests taking
    # longer than this must use other state tracking mechanisms.
    self.q.set_timeout(TP_FETCH_TIMEOUT)
    return True
def __init__(self): DBusGMainLoop(set_as_default=True) self.bus = SystemBus() self.sessionbus = SessionBus() try: self.mce = self.bus.get_object("com.nokia.mce", "/com/nokia/mce") except DBusException: warning("Nokia MCE not found. Vibra is disabled\n") return self.profiled = self.sessionbus.get_object("com.nokia.profiled", "/com/nokia/profiled") self.sessionbus.add_signal_receiver( self.profile_changed_handler, "profile_changed", "com.nokia.profiled", "com.nokia.profiled", "/com/nokia/profiled", ) profile = self.profiled.get_profile(dbus_interface="com.nokia.profiled") self.get_vibra_enabled(profile) self.register_plugin(PLUGIN_TYPE_VIBRA)
def xrun(cmd, cb, ctx, inputdata=None):
    """ Run 'cmd' (a list of command line arguments). Call cb(data, ctx)
    when the command finishes, with its output passed to cb() in the
    'data' parameter. If 'inputdata' is given, feed it to the command
    from stdin. The result is relayed through the gobject mainloop. """

    (rfd, wfd) = xpipe()
    if rfd < 0:
        return False

    pid = xfork()
    if pid == -1:
        warning('Could not fork a new process\n')
        return False

    if pid != 0:
        xclose(wfd)
        return xrunwatch(rfd, cb, ctx, pid)

    xclose(rfd)
    w = fdopen(wfd, 'w')

    try:
        pipe = Popen(cmd, stdout=PIPE, stdin=PIPE)
    except OSError:
        warning('Unable to run %s\n' % (' '.join(cmd)))
        w.write('-1\0')
        abort()

    w.write(str(pipe.pid) + '\0')
    w.flush()

    if inputdata:
        try:
            pipein = pipe.stdin
            pipein.write(inputdata)
        except IOError:
            warning("IOError while writing to command %s\n" % (' '.join(cmd)))
            abort()
        pipein.close()

    try:
        pipeout = pipe.stdout
        result = pipeout.read()
    except IOError:
        warning("IOError while reading from command %s\n" % (' '.join(cmd)))
        abort()
    pipeout.close()

    pipe.wait()
    if pipe.returncode != 0:
        warning('%s did not exit cleanly\n' % (' '.join(cmd)))
        abort()

    w.write(result)
    w.close()
    abort()
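# Usage sketch for xrun(): run a command asynchronously and receive its
# output through the gobject mainloop. The callback and context value below
# are hypothetical; on failure the callback receives None as 'data'.
def got_listing(data, ctx):
    if data == None:
        warning('Listing %s failed\n' % ctx)
        return
    info('Listing of %s is %d bytes long\n' % (ctx, len(data)))

xrun(['ls', '-l', '/tmp'], got_listing, '/tmp')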
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one): data_dir = os.path.join(cfg.output_dir, "K%d" % K) stage = BASE_STAGE saves_dir = os.path.join(data_dir, 'saves') dst_configs = os.path.join(data_dir, "configs") cfg_file_name = os.path.join(dst_configs, "config.info") if options_storage.continue_mode: if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))): log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)") return if options_storage.restart_from and options_storage.restart_from.find(":") != -1: stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:] support.continue_from_here(log) if stage != BASE_STAGE: if not os.path.isdir(saves_dir): support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir)) else: if os.path.exists(data_dir): shutil.rmtree(data_dir) os.makedirs(data_dir) shutil.copytree(os.path.join(configs_dir, "debruijn"), dst_configs) # removing template configs for root, dirs, files in os.walk(dst_configs): for cfg_file in files: cfg_file = os.path.join(root, cfg_file) if cfg_file.endswith('.info.template'): if os.path.isfile(cfg_file.split('.template')[0]): os.remove(cfg_file) else: os.rename(cfg_file, cfg_file.split('.template')[0]) log.info("\n== Running assembler: " + ("K%d" % K) + "\n") if prev_K: additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta") if not os.path.isfile(additional_contigs_fname): support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log) additional_contigs_fname = None else: additional_contigs_fname = None if "read_buffer_size" in cfg.__dict__: construction_cfg_file_name = os.path.join(dst_configs, "construction.info") process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log) prepare_config_spades(cfg_file_name, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one) command = [os.path.join(execution_home, "spades"), cfg_file_name] ## this code makes sense for src/debruijn/simplification.cpp: corrected_and_save_reads() function which is not used now # bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads") # if os.path.isdir(bin_reads_dir): # if glob.glob(os.path.join(bin_reads_dir, "*_cor*")): # for cor_filename in glob.glob(os.path.join(bin_reads_dir, "*_cor*")): # cor_index = cor_filename.rfind("_cor") # new_bin_filename = cor_filename[:cor_index] + cor_filename[cor_index + 4:] # shutil.move(cor_filename, new_bin_filename) support.sys_call(command, log)
def create_user_communities(user):
    for cname in user.get('communities'):
        if not valid_community(cname):
            warning('Invalid community: %s\n' % (cname))
            continue
        if get_ordinary_community(cname) == None:
            community = create_community(cname)
            save_communities([community])
def tcp_listener_accept(rfd, conditions): try: (sock, address) = rfd.accept() except socket.error, (errno, strerror): ret = (errno == EAGAIN or errno == EINTR) if not ret: warning('Listener: Socket error (%s): %s\n' % (errno, strerror)) return ret
def handle_message(self, user, sm):
    """ Handle messages found in other users' fileshares """
    if not self.validate_message(sm):
        sm["ttl"] = 0
        warning("msgboard: Invalid message: %s\n" % str(sm))
        return
    warning("New message: %s\n" % sm["subject"])
def udp_listener_read(self, rfd, condition): try: data, address = rfd.recvfrom(2048) except socket.error, (errno, strerror): ret = (errno == EAGAIN or errno == EINTR) if not ret: warning('Socket error (%s): %s\n' % (errno, strerror)) return ret
def sym_enc(self, plain, passphrase):
    """ Encrypt a message with AES using the given passphrase """
    ciph = xsystem([self.sslname, self.symmetric, '-e', '-pass', 'stdin'],
                   passphrase + '\n' + plain)
    if not ciph:
        warning('keymanagement: Unable to perform symmetric encryption\n')
        return None
    return ciph
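# Usage sketch for sym_enc(), assuming a key management object named 'km'
# that provides the method above. The sym_dec() counterpart mentioned in the
# last comment is hypothetical.
ciphertext = km.sym_enc('hello world', 'correct horse battery staple')
if ciphertext == None:
    warning('keymanagement: encryption failed\n')
# plain = km.sym_dec(ciphertext, 'correct horse battery staple')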
def find_significant_features(self): """ Finds features in the face, gets their scores and filters the least significant ones. In the end, self.s_features will have a list of significant feature names, ordered from high to low. """ # Load the heat map and blend with user image self._load_heat_map() self._create_blended() # Find signiicant features feature_coords = dict() for feature in FEATURES_CASCADES: feature_coords[feature] = self._find_feature( feature, FEATURES_CASCADES[feature]) if not len(feature_coords): self.err_code = 6 self.err_msg = "No significant features found." warning(self.err_msg) # Find eyebrows and forehead if eyes were found if len(feature_coords['eyes']) == 2: eyebrows = self._find_eyebrows(feature_coords['eyes']) # If found, find forehead and adjust eyes if len(eyebrows) == 2: x_eb_l, y_eb_l, w_eb_l, h_eb_l = eyebrows[0] x_eb_r, y_eb_r, w_eb_r, h_eb_r = eyebrows[1] y_eb = min(y_eb_l, y_eb_r) x_eb = x_eb_l w_eb = x_eb_r + w_eb_r - x_eb_l # Forehead feature_coords['forehead'] = [(x_eb, 0, w_eb, y_eb)] # Subtract half of eyebrows height from eyes x_e_l, y_e_l, w_e_l, h_e_l = feature_coords['eyes'][0] x_e_r, y_e_r, w_e_r, h_e_r = feature_coords['eyes'][1] h_e_l = y_e_l + h_e_l - int(y_eb_l + h_eb_l / 2) h_e_r = y_e_r + h_e_r - int(y_eb_r + h_eb_r / 2) y_e_l = int(y_eb_l + h_eb_l / 2) y_e_r = int(y_eb_r + h_eb_r / 2) feature_coords['eyes'] = [(x_e_l, y_e_l, w_e_l, h_e_l), (x_e_r, y_e_r, w_e_r, h_e_r)] feature_coords['eyebrows'] = eyebrows for feature in feature_coords: coords = feature_coords[feature] # Feature not found if not len(coords): continue debug("Feature coords: {}".format(coords)) self.s_features.append((feature, self._get_feature_score(coords))) # Sort features from highest to lowest self.s_features.sort(reverse=True, key=lambda f: f[1][-1]) debug("Found features before filtering: {}".format(self.s_features)) # Filter out un-distinct features and leave only names self.s_features = [ feature[0] for feature in self.s_features if self._is_distinct_feature(feature[1]) ]
def open_to(ptype, l, fullname):
    (exe, f) = find_exe(l)
    if exe == None:
        warning('Can not open a %s\n' % (ptype))
        return False
    warning('Open %s to %s %s\n' % (fullname, ptype, exe))
    args = f(fullname)
    os.spawnlp(os.P_NOWAIT, exe, exe, *args)
    return True
def read(self, fd, condition, this): try: chunk = self.sock.recv(4096) except socket.error, (errno, strerror): ret = (errno == EAGAIN or errno == EINTR) if not ret: warning('Listener: Read error (%s): %s\n' %(errno, strerror)) self.close() return ret
def encode(self, msg, rid, rt):
    if type(msg) != dict:
        warning('fetcher: message must be a dictionary: %s\n' % (str(msg)), printstack=True)
        return None
    msg.setdefault('v', 0)
    msg.setdefault('t', '')
    msg['rid'] = rid
    msg['rt'] = rt
    return bencode(msg)
def watch(self, fd, condition): try: bytes = read(fd, 4096) except OSError, (errno, strerror): if errno == EAGAIN or errno == EINTR: return True warning('xrun: Surprising error code: %d %s\n' %(errno, strerror)) self.finish(None, True) return False
def handle_community_profile_fetch(self, user, request):
    cname = request.get('cname')
    if type(cname) != str:
        warning('Invalid community profile fetch: %s\n' % str(request))
        return None
    community = self.get_ordinary_community(cname)
    if community == None:
        return None
    return {'cprofile': community.serialize()}
def read_users(): if proximatedir == None: warning('No Proximate directory\n') return for dentry in os.listdir(proximatedir): uid = parse_user_dentry(dentry) if uid != None: read_user_profile(uid)
def save_image(fname, image): if fname == None: return False basename = os.path.basename(fname) tmpname = fname + '.tmp' try: f = open(tmpname, 'w') except IOError, (errno, strerror): warning('Can not save face to %s: %s\n' %(tmpname, strerror)) return False
def create_udp_listener(self):
    port = community.get_rpc_port()
    info('fetcher: Listening to UDP port %d\n' % port)
    rfd = create_udp_socket('', port, False, reuse = True)
    if rfd == None:
        warning('Can not listen to UDP broadcasts\n')
        return
    rfd.setblocking(False)
    io_add_watch(rfd, IO_IN, self.udp_listener_read)
def move_dataset_files(dataset_data, dst, ext_python_modules_home, max_threads, log, gzip=False): to_compress = [] for reads_library in dataset_data: for key, value in reads_library.items(): if key.endswith('reads'): moved_reads_files = [] for reads_file in value: dst_filename = os.path.join(dst, os.path.basename(reads_file)) # TODO: fix problem with files with the same basenames in Hammer binary! if not os.path.isfile(reads_file): if (not gzip and os.path.isfile(dst_filename)) or ( gzip and os.path.isfile(dst_filename + '.gz')): support.warning( 'file with corrected reads (' + reads_file + ') is the same in several libraries', log) if gzip: dst_filename += '.gz' else: support.error( 'something went wrong and file with corrected reads (' + reads_file + ') is missing!', log) else: shutil.move(reads_file, dst_filename) if gzip: to_compress.append(dst_filename) dst_filename += '.gz' moved_reads_files.append(dst_filename) reads_library[key] = moved_reads_files if len(to_compress): pigz_path = support.which('pigz') if pigz_path: for reads_file in to_compress: support.sys_call([ pigz_path, '-f', '-7', '-p', str(max_threads), reads_file ], log) else: addsitedir(ext_python_modules_home) if sys.version.startswith('2.'): from joblib2 import Parallel, delayed elif sys.version.startswith('3.'): from joblib3 import Parallel, delayed n_jobs = min(len(to_compress), max_threads) outputs = Parallel(n_jobs=n_jobs)( delayed(support.sys_call)(['gzip', '-f', '-7', reads_file]) for reads_file in to_compress) for output in outputs: if output: log.info(output)
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one): data_dir = os.path.join(cfg.output_dir, "K%d" % K) stage = BASE_STAGE saves_dir = os.path.join(data_dir, 'saves') dst_configs = os.path.join(data_dir, "configs") if options_storage.continue_mode: if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))): log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)") return if options_storage.restart_from and options_storage.restart_from.find(":") != -1 \ and options_storage.restart_from.startswith("k%d:" % K): stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:] support.continue_from_here(log) if stage != BASE_STAGE: if not os.path.isdir(saves_dir): support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir)) else: if os.path.exists(data_dir): shutil.rmtree(data_dir) os.makedirs(data_dir) dir_util._path_created = {} # see http://stackoverflow.com/questions/9160227/dir-util-copy-tree-fails-after-shutil-rmtree dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False) log.info("\n== Running assembler: " + ("K%d" % K) + "\n") if prev_K: additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta") if not os.path.isfile(additional_contigs_fname): support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log) additional_contigs_fname = None else: additional_contigs_fname = None if "read_buffer_size" in cfg.__dict__: #FIXME why here??? process_cfg.substitute_params(os.path.join(dst_configs, "construction.info"), {"read_buffer_size": cfg.read_buffer_size}, log) if "scaffolding_mode" in cfg.__dict__: #FIXME why here??? process_cfg.substitute_params(os.path.join(dst_configs, "pe_params.info"), {"scaffolding_mode": cfg.scaffolding_mode}, log) prepare_config_rnaspades(os.path.join(dst_configs, "rna_mode.info"), log) prepare_config_construction(os.path.join(dst_configs, "construction.info"), log) cfg_fn = os.path.join(dst_configs, "config.info") prepare_config_spades(cfg_fn, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home) command = [os.path.join(execution_home, "spades-core"), cfg_fn] add_configs(command, dst_configs) #print("Calling: " + " ".join(command)) support.sys_call(command, log)
def update_k_mers_in_special_cases(cur_k_mers, RL, log, silent=False): if options_storage.auto_K_allowed(): if RL >= 250: if not silent: support.warning("Default k-mer sizes were set to %s because estimated " "read length (%d) is equal to or greater than 250" % (str(options_storage.K_MERS_250), RL), log) return options_storage.K_MERS_250 if RL >= 150: if not silent: support.warning("Default k-mer sizes were set to %s because estimated " "read length (%d) is equal to or greater than 150" % (str(options_storage.K_MERS_150), RL), log) return options_storage.K_MERS_150 return cur_k_mers
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one): data_dir = os.path.join(cfg.output_dir, "K%d" % K) stage = BASE_STAGE saves_dir = os.path.join(data_dir, 'saves') dst_configs = os.path.join(data_dir, "configs") cfg_file_name = os.path.join(dst_configs, "config.info") if options_storage.continue_mode: if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))): log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)") return if options_storage.restart_from and options_storage.restart_from.find(":") != -1: stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:] support.continue_from_here(log) if stage != BASE_STAGE: if not os.path.isdir(saves_dir): support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir)) else: if os.path.exists(data_dir): shutil.rmtree(data_dir) os.makedirs(data_dir) dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False) # removing template configs for root, dirs, files in os.walk(dst_configs): for cfg_file in files: cfg_file = os.path.join(root, cfg_file) if cfg_file.endswith('.info.template'): if os.path.isfile(cfg_file.split('.template')[0]): os.remove(cfg_file) else: os.rename(cfg_file, cfg_file.split('.template')[0]) log.info("\n== Running assembler: " + ("K%d" % K) + "\n") if prev_K: additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta") if not os.path.isfile(additional_contigs_fname): support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log) additional_contigs_fname = None else: additional_contigs_fname = None if "read_buffer_size" in cfg.__dict__: construction_cfg_file_name = os.path.join(dst_configs, "construction.info") process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log) prepare_config_spades(cfg_file_name, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home) command = [os.path.join(execution_home, "spades"), cfg_file_name] support.sys_call(command, log)
def update_k_mers_in_special_cases(cur_k_mers, RL, log):
    if not options_storage.k_mers and not options_storage.single_cell:
        # k-mer sizes were left at their defaults and this is not a single-cell run
        if RL >= 250:
            support.warning("Default k-mer sizes were set to %s because estimated "
                            "read length (%d) is equal to or greater than 250" % (str(options_storage.k_mers_250), RL), log)
            return options_storage.k_mers_250
        if RL >= 150:
            support.warning("Default k-mer sizes were set to %s because estimated "
                            "read length (%d) is equal to or greater than 150" % (str(options_storage.k_mers_150), RL), log)
            return options_storage.k_mers_150
    return cur_k_mers
def from_file(self, input_filename): with open(input_filename) as f: line_number = 0 for line in f: line = line.strip() pos = line.find('#') if pos != -1: line = line[:pos] if line == '': continue if len(line.split(':')) > 2: support.warning( "In parameters file in line number " + str(line_number) + " two ':'. May be error? Everything after second ':' was ignored.") identity = line.split(':')[0].strip().lower() value = line.split(':')[1].strip() if identity == 'output directory': self.output_dir = value elif identity == 'resume from': self.resume_dir = value if value.lower() != 'none' else None elif identity == 'only models': if value.lower() == 'true': self.only_models = True elif value.lower() == 'false': self.only_models = False else: self.only_models = None elif identity == 'input file': self.input_file = value elif identity == 'population labels': self.pop_labels = value if value.lower() != 'none' else None elif identity == 'projections': self.ns = value if value.lower() != 'none' else None elif identity == 'theta0': self.theta = float( value) if value.lower() != 'none' else None elif identity == 'time for generation': self.gen_time = float( value) if value.lower() != 'none' else None elif identity == 'multinom': self.multinom = True if value.lower() == 'true' else False elif identity == 'initial structure': self.initial_structure = value elif identity == 'final structure': self.final_structure = value elif identity == 'relative parameters': self.relative_params = value.lower() == 'true' elif identity == 'no migrations': self.no_mig = value.lower() == 'true' elif identity == 'size of population in ga': self.size_of_generation = int(value) elif identity == 'fractions in ga': self.fracs = value elif identity == 'mean mutation strength': self.mutation_strength = float(value) elif identity == 'mean mutation rate': self.mutation_rate = float(value) elif identity == 'const for mutation rate': self.const_for_mut_rate = float(value) elif identity == 'const for mutation strength': self.const_for_mut_strength = float(value) elif identity == 'epsilon': self.epsilon = float(value) elif identity == 'stop iteration': self.stop_iter = int(value) elif identity == 'pts': self.dadi_pts = value if value.lower() != 'none' else None elif identity == 'use moments or dadi': if value == 'moments': self.moments_scenario = True else: self.moments_scenario = False elif identity == 'draw models every n iteration': self.draw_iter = int(value) elif identity == "print models' code every n iteration": self.code_iter = int(value) elif identity == 'units of time in drawing': if value.lower() == 'years': self.gen_time_units = 1 elif value.lower() == 'kya' or value.lower() == 'thousand years': self.gen_time_units = 1000 else: support.warning( 'Cannot understand units of time in line ' +\ str(line_number) + ' in parameters file. 
Years were taken.') elif identity == 'silence': self.silence = value.lower() == 'true' elif identity == 'number of repeats': self.repeats = int(value) elif identity == 'number of processes': self.processes = int(value) elif identity == 'upper bound of first split': self.split_1_lim = float( value) if value.lower() != 'none' else None elif identity == 'upper bound of second split': self.split_2_lim = float( value) if value.lower() != 'none' else None elif identity == 'name of local optimization': self.optimize_name = value if value.lower() != 'none' else None names = [ 'optimize', 'optimize_log', 'optimize_powell', 'optimize_lbfgsb', 'optimize_log_lbfgsb', 'optimize_log_fmin', 'hill_climbing'] if value not in names: support.error( "Can't parse name of local search. Acceptable names are: " + ', '.join(names)) else: pass # now extra args elif identity == 'min_n': self.min_N = float(value) elif identity == 'max_n': self.max_N = float(value) elif identity == 'min_t': self.min_T = float(value) elif identity == 'max_t': self.max_T = float(value) elif identity == 'min_m': self.min_M = float(value) elif identity == 'max_m': self.max_M = float(value) elif identity == 'verbose': self.ls_verbose = None if value.lower() == 'none' else int(value) elif identity == 'flush delay': self.ls_flush_delay = float(value) elif identity == 'epsilon for ls': self.ls_epsilon = float(value) elif identity == 'gtol': self.ls_gtol = float(value) elif identity == 'maxiter': self.ls_maxiter = None if value.lower() == 'none' else int(value) elif identity == 'mean mutation rate for hc': self.hc_mutation_rate = None if value.lower() == 'none' else float(value) elif identity == 'const for mutation rate for hc': self.hc_const_for_mutation_rate = None if value.lower() == 'none' else float(value) elif identity == 'stop iteration for hc': self.hc_stop_iter = None if value.lower() == 'none' else float(value) elif identity == 'multinomial mutation': self.multinom_mutation = value.lower() == 'true' elif identity == 'multinomial crossing': self.multinom_mutation = value.lower() == 'true' elif identity == 'random n_a': self.random_N_A = value.lower == 'true' elif identity == 'time to print summary': self.time_for_print = float(value) elif identity == 'distribution': self.distribution = value.lower() elif identity == 'std': self.std = None if value.lower() == 'none' else float(value) elif identity == 'only sudden': self.only_sudden = value.lower() == 'true' elif identity == 'custom filename': self.model_func_file = value if value.lower() != 'none' else None elif identity == 'lower bounds': self.lower_bound = value if value.lower() != 'none' else None elif identity == 'upper bounds': self.upper_bound = value if value.lower() != 'none' else None elif identity == 'parameter identifiers': self.p_ids = value if value.lower() != 'none' else None elif identity == "linked snp's" or identity == "linked snp": self.linked_snp = value.lower() == 'true' elif identity == "unlinked snp's" or identity == "unlinked snp": self.linked_snp = value.lower() == 'false' elif identity == 'directory with bootstrap' or identity == 'directory of bootstrap': self.boot_dir = value if value.lower() != 'none' else None else: support.error( 'Cannot recognize identifier: ' + str(line.split(':')[0].strip())) line_number += 1
def run_spades(configs_dir, execution_home, cfg, log): if not isinstance(cfg.iterative_K, list): cfg.iterative_K = [cfg.iterative_K] cfg.iterative_K = sorted(cfg.iterative_K) bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads") if os.path.isdir(bin_reads_dir) and not options_storage.continue_mode: shutil.rmtree(bin_reads_dir) if len(cfg.iterative_K) == 1: run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], False, True) K = cfg.iterative_K[0] else: run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], False, False) RL = get_read_length(cfg.output_dir, cfg.iterative_K[0]) cfg.iterative_K = update_k_mers_in_special_cases( cfg.iterative_K, RL, log) if cfg.iterative_K[1] + 1 > RL: if cfg.paired_mode: support.warning( "Second value of iterative K (%d) exceeded estimated read length (%d). " "Rerunning in paired mode for the first value of K (%d)" % (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log) run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], False, True) K = cfg.iterative_K[0] else: rest_of_iterative_K = cfg.iterative_K rest_of_iterative_K.pop(0) count = 0 for K in rest_of_iterative_K: count += 1 last_one = count == len( cfg.iterative_K) or (rest_of_iterative_K[count] + 1 > RL) run_iteration(configs_dir, execution_home, cfg, log, K, True, last_one) if last_one: break if count < len(cfg.iterative_K): support.warning( "Iterations stopped. Value of K (%d) exceeded estimated read length (%d)" % (cfg.iterative_K[count], RL), log) latest = os.path.join(cfg.output_dir, "K%d" % K) if os.path.isfile(os.path.join(latest, "before_rr.fasta")): if not os.path.isfile( os.path.join( os.path.dirname(cfg.result_contigs), "before_rr.fasta")) or not options_storage.continue_mode: shutil.copyfile( os.path.join(latest, "before_rr.fasta"), os.path.join(os.path.dirname(cfg.result_contigs), "before_rr.fasta")) if os.path.isfile(os.path.join(latest, "final_contigs.fasta")): if not os.path.isfile( cfg.result_contigs) or not options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "final_contigs.fasta"), cfg.result_contigs) if cfg.paired_mode: if os.path.isfile(os.path.join(latest, "scaffolds.fasta")): if not os.path.isfile( cfg.result_scaffolds) or not options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "scaffolds.fasta"), cfg.result_scaffolds) if cfg.developer_mode: # before repeat resolver contigs # before_RR_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "simplified_contigs.fasta") # shutil.copyfile(os.path.join(latest, "simplified_contigs.fasta"), before_RR_contigs) # saves saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves") if os.path.lexists( saves_link ): # exists return False for broken link! lexists return True os.remove(saves_link) os.symlink(os.path.join(latest, "saves"), saves_link) # os.remove(cfg.additional_contigs) if os.path.isdir(bin_reads_dir): shutil.rmtree(bin_reads_dir) return latest
if make_latest_symlink: latest_symlink = 'latest' if os.path.islink(latest_symlink): os.remove(latest_symlink) os.symlink(output_dir, latest_symlink) datasets_dict = dict() print("Analyzing datasets") for dataset in datasets: try: dataset_data = pyyaml.load(file(dataset, 'r')) except pyyaml.YAMLError, exc: support.warning('skipping ' + dataset + ': exception caught while parsing YAML file (' + options_storage.dataset_yaml_filename + '):\n' + str(exc)) continue dataset_data = support.correct_dataset(dataset_data) for id, library in enumerate(dataset_data): print("processing lib#" + str(id) + " of " + dataset) basename = os.path.splitext(os.path.basename(dataset))[0] cur_key = basename i = 1 while datasets_dict.has_key(cur_key): cur_key = basename + "_" + str(i) cur_reads = [] for key, value in library.items(): if key.endswith('reads'):
def main(args): os.environ["LC_ALL"] = "C" if len(args) == 1: options_storage.usage(spades_version) sys.exit(0) log = logging.getLogger('spades') log.setLevel(logging.DEBUG) console = logging.StreamHandler(sys.stdout) console.setFormatter(logging.Formatter('%(message)s')) console.setLevel(logging.DEBUG) log.addHandler(console) support.check_binaries(bin_home, log) # parse options and safe all parameters to cfg options = args cfg, dataset_data = fill_cfg(options, log) if options_storage.continue_mode: cmd_line, options = get_options_from_params( os.path.join(options_storage.output_dir, "params.txt"), args[0]) if not options: support.error( "failed to parse command line of the previous run! Please restart from the beginning or specify another output directory." ) cfg, dataset_data = fill_cfg(options, log) if options_storage.restart_from: check_cfg_for_restart_from(cfg) options_storage.continue_mode = True log_filename = os.path.join(cfg["common"].output_dir, "spades.log") if options_storage.continue_mode: log_handler = logging.FileHandler(log_filename, mode='a') else: log_handler = logging.FileHandler(log_filename, mode='w') log.addHandler(log_handler) if options_storage.continue_mode: log.info( "\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n") log.info("Restored from " + cmd_line) if options_storage.restart_from: updated_params = "" flag = False for v in args[1:]: if v == '-o' or v == '--restart-from': flag = True continue if flag: flag = False continue updated_params += " " + v updated_params = updated_params.strip() log.info("with updated parameters: " + updated_params) cmd_line += " " + updated_params log.info("") params_filename = os.path.join(cfg["common"].output_dir, "params.txt") params_handler = logging.FileHandler(params_filename, mode='w') log.addHandler(params_handler) if options_storage.continue_mode: log.info(cmd_line) else: command = "Command line:" for v in args: command += " " + v log.info(command) # special case if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type( dataset_data, 'paired-end'): support.warning( 'cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log) del cfg["mismatch_corrector"] print_used_values(cfg, log) log.removeHandler(params_handler) support.check_single_reads_in_options(options, log) if not options_storage.continue_mode: log.info("\n======= SPAdes pipeline started. 
Log can be found here: " + log_filename + "\n") # splitting interlaced reads and processing Ns in additional contigs if needed if support.dataset_has_interlaced_reads( dataset_data) or support.dataset_has_additional_contigs( dataset_data): dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input') if support.dataset_has_interlaced_reads(dataset_data): if not os.path.isdir(dir_for_split_reads): os.makedirs(dir_for_split_reads) dataset_data = support.split_interlaced_reads( dataset_data, dir_for_split_reads, log) if support.dataset_has_additional_contigs(dataset_data): dataset_data = support.process_Ns_in_additional_contigs( dataset_data, dir_for_split_reads, log) options_storage.dataset_yaml_filename = os.path.join( options_storage.output_dir, "input_dataset.yaml") pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename try: # copying configs before all computations (to prevent its changing at run time) tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs") if os.path.isdir( tmp_configs_dir) and not options_storage.continue_mode: shutil.rmtree(tmp_configs_dir) if not os.path.isdir(tmp_configs_dir): dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False) corrected_dataset_yaml_filename = '' if "error_correction" in cfg: STAGE_NAME = "Read error correction" bh_cfg = merge_configs(cfg["error_correction"], cfg["common"]) corrected_dataset_yaml_filename = os.path.join( bh_cfg.output_dir, "corrected.yaml") if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \ and not options_storage.restart_from == "ec": log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME) else: support.continue_from_here(log) if "HEAPCHECK" in os.environ: del os.environ["HEAPCHECK"] if "heap_check" in bh_cfg.__dict__: os.environ["HEAPCHECK"] = bh_cfg.heap_check if os.path.exists(bh_cfg.output_dir): shutil.rmtree(bh_cfg.output_dir) os.makedirs(bh_cfg.output_dir) if support.get_lib_ids_by_type( dataset_data, options_storage.LONG_READS_TYPES): not_used_dataset_data = support.get_libs_by_type( dataset_data, options_storage.LONG_READS_TYPES) to_correct_dataset_data = support.rm_libs_by_type( dataset_data, options_storage.LONG_READS_TYPES) to_correct_dataset_yaml_filename = os.path.join( bh_cfg.output_dir, "to_correct.yaml") pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w')) bh_cfg.__dict__[ "dataset_yaml_filename"] = to_correct_dataset_yaml_filename else: not_used_dataset_data = None bh_cfg.__dict__["dataset_yaml_filename"] = cfg[ "dataset"].yaml_filename log.info("\n===== %s started. \n" % STAGE_NAME) hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, not_used_dataset_data, ext_python_modules_home, log) log.info("\n===== %s finished. 
\n" % STAGE_NAME) result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta") result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta") misc_dir = os.path.join(cfg["common"].output_dir, "misc") ### if mismatch correction is enabled then result contigs are copied to misc directory assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta") assembled_scaffolds_filename = os.path.join( misc_dir, "assembled_scaffolds.fasta") if "assembly" in cfg: STAGE_NAME = "Assembling" spades_cfg = merge_configs(cfg["assembly"], cfg["common"]) spades_cfg.__dict__["result_contigs"] = result_contigs_filename spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs) or ("mismatch_corrector" in cfg and os.path.isfile(assembled_contigs_filename)))\ and not options_storage.restart_from == 'as' \ and not (options_storage.restart_from and options_storage.restart_from.startswith('k')): log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME) # calculating latest_dir for the next stages latest_dir = support.get_latest_dir( os.path.join(spades_cfg.output_dir, "K*")) if not latest_dir: support.error( "failed to continue the previous run! Please restart from previous stages or from the beginning.", log) else: old_result_files = [ result_contigs_filename, result_scaffolds_filename, assembled_contigs_filename, assembled_scaffolds_filename ] for format in [".fasta", ".fastg"]: for old_result_file in old_result_files: if os.path.isfile(old_result_file[:-6] + format): os.remove(old_result_file[:-6] + format) if options_storage.restart_from == 'as': support.continue_from_here(log) if os.path.isfile(corrected_dataset_yaml_filename): dataset_data = pyyaml.load( open(corrected_dataset_yaml_filename, 'r')) dataset_data = support.relative2abs_paths( dataset_data, os.path.dirname(corrected_dataset_yaml_filename)) if spades_cfg.disable_rr: spades_cfg.__dict__["rr_enable"] = False else: spades_cfg.__dict__["rr_enable"] = True if "HEAPCHECK" in os.environ: del os.environ["HEAPCHECK"] if "heap_check" in spades_cfg.__dict__: os.environ["HEAPCHECK"] = spades_cfg.heap_check log.info("\n===== %s started.\n" % STAGE_NAME) # creating dataset dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info") if not os.path.isfile( dataset_filename) or not options_storage.continue_mode: dataset_file = open(dataset_filename, 'w') import process_cfg dataset_file.write( "single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n') if os.path.isfile(corrected_dataset_yaml_filename): dataset_file.write( "reads" + '\t' + process_cfg.process_spaces( corrected_dataset_yaml_filename) + '\n') else: dataset_file.write("reads" + '\t' + process_cfg.process_spaces( cfg["dataset"].yaml_filename) + '\n') if spades_cfg.developer_mode and "reference" in cfg[ "dataset"].__dict__: dataset_file.write("reference_genome" + '\t') dataset_file.write( process_cfg.process_spaces( cfg["dataset"].reference) + '\n') dataset_file.close() spades_cfg.__dict__["dataset"] = dataset_filename latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log) if os.path.isdir( misc_dir) and not options_storage.continue_mode: shutil.rmtree(misc_dir) if not os.path.isdir(misc_dir): os.makedirs(misc_dir) if options_storage.continue_mode and options_storage.restart_from and options_storage.restart_from.startswith( 'k'): k_str = 
options_storage.restart_from[1:] if k_str.find(":") != -1: k_str = k_str[:k_str.find(":")] support.error( "failed to continue from K=%s because this K was not processed in the original run!" % k_str, log) log.info("\n===== %s finished. \n" % STAGE_NAME) #corrector if "mismatch_corrector" in cfg and ( os.path.isfile(result_contigs_filename) or (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))): STAGE_NAME = "Mismatch correction" to_correct = dict() to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename) if os.path.isfile(result_scaffolds_filename) or ( options_storage.continue_mode and os.path.isfile(assembled_scaffolds_filename)): to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename) # moving assembled contigs (scaffolds) to misc dir for assembly_type, (old, new) in to_correct.items(): if options_storage.continue_mode and os.path.isfile(new): continue for format in [".fasta", ".fastg"]: if os.path.isfile(old[:-6] + format): shutil.move(old[:-6] + format, new[:-6] + format) if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \ (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \ and not options_storage.restart_from == 'mc': log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME) else: if options_storage.restart_from == 'mc': support.continue_from_here(log) log.info("\n===== %s started." % STAGE_NAME) # detecting paired-end library with the largest insert size est_params_data = pyyaml.load( open(os.path.join(latest_dir, "final.lib_data"), 'r')) max_IS_library = None for reads_library in est_params_data: if reads_library['type'] == 'paired-end': if not max_IS_library or float( reads_library["insert size mean"]) > float( max_IS_library["insert size mean"]): max_IS_library = reads_library if not max_IS_library: support.error( 'Mismatch correction cannot be performed without at least one paired-end library!', log) if not max_IS_library["insert size mean"]: support.warning( 'Failed to estimate insert size for all paired-end libraries. 
Starting Mismatch correction' ' based on the first paired-end library and with default insert size.', log) else: cfg["mismatch_corrector"].__dict__[ "insert-size"] = round( max_IS_library["insert size mean"]) yaml_dirname = os.path.dirname( options_storage.dataset_yaml_filename) cfg["mismatch_corrector"].__dict__["1"] = list( map(lambda x: os.path.join(yaml_dirname, x), max_IS_library['left reads'])) cfg["mismatch_corrector"].__dict__["2"] = list( map(lambda x: os.path.join(yaml_dirname, x), max_IS_library['right reads'])) #TODO: add reads orientation import corrector corrector_cfg = cfg["mismatch_corrector"] args = [] for key, values in corrector_cfg.__dict__.items(): if key == "output-dir": continue # for processing list of reads if not isinstance(values, list): values = [values] for value in values: if len(key) == 1: args.append('-' + key) else: args.append('--' + key) if value is not None: args.append(value) # processing contigs and scaffolds (or only contigs) for assembly_type, (corrected, assembled) in to_correct.items(): if options_storage.continue_mode and os.path.isfile( corrected): log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n") continue support.continue_from_here(log) log.info("\n== Processing of " + assembly_type + "\n") cur_args = args[:] cur_args += ['-c', assembled] tmp_dir_for_corrector = support.get_tmp_dir( prefix="mis_cor_%s_" % assembly_type) cur_args += ['--output-dir', tmp_dir_for_corrector] # correcting corrector.main(cur_args, ext_python_modules_home, log) result_corrected_filename = os.path.join( tmp_dir_for_corrector, "corrected_contigs.fasta") # moving corrected contigs (scaffolds) to SPAdes output dir if os.path.isfile(result_corrected_filename): shutil.move(result_corrected_filename, corrected) if os.path.isdir(tmp_dir_for_corrector): shutil.rmtree(tmp_dir_for_corrector) assembled_fastg = assembled[:-6] + ".fastg" if os.path.isfile(assembled_fastg): support.create_fastg_from_fasta( corrected, assembled_fastg, log) log.info("\n===== %s finished.\n" % STAGE_NAME) if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir): shutil.rmtree(tmp_configs_dir) #log.info("") if "error_correction" in cfg and os.path.isdir( os.path.dirname(corrected_dataset_yaml_filename)): log.info(" * Corrected reads are in " + support.process_spaces( os.path.dirname(corrected_dataset_yaml_filename) + "/")) if "assembly" in cfg and os.path.isfile(result_contigs_filename): message = " * Assembled contigs are in " + support.process_spaces( result_contigs_filename) if os.path.isfile(result_contigs_filename[:-6] + ".fastg"): message += " (" + os.path.basename( result_contigs_filename[:-6] + ".fastg") + ")" log.info(message) if "assembly" in cfg and os.path.isfile(result_scaffolds_filename): message = " * Assembled scaffolds are in " + support.process_spaces( result_scaffolds_filename) if os.path.isfile(result_scaffolds_filename[:-6] + ".fastg"): message += " (" + os.path.basename( result_scaffolds_filename[:-6] + ".fastg") + ")" log.info(message) #log.info("") #breaking scaffolds if os.path.isfile(result_scaffolds_filename): if not os.path.isdir(misc_dir): os.makedirs(misc_dir) result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta") if not os.path.isfile(result_broken_scaffolds ) or not options_storage.continue_mode: modified, broken_scaffolds = support.break_scaffolds( result_scaffolds_filename, options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) if modified: support.write_fasta(result_broken_scaffolds, broken_scaffolds) 
#log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) + # " Ns are in " + result_broken_scaffolds) ### printing WARNINGS SUMMARY if not support.log_warnings(log): log.info("\n======= SPAdes pipeline finished." ) # otherwise it finished WITH WARNINGS if options_storage.test_mode: for result_filename in [ result_contigs_filename, result_scaffolds_filename ]: if os.path.isfile(result_filename): result_fasta = list(support.read_fasta(result_filename)) # correctness check: should be one contig of length 1000 bp correct_number = 1 correct_length = 1000 if not len(result_fasta): support.error( "TEST FAILED: %s does not contain contigs!" % result_filename) elif len(result_fasta) > correct_number: support.error( "TEST FAILED: %s contains more than %d contig (%d)!" % (result_filename, correct_number, len(result_fasta))) elif len(result_fasta[0][1]) != correct_length: if len(result_fasta[0][1]) > correct_length: relation = "more" else: relation = "less" support.error( "TEST FAILED: %s contains %s than %d bp (%d bp)!" % (result_filename, relation, correct_length, len(result_fasta[0][1]))) else: support.error("TEST FAILED: " + result_filename + " does not exist!") log.info("\n========= TEST PASSED CORRECTLY.") log.info("\nSPAdes log can be found here: " + log_filename) log.info("") log.info("Thank you for using SPAdes!") log.removeHandler(log_handler) except Exception: exc_type, exc_value, _ = sys.exc_info() if exc_type == SystemExit: sys.exit(exc_value) else: if exc_type == OSError and exc_value.errno == errno.ENOEXEC: # Exec format error support.error( "It looks like you are using SPAdes binaries for another platform.\n" + support.get_spades_binaries_info_message()) else: log.exception(exc_value) support.error("exception caught: %s" % exc_type, log) except BaseException: # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException exc_type, exc_value, _ = sys.exc_info() if exc_type == SystemExit: sys.exit(exc_value) else: log.exception(exc_value) support.error("exception caught: %s" % exc_type, log)
def main(): os.environ["LC_ALL"] = "C" if len(sys.argv) == 1: options_storage.usage(spades_version) sys.exit(0) log = logging.getLogger('spades') log.setLevel(logging.DEBUG) console = logging.StreamHandler(sys.stdout) console.setFormatter(logging.Formatter('%(message)s')) console.setLevel(logging.DEBUG) log.addHandler(console) check_binaries(bin_home, log) # parse options and safe all parameters to cfg cfg, dataset_data = fill_cfg(sys.argv, log) if options_storage.continue_mode: cmd_line, options = get_options_from_params( os.path.join(options_storage.output_dir, "params.txt")) if not options: support.error( "failed to parse command line of the previous run! Please restart from the beginning." ) cfg, dataset_data = fill_cfg(options, log) options_storage.continue_mode = True log_filename = os.path.join(cfg["common"].output_dir, "spades.log") if options_storage.continue_mode: log_handler = logging.FileHandler(log_filename, mode='a') else: log_handler = logging.FileHandler(log_filename, mode='w') log.addHandler(log_handler) if options_storage.continue_mode: log.info( "\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n") log.info("Restored from " + cmd_line) else: params_filename = os.path.join(cfg["common"].output_dir, "params.txt") params_handler = logging.FileHandler(params_filename, mode='w') log.addHandler(params_handler) command = "Command line:" for v in sys.argv: command += " " + v log.info(command) print_used_values(cfg, log) log.removeHandler(params_handler) log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n") # splitting interlaced reads if needed if support.dataset_has_interlaced_reads(dataset_data): dir_for_split_reads = os.path.join( os.path.abspath(options_storage.output_dir), 'split_reads') if not os.path.isdir(dir_for_split_reads): os.makedirs(dir_for_split_reads) dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log) options_storage.dataset_yaml_filename = os.path.join( options_storage.output_dir, "input_dataset.yaml") pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) cfg["dataset"].yaml_filename = os.path.abspath( options_storage.dataset_yaml_filename) try: # copying configs before all computations (to prevent its changing at run time) tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs") if os.path.isdir( tmp_configs_dir) and not options_storage.continue_mode: shutil.rmtree(tmp_configs_dir) if not os.path.isdir(tmp_configs_dir): shutil.copytree(os.path.join(spades_home, "configs"), tmp_configs_dir) corrected_dataset_yaml_filename = '' if "error_correction" in cfg: bh_cfg = merge_configs(cfg["error_correction"], cfg["common"]) bh_cfg.__dict__["dataset_yaml_filename"] = cfg[ "dataset"].yaml_filename corrected_dataset_yaml_filename = os.path.join( bh_cfg.output_dir, "corrected.yaml") if os.path.isfile(corrected_dataset_yaml_filename ) and options_storage.continue_mode: log.info( "\n===== Skipping read error correction (already processed). \n" ) else: options_storage.continue_mode = False # continue from here if "HEAPCHECK" in os.environ: del os.environ["HEAPCHECK"] if "heap_check" in bh_cfg.__dict__: os.environ["HEAPCHECK"] = bh_cfg.heap_check if os.path.exists(bh_cfg.output_dir): shutil.rmtree(bh_cfg.output_dir) os.makedirs(bh_cfg.output_dir) if not os.path.exists(bh_cfg.tmp_dir): os.makedirs(bh_cfg.tmp_dir) log.info("\n===== Read error correction started. 
\n") bh_logic.run_bh(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, ext_python_modules_home, log) log.info("\n===== Read error correction finished. \n") result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta") result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta") misc_dir = os.path.join(cfg["common"].output_dir, "misc") ### if mismatch correction is enabled then result contigs are copied to misc directory assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta") assembled_scaffolds_filename = os.path.join( misc_dir, "assembled_scaffolds.fasta") if "assembly" in cfg: spades_cfg = merge_configs(cfg["assembly"], cfg["common"]) spades_cfg.__dict__["result_contigs"] = result_contigs_filename spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename spades_cfg.__dict__["additional_contigs"] = os.path.join( spades_cfg.output_dir, "simplified_contigs.fasta") if options_storage.continue_mode and ( os.path.isfile(spades_cfg.result_contigs) or ("mismatch_corrector" in cfg and os.path.isfile(assembled_contigs_filename))): log.info("\n===== Skipping assembling (already processed). \n") # calculating latest_dir for the next stages latest_dir = support.get_latest_dir( os.path.join(spades_cfg.output_dir, "K*")) if not latest_dir: support.error( "failed to continue the previous run! Please restart from the beginning." ) else: if os.path.isfile(corrected_dataset_yaml_filename): dataset_data = pyyaml.load( open(corrected_dataset_yaml_filename, 'r')) dataset_data = support.relative2abs_paths( dataset_data, os.path.dirname(corrected_dataset_yaml_filename)) if support.dataset_has_paired_reads(dataset_data): spades_cfg.__dict__["paired_mode"] = True else: spades_cfg.__dict__["paired_mode"] = False if options_storage.rectangles: spades_cfg.__dict__["resolving_mode"] = "rectangles" if "HEAPCHECK" in os.environ: del os.environ["HEAPCHECK"] if "heap_check" in spades_cfg.__dict__: os.environ["HEAPCHECK"] = spades_cfg.heap_check log.info("\n===== Assembling started.\n") # creating dataset dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info") if not os.path.isfile( dataset_filename) or not options_storage.continue_mode: dataset_file = open(dataset_filename, 'w') import process_cfg dataset_file.write( "single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n') if os.path.isfile(corrected_dataset_yaml_filename): dataset_file.write( "reads" + '\t' + process_cfg.process_spaces( corrected_dataset_yaml_filename) + '\n') else: dataset_file.write("reads" + '\t' + process_cfg.process_spaces( cfg["dataset"].yaml_filename) + '\n') if spades_cfg.developer_mode and "reference" in cfg[ "dataset"].__dict__: dataset_file.write("reference_genome" + '\t') dataset_file.write( process_cfg.process_spaces( os.path.abspath(cfg["dataset"].reference)) + '\n') dataset_file.close() spades_cfg.__dict__["dataset"] = dataset_filename latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, log) #rectangles if spades_cfg.paired_mode and options_storage.rectangles: if options_storage.continue_mode: # TODO: continue mode support.warning( "sorry, --continue doesn't work with --rectangles yet. Skipping repeat resolving." 
) else: sys.path.append( os.path.join(python_modules_home, "rectangles")) import rrr rrr_input_dir = os.path.join(latest_dir, "saves") rrr_outpath = os.path.join(spades_cfg.output_dir, "rectangles") if not os.path.exists(rrr_outpath): os.mkdir(rrr_outpath) rrr_reference_information_file = os.path.join( rrr_input_dir, "late_pair_info_counted_etalon_distance.txt") rrr_test_util = rrr.TestUtils( rrr_reference_information_file, os.path.join(rrr_outpath, "rectangles.log")) rrr.resolve(rrr_input_dir, rrr_outpath, rrr_test_util, "", cfg["dataset"].single_cell, spades_cfg.careful) shutil.copyfile( os.path.join( rrr_outpath, "rectangles_extend_before_scaffold.fasta"), spades_cfg.result_contigs) shutil.copyfile( os.path.join(rrr_outpath, "rectangles_extend.fasta"), spades_cfg.result_scaffolds) if not spades_cfg.developer_mode: if os.path.exists(rrr_input_dir): shutil.rmtree(rrr_input_dir) if os.path.exists(rrr_outpath): shutil.rmtree(rrr_outpath, True) if os.path.exists(rrr_outpath): os.system('rm -r ' + rrr_outpath) #EOR if os.path.isdir( misc_dir) and not options_storage.continue_mode: shutil.rmtree(misc_dir) if not os.path.isdir(misc_dir): os.makedirs(misc_dir) if os.path.isfile(spades_cfg.additional_contigs): shutil.move(spades_cfg.additional_contigs, misc_dir) log.info("\n===== Assembling finished. \n") #corrector if "mismatch_corrector" in cfg and ( os.path.isfile(result_contigs_filename) or (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))): to_correct = dict() to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename) if os.path.isfile(result_scaffolds_filename) or ( options_storage.continue_mode and os.path.isfile(assembled_scaffolds_filename)): to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename) # moving assembled contigs (scaffolds) to misc dir for k, (old, new) in to_correct.items(): if options_storage.continue_mode and os.path.isfile(new): continue shutil.move(old, new) if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \ (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)): log.info( "\n===== Skipping mismatch correction (already processed). \n" ) else: log.info("\n===== Mismatch correction started.") # detecting paired-end library with the largest insert size dataset_data = pyyaml.load( open(options_storage.dataset_yaml_filename, 'r') ) ### initial dataset, i.e. before error correction dataset_data = support.relative2abs_paths( dataset_data, os.path.dirname(options_storage.dataset_yaml_filename)) paired_end_libraries_ids = [] for id, reads_library in enumerate(dataset_data): if reads_library['type'] == 'paired-end': paired_end_libraries_ids.append(id) if not len(paired_end_libraries_ids): support.error( 'Mismatch correction cannot be performed without at least one paired-end library!' 
) estimated_params = load_config_from_file( os.path.join(latest_dir, "_est_params.info")) max_insert_size = -1 target_paired_end_library_id = -1 for id in paired_end_libraries_ids: if float(estimated_params.__dict__[ "insert_size_" + str(id)]) > max_insert_size: max_insert_size = float( estimated_params.__dict__["insert_size_" + str(id)]) target_paired_end_library_id = id yaml_dirname = os.path.dirname( options_storage.dataset_yaml_filename) cfg["mismatch_corrector"].__dict__["1"] = list( map( lambda x: os.path.join(yaml_dirname, x), dataset_data[target_paired_end_library_id] ['left reads'])) cfg["mismatch_corrector"].__dict__["2"] = list( map( lambda x: os.path.join(yaml_dirname, x), dataset_data[target_paired_end_library_id] ['right reads'])) cfg["mismatch_corrector"].__dict__["insert-size"] = round( max_insert_size) #TODO: add reads orientation import corrector corrector_cfg = cfg["mismatch_corrector"] args = [] for key, values in corrector_cfg.__dict__.items(): if key == "output-dir": continue # for processing list of reads if not isinstance(values, list): values = [values] for value in values: if len(key) == 1: args.append('-' + key) else: args.append('--' + key) if value: args.append(value) # processing contigs and scaffolds (or only contigs) for k, (corrected, assembled) in to_correct.items(): if options_storage.continue_mode and os.path.isfile( corrected): log.info("\n== Skipping processing of " + k + " (already processed)\n") continue options_storage.continue_mode = False log.info("\n== Processing of " + k + "\n") cur_args = args[:] cur_args += ['-c', assembled] tmp_dir_for_corrector = os.path.join( corrector_cfg.__dict__["output-dir"], "mismatch_corrector_" + k) cur_args += ['--output-dir', tmp_dir_for_corrector] # correcting corrector.main(cur_args, ext_python_modules_home, log) result_corrected_filename = os.path.abspath( os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")) # moving corrected contigs (scaffolds) to SPAdes output dir if os.path.isfile(result_corrected_filename): shutil.move(result_corrected_filename, corrected) if os.path.isdir(tmp_dir_for_corrector): shutil.rmtree(tmp_dir_for_corrector) log.info("\n===== Mismatch correction finished.\n") if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir): shutil.rmtree(tmp_configs_dir) #log.info("") if os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)): log.info(" * Corrected reads are in " + os.path.dirname(corrected_dataset_yaml_filename) + "/") if os.path.isfile(result_contigs_filename): log.info(" * Assembled contigs are in " + result_contigs_filename) if os.path.isfile(result_scaffolds_filename): log.info(" * Assembled scaffolds are in " + result_scaffolds_filename) #log.info("") #breaking scaffolds if os.path.isfile(result_scaffolds_filename): if not os.path.isdir(misc_dir): os.makedirs(misc_dir) result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta") threshold = 3 if not os.path.isfile(result_broken_scaffolds ) or not options_storage.continue_mode: support.break_scaffolds(result_scaffolds_filename, threshold, result_broken_scaffolds) #log.info(" * Scaffolds broken by " + str(threshold) + " Ns are in " + result_broken_scaffolds) ### printing WARNINGS SUMMARY if not support.log_warnings(log): log.info("\n======= SPAdes pipeline finished." 
            )  # otherwise it finished WITH WARNINGS

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")

        log.removeHandler(log_handler)

    except Exception:
        _, exc, _ = sys.exc_info()
        log.exception(exc)
        support.error("exception caught", log)
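# Condensed sketch of the logger wiring used in main() above: one console handler, a
# persistent spades.log file handler, and a temporary handler that captures the command
# line into params.txt before being removed. File names mirror the ones used above;
# this is a standalone illustration, not pipeline code (e.g. the --continue append mode
# is omitted for brevity).
import logging
import sys

def setup_logging_sketch(output_dir, argv):
    log = logging.getLogger('spades_sketch')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    # main log file; the pipeline above opens it with mode='a' when continuing a run
    log_handler = logging.FileHandler(output_dir + "/spades.log", mode='w')
    log.addHandler(log_handler)

    # params.txt only receives the messages emitted while its handler is attached
    params_handler = logging.FileHandler(output_dir + "/params.txt", mode='w')
    log.addHandler(params_handler)
    log.info("Command line: " + " ".join(argv))
    log.removeHandler(params_handler)

    return log, log_handler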
def check(self):
    ''' Check correctness of parameters. Raises an error if a parameter is invalid. '''
    if self.multinom is None:
        if self.model_func_file is None:
            self.multinom = False
        else:
            self.multinom = True
    if self.pop_labels is not None:
        self.pop_labels = [x.strip() for x in self.pop_labels.split(',')]
    if self.ns is not None:
        self.ns = support.check_comma_sep_list(self.ns)
    self.input_file = support.check_file_existence(self.input_file)
    if self.resume_dir is not None:
        self.resume_dir = support.check_dir_existence(self.resume_dir)
    if self.resume_dir is not None and self.output_dir is None:
        self.output_dir = support.ensure_dir_existence(
            self.resume_dir + "_resumed", check_emptiness=True)
    elif self.output_dir is None:
        support.error("Parameter `Output directory` is required")
    else:
        self.output_dir = support.ensure_dir_existence(
            self.output_dir, check_emptiness=True)
    if self.input_file is None:
        support.error("Parameter `Input file` is required")
    if self.theta is None:
        support.warning("`Theta0` is not specified. It will be assumed to be 1.0.")
    if self.gen_time is None:
        support.warning(
            "`Time for one generation` is not specified. Time will be in genetic units.")
    self.input_data, self.ns, self.pop_labels = support.load_spectrum(
        self.input_file, self.ns, self.pop_labels)
    self.ns = np.array(self.ns)
    self.number_of_populations = len(self.ns)

    # Linked or unlinked data
    if not self.linked_snp and self.boot_dir is not None:
        support.warning(
            "SNPs are marked as unlinked, so the directory with bootstrap data will be ignored.")
    elif self.linked_snp:
        if self.boot_dir is not None:
            self.boot_dir = support.check_dir_existence(self.boot_dir)
            self.boots = gadma.Inference.load_bootstrap_data_from_dir(
                self.boot_dir, self.ns, self.pop_labels)

    # Custom model
    if self.model_func_file is not None:
        self.model_func_file = support.check_file_existence(self.model_func_file)
        file_with_model_func = imp.load_source('module', self.model_func_file)
        try:
            self.model_func = file_with_model_func.model_func
        except AttributeError:
            support.error(
                "File " + self.model_func_file + " does not contain a function named `model_func`.")
    if self.model_func_file is not None:
        if self.p_ids is not None:
            self.p_ids = support.check_comma_sep_list(self.p_ids, is_int=False)

    self.fracs = [float(x) for x in self.fracs.split(",")]
    if len(self.fracs) != 3:
        support.error(
            "Length of `Fractions` (Parameters of genetic algorithm) must be 3")
    self.frac_of_old_models = self.fracs[0]
    self.frac_of_mutated_models = self.fracs[1]
    self.frac_of_crossed_models = self.fracs[2]

    if self.moments_scenario and self.dadi_pts is not None:
        support.warning("moments does not use the --pts argument, so it will be ignored")
    if self.dadi_pts is None:
        max_n = max(self.ns)
        self.dadi_pts = [max_n, max_n + 10, max_n + 20]
    else:
        self.dadi_pts = support.check_comma_sep_list(self.dadi_pts)

    self.put_default_structures()
    self.final_check()
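# The `Fractions` option above arrives as a comma-separated string and is split into
# exactly three floats: fractions of old, mutated and crossed models. final_check()
# below additionally requires their sum to be at most 1; whatever remains is the
# fraction of random models. A small worked example, assuming the same
# "old,mutated,crossed" order as above:
fracs_string = "0.2,0.3,0.3"
fracs = [float(x) for x in fracs_string.split(",")]
assert len(fracs) == 3
frac_of_old_models, frac_of_mutated_models, frac_of_crossed_models = fracs
frac_of_random_models = 1.0 - sum(fracs)  # 0.2 here; it would be 0 if the sum were exactly 1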
def final_check(self):
    if self.model_func_file is not None:
        if self.p_ids is None and (self.lower_bound is None or self.upper_bound is None):
            support.error(
                "Either parameter identifiers or lower and upper bounds should be specified.")
    if self.model_func_file is not None and self.initial_structure is not None:
        support.warning(
            "Both structure and custom model are specified. Custom model will be optimized, structure will be ignored.")
    if self.model_func_file is not None and self.only_sudden:
        support.warning(
            "Both custom model and `Only sudden: True` are specified. `Only sudden` will be ignored.")

    if (self.frac_of_old_models + self.frac_of_crossed_models + self.frac_of_mutated_models) > 1:
        support.error(
            "Sum of Fractions (Parameters of genetic algorithm) must be less than or equal to 1")
    if (self.frac_of_old_models + self.frac_of_crossed_models + self.frac_of_mutated_models) == 1:
        support.warning("Fraction of random models is 0")

    # check lengths of bounds and p_ids
    if self.model_func_file is not None:
        if len(self.lower_bound) != len(self.upper_bound):
            support.error("Lengths of lower and upper bounds should be equal.")
        if self.p_ids is not None:
            if len(self.p_ids) != len(self.lower_bound):
                print self.p_ids
                print self.lower_bound
                support.error(
                    "Lengths of lower bounds, upper bounds and parameter identifiers should be equal.")

    if self.initial_structure is not None:
        if len(self.initial_structure) != self.number_of_populations:
            support.error("Wrong length of initial model structure: must be " +
                          str(self.number_of_populations))
        for n in self.initial_structure:
            if n < 0:
                support.error('Elements in comma-separated list ' +
                              ','.join(str(x) for x in self.initial_structure) +
                              ' must be positive (`Initial structure` parameter)')
    if self.final_structure is not None:
        if len(self.final_structure) != self.number_of_populations:
            support.error("Wrong length of final model structure: must be " +
                          str(self.number_of_populations))
        for n in self.final_structure:
            if n < 0:
                support.error('Elements in comma-separated list ' +
                              ','.join(str(x) for x in self.final_structure) +
                              ' must be positive (`Final structure` parameter)')
        if not (self.final_structure >= self.initial_structure).all():
            support.error("Final structure must be greater than or equal to initial structure")

    if self.split_1_lim is not None and self.split_2_lim is not None and not self.split_1_lim > self.split_2_lim:
        support.error(
            "Upper bound of the first split must be greater than upper bound of the second split")

    if self.size_of_generation <= 0:
        support.error("Size of population (Parameters of genetic algorithm) must be positive")
    if self.mutation_strength > 1 or self.mutation_strength < 0:
        support.error("Mutation strength (Parameters of genetic algorithm) must be between 0 and 1")
    if self.mutation_rate > 1 or self.mutation_rate < 0:
        support.error("Mutation rate (Parameters of genetic algorithm) must be between 0 and 1")
    if self.const_for_mut_rate < 1 or self.const_for_mut_rate > 2:
        support.error("Const for adaptive mutation rate (Parameters of genetic algorithm) must be between 1 and 2")
    if self.const_for_mut_strength < 1 or self.const_for_mut_strength > 2:
        support.error("Const for adaptive mutation strength (Parameters of genetic algorithm) must be between 1 and 2")

    if self.dadi_pts is not None:
        for n in self.dadi_pts:
            if n < 0:
                support.error('Elements in comma-separated list ' +
                              ','.join(str(x) for x in self.dadi_pts) +
                              ' must be positive (Pts parameter)')

    if self.repeats <= 0:
        support.error("Repeats (Parameters of pipeline) must be positive")
    if self.processes <= 0:
        support.error("Processes (Parameters of pipeline) must be positive")

    if self.number_of_populations < 3 and self.split_2_lim is not None:
        support.warning("There is no second split in case of " +
                        str(self.number_of_populations) +
                        " populations. Upper bound for it will be ignored.")
        self.split_2_lim = None
    if self.number_of_populations < 2 and self.split_1_lim is not None:
        support.warning(
            "There is no first split in case of one population. Upper bound for it will be ignored.")
        self.split_1_lim = None

    if self.moments_scenario:
        if pkgutil.find_loader('moments') is None:
            if self.model_func_file is not None:
                support.error("moments is not installed. You tried to use a custom model with moments.")
            if pkgutil.find_loader('dadi') is not None:
                options_storage.moments_scenario = False
                support.warning("moments is not installed, dadi with " +
                                str(self.dadi_pts) + " grid size will be used instead.")
            else:
                support.error("Neither dadi nor moments is installed.")
    else:
        if pkgutil.find_loader('dadi') is None:
            if self.model_func_file is not None:
                support.error("dadi is not installed. You tried to use a custom model with dadi.")
            if pkgutil.find_loader('moments') is not None:
                options_storage.moments_scenario = True
                support.warning("dadi is not installed, moments will be used instead.")
            else:
                support.error("Neither dadi nor moments is installed.")

    packages = []
    self.matplotlib_available = pkgutil.find_loader('matplotlib') is not None
    if not self.matplotlib_available:
        packages.append('matplotlib')
    # If a custom model and dadi are used, PIL absence can be ignored
    if self.model_func_file is None or self.moments_scenario:
        self.pil_available = pkgutil.find_loader('PIL') is not None
        if not self.pil_available:
            packages.append('Pillow')
    self.moments_available = pkgutil.find_loader('moments') is not None
    if not self.moments_available:
        packages.append('moments')
    if not self.matplotlib_available:
        support.warning("To draw models and SFS plots you should install: " + ', '.join(packages))
    elif not self.pil_available and self.moments_available:
        support.warning("To draw concatenated plots you should install: Pillow")
    elif not self.moments_available:
        support.warning("To draw model plots you should install: " + ', '.join(packages))

    if self.optimize_name == 'optimize_powell' and not self.moments_scenario:
        if not self.moments_available:
            support.warning(
                "To use Powell optimization one needs moments installed. BFGS (optimize_log) will be used instead.")
            self.optimize_name = 'optimize_log'

    if self.distribution != 'normal' and self.distribution != 'uniform':
        support.error("Distribution in extra parameters must be `normal` or `uniform`.")
    if self.distribution == 'uniform' and self.std is not None:
        support.warning('Std in extra parameters will be ignored as the uniform distribution was chosen.')
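# The structure checks above rely on self.initial_structure and self.final_structure
# behaving as numpy arrays (self.ns is converted with np.array() in check(), and the
# `.all()` call implies the structures are too), so that `>=` compares elementwise and
# `.all()` requires every population's final structure to be at least the initial one.
# A minimal illustration of that comparison:
import numpy as np

initial_structure = np.array([1, 1])
final_structure = np.array([2, 1])
# growing [1, 1] -> [2, 1] is allowed: every element of final >= initial
assert (final_structure >= initial_structure).all()
# shrinking the first population ([0, 3] vs [1, 1]) fails the elementwise check
assert not (np.array([0, 3]) >= initial_structure).all()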
def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_modules_home, log): if not isinstance(cfg.iterative_K, list): cfg.iterative_K = [cfg.iterative_K] cfg.iterative_K = sorted(cfg.iterative_K) used_K = [] # checking and removing conflicting K-mer directories if options_storage.restart_from and (options_storage.restart_k_mers != options_storage.original_k_mers): processed_K = [] for k in range(options_storage.MIN_K, options_storage.MAX_K, 2): cur_K_dir = os.path.join(cfg.output_dir, "K%d" % k) if os.path.isdir(cur_K_dir) and os.path.isfile( os.path.join(cur_K_dir, "final_contigs.fasta")): processed_K.append(k) if processed_K: RL = get_read_length(cfg.output_dir, processed_K[0], ext_python_modules_home, log) needed_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log, silent=True) needed_K = [k for k in needed_K if k < RL] original_K = reveal_original_k_mers(RL) k_to_delete = [] for id, k in enumerate(needed_K): if len(processed_K) == id: if processed_K[-1] == original_K[ -1]: # the last K in the original run was processed in "last_one" mode k_to_delete = [original_K[-1]] break if processed_K[id] != k: k_to_delete = processed_K[id:] break if not k_to_delete and (len(processed_K) > len(needed_K)): k_to_delete = processed_K[len(needed_K) - 1:] if k_to_delete: log.info( "Restart mode: removing previously processed directories for K=%s " "to avoid conflicts with K specified with --restart-from" % (str(k_to_delete))) for k in k_to_delete: shutil.rmtree(os.path.join(cfg.output_dir, "K%d" % k)) bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads") if os.path.isdir(bin_reads_dir) and not options_storage.continue_mode: shutil.rmtree(bin_reads_dir) cfg.tmp_dir = support.get_tmp_dir(prefix="spades_") finished_on_stop_after = False K = cfg.iterative_K[0] if len(cfg.iterative_K) == 1: run_iteration(configs_dir, execution_home, cfg, log, K, None, True) used_K.append(K) else: run_iteration(configs_dir, execution_home, cfg, log, K, None, False) used_K.append(K) if options_storage.stop_after == "k%d" % K: finished_on_stop_after = True else: prev_K = K RL = get_read_length(cfg.output_dir, K, ext_python_modules_home, log) cfg.iterative_K = update_k_mers_in_special_cases( cfg.iterative_K, RL, log) if len(cfg.iterative_K) < 2 or cfg.iterative_K[1] + 1 > RL: if cfg.rr_enable: if len(cfg.iterative_K) < 2: log.info( "== Rerunning for the first value of K (%d) with Repeat Resolving" % cfg.iterative_K[0]) else: support.warning( "Second value of iterative K (%d) exceeded estimated read length (%d). " "Rerunning for the first value of K (%d) with Repeat Resolving" % (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log) run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True) used_K.append(cfg.iterative_K[0]) K = cfg.iterative_K[0] else: rest_of_iterative_K = cfg.iterative_K rest_of_iterative_K.pop(0) count = 0 for K in rest_of_iterative_K: count += 1 last_one = count == len(cfg.iterative_K) or ( rest_of_iterative_K[count] + 1 > RL) run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one) used_K.append(K) prev_K = K if last_one: break if options_storage.stop_after == "k%d" % K: finished_on_stop_after = True break if count < len(cfg.iterative_K) and not finished_on_stop_after: support.warning( "Iterations stopped. 
Value of K (%d) exceeded estimated read length (%d)" % (cfg.iterative_K[count], RL), log) if options_storage.stop_after and options_storage.stop_after.startswith( 'k'): support.finish_here(log) latest = os.path.join(cfg.output_dir, "K%d" % K) if cfg.correct_scaffolds and not options_storage.run_completed: if options_storage.continue_mode and os.path.isfile( os.path.join(cfg.output_dir, "SCC", "corrected_scaffolds.fasta" )) and not options_storage.restart_from == "scc": log.info("\n===== Skipping %s (already processed). \n" % "scaffold correction") else: if options_storage.continue_mode: support.continue_from_here(log) run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, 21) latest = os.path.join(os.path.join(cfg.output_dir, "SCC"), "K21") if options_storage.stop_after == 'scc': support.finish_here(log) if cfg.correct_scaffolds: correct_scaffolds_fpath = os.path.join(latest, "corrected_scaffolds.fasta") if os.path.isfile(correct_scaffolds_fpath): shutil.copyfile(correct_scaffolds_fpath, cfg.result_scaffolds) elif not finished_on_stop_after: # interupted by --stop-after, so final K is not processed! if os.path.isfile(os.path.join(latest, "before_rr.fasta")): result_before_rr_contigs = os.path.join( os.path.dirname(cfg.result_contigs), "before_rr.fasta") if not os.path.isfile(result_before_rr_contigs ) or not options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "before_rr.fasta"), result_before_rr_contigs) if options_storage.rna: if os.path.isfile(os.path.join(latest, "transcripts.fasta")): if not os.path.isfile(cfg.result_transcripts ) or not options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "transcripts.fasta"), cfg.result_transcripts) if os.path.isfile(os.path.join(latest, "transcripts.paths")): if not os.path.isfile(cfg.result_transcripts_paths ) or not options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "transcripts.paths"), cfg.result_transcripts_paths) for filtering_type in options_storage.filtering_types: prefix = filtering_type + "_filtered_" result_filtered_transcripts = os.path.join( cfg.output_dir, prefix + options_storage.transcripts_name) latest_filtered_transcripts = os.path.join( latest, prefix + "final_paths.fasta") if os.path.isfile(latest_filtered_transcripts): if not os.path.isfile( result_filtered_transcripts ) or not options_storage.continue_mode: shutil.copyfile(latest_filtered_transcripts, result_filtered_transcripts) else: if os.path.isfile(os.path.join(latest, "final_contigs.fasta")): if not os.path.isfile(cfg.result_contigs ) or not options_storage.continue_mode: shutil.copyfile( os.path.join(latest, "final_contigs.fasta"), cfg.result_contigs) if os.path.isfile(os.path.join(latest, "first_pe_contigs.fasta")): result_first_pe_contigs = os.path.join( os.path.dirname(cfg.result_contigs), "first_pe_contigs.fasta") if not os.path.isfile(result_first_pe_contigs ) or not options_storage.continue_mode: shutil.copyfile( os.path.join(latest, "first_pe_contigs.fasta"), result_first_pe_contigs) if cfg.rr_enable: if os.path.isfile(os.path.join(latest, "scaffolds.fasta")): if not os.path.isfile( cfg.result_scaffolds ) or not options_storage.continue_mode: shutil.copyfile( os.path.join(latest, "scaffolds.fasta"), cfg.result_scaffolds) if os.path.isfile(os.path.join(latest, "scaffolds.paths")): if not os.path.isfile( cfg.result_scaffolds_paths ) or not options_storage.continue_mode: shutil.copyfile( os.path.join(latest, "scaffolds.paths"), cfg.result_scaffolds_paths) if os.path.isfile( os.path.join(latest, 
"assembly_graph_with_scaffolds.gfa")): if not os.path.isfile(cfg.result_graph_gfa ) or not options_storage.continue_mode: shutil.copyfile( os.path.join(latest, "assembly_graph_with_scaffolds.gfa"), cfg.result_graph_gfa) if os.path.isfile(os.path.join(latest, "assembly_graph.fastg")): if not os.path.isfile( cfg.result_graph) or not options_storage.continue_mode: shutil.copyfile( os.path.join(latest, "assembly_graph.fastg"), cfg.result_graph) if os.path.isfile(os.path.join(latest, "final_contigs.paths")): if not os.path.isfile(cfg.result_contigs_paths ) or not options_storage.continue_mode: shutil.copyfile( os.path.join(latest, "final_contigs.paths"), cfg.result_contigs_paths) if cfg.developer_mode: # saves saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves") if os.path.lexists( saves_link ): # exists returns False for broken links! lexists return True os.remove(saves_link) os.symlink(os.path.join(latest, "saves"), saves_link) if os.path.isdir(bin_reads_dir): shutil.rmtree(bin_reads_dir) if os.path.isdir(cfg.tmp_dir): shutil.rmtree(cfg.tmp_dir) return used_K
def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_modules_home, log): if not isinstance(cfg.iterative_K, list): cfg.iterative_K = [cfg.iterative_K] cfg.iterative_K = sorted(cfg.iterative_K) # checking and removing conflicting K-mer directories if options_storage.restart_from: processed_K = [] for k in range(options_storage.MIN_K, options_storage.MAX_K, 2): cur_K_dir = os.path.join(cfg.output_dir, "K%d" % k) if os.path.isdir(cur_K_dir) and os.path.isfile( os.path.join(cur_K_dir, "final_contigs.fasta")): processed_K.append(k) if processed_K: RL = get_read_length(cfg.output_dir, processed_K[0], ext_python_modules_home, log) needed_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log, silent=True) needed_K = [k for k in needed_K if k < RL] original_K = reveal_original_k_mers(RL) k_to_delete = [] for id, k in enumerate(needed_K): if len(processed_K) == id: if processed_K[-1] == original_K[ -1]: # the last K in the original run was processed in "last_one" mode k_to_delete = [original_K[-1]] break if processed_K[id] != k: k_to_delete = processed_K[id:] break if not k_to_delete and (len(processed_K) > len(needed_K)): k_to_delete = processed_K[len(needed_K) - 1:] if k_to_delete: log.info( "Restart mode: removing previously processed directories for K=%s " "to avoid conflicts with K specified with --restart-from" % (str(k_to_delete))) for k in k_to_delete: shutil.rmtree(os.path.join(cfg.output_dir, "K%d" % k)) bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads") if os.path.isdir(bin_reads_dir) and not options_storage.continue_mode: shutil.rmtree(bin_reads_dir) cfg.tmp_dir = support.get_tmp_dir(prefix="spades_") if len(cfg.iterative_K) == 1: run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True) K = cfg.iterative_K[0] else: run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, False) prev_K = cfg.iterative_K[0] RL = get_read_length(cfg.output_dir, cfg.iterative_K[0], ext_python_modules_home, log) cfg.iterative_K = update_k_mers_in_special_cases( cfg.iterative_K, RL, log) if cfg.iterative_K[1] + 1 > RL: if cfg.rr_enable: support.warning( "Second value of iterative K (%d) exceeded estimated read length (%d). " "Rerunning for the first value of K (%d) with Repeat Resolving" % (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log) run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True) K = cfg.iterative_K[0] else: rest_of_iterative_K = cfg.iterative_K rest_of_iterative_K.pop(0) count = 0 for K in rest_of_iterative_K: count += 1 last_one = count == len( cfg.iterative_K) or (rest_of_iterative_K[count] + 1 > RL) run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one) prev_K = K if last_one: break if count < len(cfg.iterative_K): support.warning( "Iterations stopped. 
Value of K (%d) exceeded estimated read length (%d)" % (cfg.iterative_K[count], RL), log) latest = os.path.join(cfg.output_dir, "K%d" % K) for format in [".fasta", ".fastg"]: if os.path.isfile(os.path.join(latest, "before_rr" + format)): result_before_rr_contigs = os.path.join( os.path.dirname(cfg.result_contigs), "before_rr" + format) if not os.path.isfile(result_before_rr_contigs ) or not options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "before_rr" + format), result_before_rr_contigs) if os.path.isfile(os.path.join(latest, "final_contigs" + format)): if not os.path.isfile(cfg.result_contigs[:-6] + format) or not options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "final_contigs" + format), cfg.result_contigs[:-6] + format) if cfg.rr_enable: if os.path.isfile(os.path.join(latest, "scaffolds" + format)): if not os.path.isfile(cfg.result_scaffolds[:-6] + format ) or not options_storage.continue_mode: shutil.copyfile(os.path.join(latest, "scaffolds" + format), cfg.result_scaffolds[:-6] + format) if cfg.developer_mode: # saves saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves") if os.path.lexists( saves_link ): # exists return False for broken link! lexists return True os.remove(saves_link) os.symlink(os.path.join(latest, "saves"), saves_link) if os.path.isdir(bin_reads_dir): shutil.rmtree(bin_reads_dir) if os.path.isdir(cfg.tmp_dir): shutil.rmtree(cfg.tmp_dir) return latest
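# Both run_spades() variants above walk the sorted K list and stop early once the next
# K value would exceed the estimated read length (the `K + 1 > RL` condition), flagging
# the final executed K as "last_one". A standalone sketch of just that planning step,
# with the iteration launching itself left out:
def plan_iterations(iterative_K, read_length):
    """Return (K values that would actually run, whether iteration stopped early)."""
    ks = sorted(iterative_K)
    planned = [ks[0]]
    stopped_early = False
    for idx in range(1, len(ks)):
        if ks[idx] + 1 > read_length:
            stopped_early = True
            break
        planned.append(ks[idx])
    return planned, stopped_early

# Example mirroring the warning above: with reads of length 75, K=77 is skipped and
# K=55 becomes the last iteration.
# plan_iterations([21, 33, 55, 77], 75) -> ([21, 33, 55], True)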