Example no. 1
def register_rpchandler(cmd, rpchandler):
    """ register_rpchandler(cmd, rpchandler):

    Register an RPC handler that receives messages from the
    network. Specifically, if an incoming TCP connection begins with
    a line containing the 'cmd' string, rpchandler is called as
    rpchandler(data, eof, socket, address), where 'data' is the data
    received so far on the socket, 'eof' indicates whether the remote
    end has finished sending (if 'eof' == False, 'data' may be
    partial), 'socket' is the TCP socket to the remote end, and
    'address' is the network address of the remote end
    (ip address string, remote port).

    rpchandler() may be called several times, as more data arrives,
    until eof is True. This allows incremental processing of the
    incoming message (e.g. terminating invalid messages early).

    rpchandler() returns one of:

    RPC_MORE_DATA: the handler wants more data
    RPC_CLOSE:     the handler asks the main handler to terminate the connection
    RPC_RELEASE:   the handler takes care of the socket from now on
    """

    if rpc_commands.has_key(cmd):
        warning('Can not install RPC handler: %s already exists\n' % cmd)
        return False
    rpc_commands[cmd] = rpchandler
    return True
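For illustration, a minimal handler might be installed like this (the 'ping' command and ping_handler are hypothetical; the RPC_* constants and register_rpchandler() are the ones defined above):

def ping_handler(data, eof, sock, address):
    # Process incrementally: ask for more data until the message ends
    if not eof:
        return RPC_MORE_DATA
    sock.send('pong\n')
    return RPC_CLOSE

register_rpchandler('ping', ping_handler)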
Example no. 2
    def get_expiring_file(self, dt=None, rel=None):
        """ Create a temp file, which expires at a given time. The temp file
            is stored under user's proximate directory. The file will expire
            (be deleted) after the given time. The actual deletion time is
            not very accurate.

            dt is a point in time, which is an instance of datetime.datetime.
            If dt == None, it is assumed to be now. If rel == None,
            it is assumed to be zero. Otherwise it is assumed to be a
            relative delay with respect to dt.
            rel is an instance of datetime.timedelta.

            Hint: Use scheduler.DAY and scheduler.SECOND to specify relative
            times """

        assert(dt == None or isinstance(dt, datetime))
        assert(rel == None or isinstance(rel, timedelta))

        if dt == None:
            dt = datetime.now()
        if rel != None:
            dt = dt + rel
        # ISO date: YYYY-MM-DD-s, where s is a number of seconds in the day
        isodate = str(dt.date())
        seconds = str(dt.hour * 3600 + dt.minute * 60 + dt.second)
        prefix = '%s-%s-%s-' % (self.EXPIRE_PREFIX, isodate, seconds)
        directory = self.community.get_user_dir()
        try:
            (fd, fname) = mkstemp(prefix=prefix, dir=directory)
        except OSError:
            warning('expiring_file: mkstemp() failed\n')
            return None
        xclose(fd)
        return fname
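Hypothetical usage, assuming 'plugin' is an instance of the class defining get_expiring_file():

from datetime import datetime, timedelta

# Expires roughly one day from now
fname = plugin.get_expiring_file(rel=timedelta(days=1))
# Expires 30 seconds after a fixed point in time
fname = plugin.get_expiring_file(dt=datetime(2010, 1, 1), rel=timedelta(seconds=30))
if fname == None:
    warning('Could not create an expiring file\n')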
Example no. 3
def rna_k_values(support, dataset_data, log):
    rna_rl = support.get_reads_length(dataset_data, log, ["merged reads"])
    upper_k = int(rna_rl / 2) - 1
    if upper_k % 2 == 0:
        upper_k -= 1

    lower_k = min(max(int(rna_rl / 3), options_storage.RNA_MIN_K), options_storage.RNA_MAX_LOWER_K)
    if lower_k % 2 == 0:
        lower_k -= 1

    use_iterative = True
    if upper_k <= lower_k:
        use_iterative = False

    if upper_k < options_storage.RNA_MIN_K:
        support.warning("\nauto K value (%d) is too small, recommended to be at least %d.\n" % (upper_k, options_storage.RNA_MIN_K))
        if rna_rl <= options_storage.RNA_MIN_K:
            support.warning(
                "read length is too small (%d), keeping the current K value anyway. Consider setting K manually.\n" % (
                    rna_rl))
        else:
            upper_k = options_storage.RNA_MIN_K
        log.info("Upper K value is set to %d.\n" % (upper_k))

    if upper_k > options_storage.MAX_K:
        log.info("\nAuto K value (%d) is too large, all K values should not exceed %d. Setting k=%d.\n"
                 % (upper_k, options_storage.MAX_K, options_storage.MAX_K))
        upper_k = options_storage.MAX_K

    if not use_iterative:
        return [upper_k]
    return [lower_k, upper_k]
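To make the arithmetic concrete, a standalone sketch for an assumed read length of 100 bp (the bounds 25 and 55 are illustrative stand-ins for options_storage.RNA_MIN_K and options_storage.RNA_MAX_LOWER_K, not their real values):

rna_rl = 100
RNA_MIN_K, RNA_MAX_LOWER_K = 25, 55               # assumed bounds

upper_k = int(rna_rl / 2) - 1                     # 49, already odd
if upper_k % 2 == 0:
    upper_k -= 1

lower_k = min(max(int(rna_rl / 3), RNA_MIN_K), RNA_MAX_LOWER_K)
if lower_k % 2 == 0:                              # 33, already odd
    lower_k -= 1

print([lower_k, upper_k])                         # [33, 49]: iterative run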
Example no. 4
 def gen_key_pair_priv_cb(data, ctx):
     if not data:
         warning('keymanagement: Could not generate a key pair\n')
         cb(None, None)
     else:
         xrun([self.sslname, 'rsa', '-pubout'], gen_key_pair_pub_cb,
             data, data)
Example no. 5
 def send_lowlevel(self, user, data):
     ip = user.get('ip')
     port = user.get('port')
     if ip == None or port == None:
         warning('fetcher: No ip/port to open %s\n' % (user.tag()))
         return
     send_broadcast(ip, port, data)
Example no. 6
    def add_msg(self, msg, set_head=False):
        parentid = msg.get_parentid()
        msgid = msg.get_msgid()

        if msgid in self.msgdict:
            warning('add_msg(): Attempted to add same message twice\n')
            return

        # add to msgdict
        self.msgdict[msgid] = msg

        # update children-list of parent
        if parentid != '':
            # Create a new list of children if it does not exist
            parent_children = self.childdict.setdefault(parentid, [])
            parent_children.append(msgid)

        has_parent = self.msgdict.has_key(parentid)
        children = self.childdict.get(msgid, [])

        # if parent of this node is not in msgdict, this is new root node
        if not has_parent:
            self.roots.append(msgid)

        # join trees by removing roots of child trees
        for childid in children:
            self.roots.remove(childid)

        if set_head:
            self.headid = msgid
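A self-contained sketch of why out-of-order arrival works: a child added before its parent becomes a temporary root, and adding the parent later joins the trees. FakeMsg is a stand-in for the real message class, and 'store' is assumed to be an instance of the class defining add_msg():

class FakeMsg:
    def __init__(self, msgid, parentid=''):
        self.msgid, self.parentid = msgid, parentid
    def get_msgid(self):
        return self.msgid
    def get_parentid(self):
        return self.parentid

store.add_msg(FakeMsg('b', parentid='a'))  # 'b' becomes a root for now
store.add_msg(FakeMsg('a'))                # 'a' adopts 'b'; roots == ['a']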
Example no. 7
    def got_community_profiles(self, user, reply, ctx):
        if reply == None:
            return

        validator = {
            'cname': [ZERO_OR_MORE, str],
            'profile': [ZERO_OR_MORE, {}]
           }
        if not validate(validator, reply):
        warning('Invalid community profiles reply: %s\n' % str(reply))
            return

        communities = self.get_user_communities(user)

        for (cname, profile) in zip(reply['cname'], reply['profile']):
            if cname == DEFAULT_COMMUNITY_NAME:
                continue
            com = self.get_ordinary_community(cname)
            if com in communities:
                self.update_community_profile(com, user, profile)
                communities.remove(com)

        # Do icon requests for the rest of communities
        for com in communities:
            if com.get('name') != DEFAULT_COMMUNITY_NAME:
                self.request_com_icon(user, com)
Example no. 8
def save_community_icon(com, icon):
    # personal communities can have arbitrarily large icons because the
    # picture is not sent over the network
    if com.get('peer') and len(icon) > TP_MAX_FACE_SIZE:
        warning('Community %s has too large icon picture: %d\n' %(com.get('name'), len(icon)))
        return False
    return save_image(get_community_icon_name(com, legacyname=False), icon)
Example no. 9
def xmkdir(dirname, mode = 0700):
    try:
        mkdir(dirname, mode)
    except OSError, (errno, strerror):
        if errno != EEXIST:
            warning('Can not create a directory: %s\n' %(dirname))
            return False
    return True
Example no. 10
def check_dir_is_empty(dir_name):
    if dir_name is not None and \
            os.path.exists(dir_name) and \
            os.listdir(dir_name):
        support.warning(
            "output dir is not empty! Please clean the output directory before running."
        )
Example no. 11
def init():
    """ Bind a default and a random port.
        The random port is used for local network communication.
        The default port is used to establish remote connections. """

    global community
    community = get_plugin_by_type(PLUGIN_TYPE_COMMUNITY)

    create_tcp_listener(DEFAULT_PROXIMATE_PORT, tcp_listener_accept, reuse=True)

    success = False
    for i in xrange(PORT_RETRIES):
        port = community.get_rpc_port()
        if port == DEFAULT_PROXIMATE_PORT:
            continue
        (rfd, tag) = create_tcp_listener(port, tcp_listener_accept, reuse=True)
        if rfd != None:
            info('Listening to TCP connections on port %d\n' %(port))
            success = True
            break
        warning('Can not bind to TCP port %d\n' %(port))
        # Generate a new port number so that next iteration will not fail
        if not community.gen_port():
            break

    if not success:
        warning('Can not listen to TCP connections\n')
Example no. 12
    def handle_rpc_message(self, data, eof):
        if len(data) == 0:
            self.close()
            return False

        cmd = data[0:TP_MAX_CMD_NAME_LEN].split('\n')[0]

        rpchandler = rpc_commands.get(cmd)
        if rpchandler == None:
            self.close()
            return False

        payload = data[(len(cmd) + 1):]

        status = rpchandler(cmd, payload, eof, self.sock, self.address)
        ret = False
        if status == RPC_MORE_DATA:
            ret = True
        elif status == RPC_CLOSE:
            self.close()
        elif status == RPC_RELEASE:
            # We are no longer interested in gobject events
            self.remove_io_notifications()
        else:
            self.close()
            warning('Unknown RPC value: %s\n' %(str(status)))
        return ret
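The implied wire format is a command line followed by the payload; a sketch of the parsing step above (TP_MAX_CMD_NAME_LEN is given an assumed value here):

TP_MAX_CMD_NAME_LEN = 64                             # assumed limit
message = 'FETCH\n' + 'bencoded payload...'
cmd = message[0:TP_MAX_CMD_NAME_LEN].split('\n')[0]  # 'FETCH'
payload = message[(len(cmd) + 1):]                   # 'bencoded payload...'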
Example no. 13
 def chat_cb(self, widget):
     uid = self.msg.get('src')
     user = community.get_user(uid)
     if user == community.get_myself():
         warning('Trying to chat with yourself\n')
         return None
     chat.messaging_gui.start_messaging(user, False)
Example no. 14
def read_communities():
    global communities
    if proximatedir == None:
        warning('No Proximate directory\n')
        return

    # Read community metadata
    for dentry in os.listdir(proximatedir):
        if not dentry.startswith('c_'):
            continue
        if str_to_int(dentry[2:], None) == None:
            continue
        cdir = '%s/%s' %(proximatedir, dentry)
        if not os.path.isdir(cdir):
            continue
        cfile = '%s/profile' %(cdir)

        community = Community()
        try:
            f = open(cfile, 'r')
        except IOError:
            continue
        profile = f.read()
        f.close()
        if community.read_profile(profile):
            communities[community.get('cid')] = community

    defcom = get_ordinary_community(DEFAULT_COMMUNITY_NAME)
    if defcom == None:
        create_community(DEFAULT_COMMUNITY_NAME)
Example no. 15
    def add_or_update_user(self, uid, updatelist, profileversion, ip, port, profile=None):
        user = get_user(uid)
        newuser = (user == None)
        if newuser:
            user = create_user(uid)
            if not user:
                warning('community: Unable to create a new user %s\n' % uid)
                return

        if ip != None:
            user.set('ip', ip)
            user.set('port', port)

        if newuser or user.get('v') != profileversion:
            user.update_attributes(updatelist, user.get('v'))

            if profile != None:
                self.got_user_profile(user, profile, None)
            elif not user.inprogress:
                debug('Fetching new profile from user %s\n' % user.tag())
                request = {'t': 'uprofile'}
                if self.fetcher.fetch(user, PLUGIN_TYPE_COMMUNITY, request, self.got_user_profile):
                    user.inprogress = True

        elif not user.present and not user.inprogress:
            # User appears and user profile is already up-to-date
            self.request_user_icon(user)
            self.fetch_community_profiles(user)

        if user.update_presence(True):
            self.announce_user(user)
Example no. 16
    def connect(self):
        ip = self.user.get('ip')
        port = self.user.get('port')

        if not community.get_network_state(community.IP_NETWORK):
            # Act as if we were missing the IP network
            warning('fetcher: IP network disabled\n')
            ip = None

        if ip == None or port == None:
            warning('fetcher: No ip/port to open %s\n' % (self.user.tag()))
            return False

        debug('fetcher: open from %s: %s:%s\n' % (self.user.tag(), ip, port))

        if self.openingconnection == False or self.q.connect((ip, port), TP_CONNECT_TIMEOUT) == False:
            return False

        # The first write is seen by the opposite side's RPC handler, not TCP_Queue
        prefix = '%s\n' %(TP_FETCH_RECORDS)
        self.q.write(prefix, writelength=False)

        self.q.write(fetcher.encode(firstmsg, -1, ''))

        # Close queue that is idle for a period of time. This is also the
        # maximum processing time for pending requests. Requests taking
        # longer than this must use other state tracking mechanisms.
        self.q.set_timeout(TP_FETCH_TIMEOUT)
        return True
Example no. 17
    def __init__(self):
        DBusGMainLoop(set_as_default=True)

        self.bus = SystemBus()
        self.sessionbus = SessionBus()
        try:
            self.mce = self.bus.get_object("com.nokia.mce", "/com/nokia/mce")
        except DBusException:
            warning("Nokia MCE not found. Vibra is disabled\n")
            return

        self.profiled = self.sessionbus.get_object("com.nokia.profiled", "/com/nokia/profiled")

        self.sessionbus.add_signal_receiver(
            self.profile_changed_handler,
            "profile_changed",
            "com.nokia.profiled",
            "com.nokia.profiled",
            "/com/nokia/profiled",
        )

        profile = self.profiled.get_profile(dbus_interface="com.nokia.profiled")
        self.get_vibra_enabled(profile)

        self.register_plugin(PLUGIN_TYPE_VIBRA)
Example no. 18
def xrun(cmd, cb, ctx, inputdata=None):
    """ Run 'cmd' (a list of command line arguments). Call cb(data, ctx),
    when the command finishes with its output given to cb() at parameter
    'data'. If 'inputdata' is given, feed it to the command from stdin.

    The result is relayed through the gobject mainloop.
    """

    (rfd, wfd) = xpipe()
    if rfd < 0:
        return False
    pid = xfork()
    if pid == -1:
        warning('Could not fork a new process\n')
        return False
    if pid != 0:
        xclose(wfd)
        return xrunwatch(rfd, cb, ctx, pid)

    xclose(rfd)

    w = fdopen(wfd, 'w')

    try:
        pipe = Popen(cmd, stdout=PIPE, stdin=PIPE)
    except OSError:
        warning('Unable to run %s\n' %(' '.join(cmd)))
        w.write('-1\0')
        abort()

    w.write(str(pipe.pid) + '\0')
    w.flush()

    if inputdata:
        try:
            pipein = pipe.stdin
            pipein.write(inputdata)
        except IOError:
            warning("IOError while writing to command %s\n" %(' '.join(cmd)))
            abort()
        pipein.close()

    try:
        pipeout = pipe.stdout
        result = pipeout.read()
    except IOError:
        warning("IOError while reading from command %s\n" %(' '.join(cmd)))
        abort()
    pipeout.close()

    pipe.wait()

    if pipe.returncode != 0:
        warning('%s did not exit cleanly\n' %(' '.join(cmd)))
        abort()

    w.write(result)
    w.close()
    abort()
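A hypothetical call, assuming a gobject mainloop is running: pipe a string through the system 'sha1sum' command and receive its output asynchronously in a callback:

def got_digest(data, ctx):
    if data == None:
        warning('sha1sum failed\n')
        return
    info('digest of %s: %s\n' % (ctx, data.split()[0]))

xrun(['sha1sum'], got_digest, 'greeting', inputdata='hello world\n')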
Example no. 19
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one):
    data_dir = os.path.join(cfg.output_dir, "K%d" % K)
    stage = BASE_STAGE
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")
    cfg_file_name = os.path.join(dst_configs, "config.info")

    if options_storage.continue_mode:
        if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and
            (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))):
            log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)")
            return
        if options_storage.restart_from and options_storage.restart_from.find(":") != -1:
            stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:]
        support.continue_from_here(log)

    if stage != BASE_STAGE:
        if not os.path.isdir(saves_dir):
            support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir))
    else:
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.makedirs(data_dir)

        shutil.copytree(os.path.join(configs_dir, "debruijn"), dst_configs)
        # removing template configs
        for root, dirs, files in os.walk(dst_configs):
            for cfg_file in files:
                cfg_file = os.path.join(root, cfg_file)
                if cfg_file.endswith('.info.template'):
                    if os.path.isfile(cfg_file.split('.template')[0]):
                        os.remove(cfg_file)
                    else:
                        os.rename(cfg_file, cfg_file.split('.template')[0])

    log.info("\n== Running assembler: " + ("K%d" % K) + "\n")
    if prev_K:
        additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta")
        if not os.path.isfile(additional_contigs_fname):
            support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log)
            additional_contigs_fname = None
    else:
        additional_contigs_fname = None
    if "read_buffer_size" in cfg.__dict__:
        construction_cfg_file_name = os.path.join(dst_configs, "construction.info")
        process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log)
    prepare_config_spades(cfg_file_name, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one)

    command = [os.path.join(execution_home, "spades"), cfg_file_name]

## this code makes sense for src/debruijn/simplification.cpp: corrected_and_save_reads() function which is not used now
#    bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads")
#    if os.path.isdir(bin_reads_dir):
#        if glob.glob(os.path.join(bin_reads_dir, "*_cor*")):
#            for cor_filename in glob.glob(os.path.join(bin_reads_dir, "*_cor*")):
#                cor_index = cor_filename.rfind("_cor")
#                new_bin_filename = cor_filename[:cor_index] + cor_filename[cor_index + 4:]
#                shutil.move(cor_filename, new_bin_filename)
    support.sys_call(command, log)
Example no. 20
def create_user_communities(user):
    for cname in user.get('communities'):
        if not valid_community(cname):
            warning('Invalid community: %s\n' %(cname))
            continue
        if get_ordinary_community(cname) == None:
            community = create_community(cname)
            save_communities([community])
Example no. 21
def tcp_listener_accept(rfd, conditions):
    try:
        (sock, address) = rfd.accept()
    except socket.error, (errno, strerror):
        ret = (errno == EAGAIN or errno == EINTR)
        if not ret:
            warning('Listener: Socket error (%s): %s\n' % (errno, strerror))
        return ret
Example no. 22
    def handle_message(self, user, sm):
        """ Handle messages that were found from other users' fileshares """

        if not self.validate_message(sm):
            sm["ttl"] = 0
            warning("msgboard: Invalid message: %s\n" % str(sm))
            return
        warning("New message: %s\n" % sm["subject"])
Example no. 23
 def udp_listener_read(self, rfd, condition):
     try:
         data, address = rfd.recvfrom(2048)
     except socket.error, (errno, strerror):
         ret = (errno == EAGAIN or errno == EINTR)
         if not ret:
             warning('Socket error (%s): %s\n' % (errno, strerror))
         return ret
Example no. 24
 def sym_enc(self, plain, passphrase):
     """ Encrypts message with AES using given passphrase """
     ciph = xsystem([self.sslname, self.symmetric, '-e', '-pass', 'stdin'],
         passphrase + '\n' + plain)
     if not ciph:
         warning('keymanagement: Unable to perform symmetric encryption\n')
         return None
     return ciph
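Hypothetical round trip, assuming 'km' is the key management instance, a matching sym_dec() decryption method exists, and self.symmetric names an openssl cipher such as 'aes-256-cbc':

ciph = km.sym_enc('secret message', 'my passphrase')
if ciph != None:
    plain = km.sym_dec(ciph, 'my passphrase')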
Example no. 25
    def find_significant_features(self):
        """
        Finds features in the face, gets their scores and filters the least significant ones.
        In the end, self.s_features will have a list of significant feature names, ordered from high to low.
        """
        # Load the heat map and blend with user image
        self._load_heat_map()
        self._create_blended()

        # Find significant features
        feature_coords = dict()
        for feature in FEATURES_CASCADES:
            feature_coords[feature] = self._find_feature(
                feature, FEATURES_CASCADES[feature])

        if not len(feature_coords):
            self.err_code = 6
            self.err_msg = "No significant features found."
            warning(self.err_msg)

        # Find eyebrows and forehead if eyes were found
        if len(feature_coords['eyes']) == 2:
            eyebrows = self._find_eyebrows(feature_coords['eyes'])
            # If found, find forehead and adjust eyes
            if len(eyebrows) == 2:
                x_eb_l, y_eb_l, w_eb_l, h_eb_l = eyebrows[0]
                x_eb_r, y_eb_r, w_eb_r, h_eb_r = eyebrows[1]
                y_eb = min(y_eb_l, y_eb_r)
                x_eb = x_eb_l
                w_eb = x_eb_r + w_eb_r - x_eb_l
                # Forehead
                feature_coords['forehead'] = [(x_eb, 0, w_eb, y_eb)]
                # Subtract half of eyebrows height from eyes
                x_e_l, y_e_l, w_e_l, h_e_l = feature_coords['eyes'][0]
                x_e_r, y_e_r, w_e_r, h_e_r = feature_coords['eyes'][1]
                h_e_l = y_e_l + h_e_l - int(y_eb_l + h_eb_l / 2)
                h_e_r = y_e_r + h_e_r - int(y_eb_r + h_eb_r / 2)
                y_e_l = int(y_eb_l + h_eb_l / 2)
                y_e_r = int(y_eb_r + h_eb_r / 2)
                feature_coords['eyes'] = [(x_e_l, y_e_l, w_e_l, h_e_l),
                                          (x_e_r, y_e_r, w_e_r, h_e_r)]
                feature_coords['eyebrows'] = eyebrows

        for feature in feature_coords:
            coords = feature_coords[feature]
            # Feature not found
            if not len(coords):
                continue
            debug("Feature coords: {}".format(coords))
            self.s_features.append((feature, self._get_feature_score(coords)))
        # Sort features from highest to lowest
        self.s_features.sort(reverse=True, key=lambda f: f[1][-1])
        debug("Found features before filtering: {}".format(self.s_features))
        # Filter out indistinct features and leave only names
        self.s_features = [
            feature[0] for feature in self.s_features
            if self._is_distinct_feature(feature[1])
        ]
Example no. 26
def open_to(ptype, l, fullname):
    (exe, f) = find_exe(l)
    if exe == None:
        warning('Can not open a %s\n' %(ptype))
        return False
    warning('Open %s to %s %s\n' %(fullname, ptype, exe))
    args = f(fullname)
    os.spawnlp(os.P_NOWAIT, exe, exe, *args)
    return True
Example no. 27
 def read(self, fd, condition, this):
     try:
         chunk = self.sock.recv(4096)
     except socket.error, (errno, strerror):
         ret = (errno == EAGAIN or errno == EINTR)
         if not ret:
             warning('Listener: Read error (%s): %s\n' %(errno, strerror))
             self.close()
         return ret
Example no. 28
 def encode(self, msg, rid, rt):
     if type(msg) != dict:
         warning('fetcher: message must be a dictionary: %s\n' %(str(msg)), printstack=True)
         return None
     msg.setdefault('v', 0)
     msg.setdefault('t', '')
     msg['rid'] = rid
     msg['rt'] = rt
     return bencode(msg)
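Illustrative call, assuming 'f' is a fetcher instance; the field meanings ('rid' as request id, 'rt' as reply tag) are inferred from the surrounding examples, not documented here:

wire = f.encode({'t': 'uprofile'}, 7, '')
# wire is the bencoded form of {'v': 0, 't': 'uprofile', 'rid': 7, 'rt': ''}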
Example no. 29
 def watch(self, fd, condition):
     try:
         bytes = read(fd, 4096)
     except OSError, (errno, strerror):
         if errno == EAGAIN or errno == EINTR:
             return True
         warning('xrun: Surprising error code: %d %s\n' %(errno, strerror))
         self.finish(None, True)
         return False
Example no. 30
 def handle_community_profile_fetch(self, user, request):
     cname = request.get('cname')
     if type(cname) != str:
         warning('Invalid community profile fetch: %s\n' % str(request))
         return None
     community = self.get_ordinary_community(cname)
     if community == None:
         return None
     return {'cprofile': community.serialize()}
Example no. 31
def read_users():
    if proximatedir == None:
        warning('No Proximate directory\n')
        return

    for dentry in os.listdir(proximatedir):
        uid = parse_user_dentry(dentry)
        if uid != None:
            read_user_profile(uid)
Example no. 32
def save_image(fname, image):
    if fname == None:
        return False
    basename = os.path.basename(fname)
    tmpname = fname + '.tmp'
    try:
        f = open(tmpname, 'w')
    except IOError, (errno, strerror):
        warning('Can not save face to %s: %s\n' %(tmpname, strerror))
        return False
Example no. 33
    def create_udp_listener(self):
        port = community.get_rpc_port()
        info('fetcher: Listening to UDP port %d\n' % port)
        rfd = create_udp_socket('', port, False, reuse = True)
        if rfd == None:
            warning('Can not listen to UDP broadcasts\n')
            return

        rfd.setblocking(False)
        io_add_watch(rfd, IO_IN, self.udp_listener_read)
Example no. 34
def move_dataset_files(dataset_data,
                       dst,
                       ext_python_modules_home,
                       max_threads,
                       log,
                       gzip=False):
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                moved_reads_files = []
                for reads_file in value:
                    dst_filename = os.path.join(dst,
                                                os.path.basename(reads_file))
                    # TODO: fix problem with files with the same basenames in Hammer binary!
                    if not os.path.isfile(reads_file):
                        if (not gzip and os.path.isfile(dst_filename)) or (
                                gzip and os.path.isfile(dst_filename + '.gz')):
                            support.warning(
                                'file with corrected reads (' + reads_file +
                                ') is the same in several libraries', log)
                            if gzip:
                                dst_filename += '.gz'
                        else:
                            support.error(
                                'something went wrong and file with corrected reads ('
                                + reads_file + ') is missing!', log)
                    else:
                        shutil.move(reads_file, dst_filename)
                        if gzip:
                            to_compress.append(dst_filename)
                            dst_filename += '.gz'
                    moved_reads_files.append(dst_filename)
                reads_library[key] = moved_reads_files
    if len(to_compress):
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([
                    pigz_path, '-f', '-7', '-p',
                    str(max_threads), reads_file
                ], log)
        else:
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(
                delayed(support.sys_call)(['gzip', '-f', '-7', reads_file])
                for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
Example no. 35
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one):
    data_dir = os.path.join(cfg.output_dir, "K%d" % K)
    stage = BASE_STAGE
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")

    if options_storage.continue_mode:
        if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and
            (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))):
            log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)")
            return
        if options_storage.restart_from and options_storage.restart_from.find(":") != -1 \
                and options_storage.restart_from.startswith("k%d:" % K):
            stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:]
        support.continue_from_here(log)

    if stage != BASE_STAGE:
        if not os.path.isdir(saves_dir):
            support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir))
    else:
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.makedirs(data_dir)

        dir_util._path_created = {}  # see http://stackoverflow.com/questions/9160227/dir-util-copy-tree-fails-after-shutil-rmtree
        dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False)

    log.info("\n== Running assembler: " + ("K%d" % K) + "\n")
    if prev_K:
        additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta")
        if not os.path.isfile(additional_contigs_fname):
            support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log)
            additional_contigs_fname = None
    else:
        additional_contigs_fname = None
    if "read_buffer_size" in cfg.__dict__:
        #FIXME why here???
        process_cfg.substitute_params(os.path.join(dst_configs, "construction.info"), {"read_buffer_size": cfg.read_buffer_size}, log)
    if "scaffolding_mode" in cfg.__dict__:
        #FIXME why here???
        process_cfg.substitute_params(os.path.join(dst_configs, "pe_params.info"), {"scaffolding_mode": cfg.scaffolding_mode}, log)

    prepare_config_rnaspades(os.path.join(dst_configs, "rna_mode.info"), log)
    prepare_config_construction(os.path.join(dst_configs, "construction.info"), log)
    cfg_fn = os.path.join(dst_configs, "config.info")
    prepare_config_spades(cfg_fn, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home)

    command = [os.path.join(execution_home, "spades-core"), cfg_fn]

    add_configs(command, dst_configs)

    #print("Calling: " + " ".join(command))
    support.sys_call(command, log)
Example no. 36
def update_k_mers_in_special_cases(cur_k_mers, RL, log, silent=False):
    if options_storage.auto_K_allowed():
        if RL >= 250:
            if not silent:
                support.warning("Default k-mer sizes were set to %s because estimated "
                                "read length (%d) is equal to or greater than 250" % (str(options_storage.K_MERS_250), RL), log)
            return options_storage.K_MERS_250
        if RL >= 150:
            if not silent:
                support.warning("Default k-mer sizes were set to %s because estimated "
                                "read length (%d) is equal to or greater than 150" % (str(options_storage.K_MERS_150), RL), log)
            return options_storage.K_MERS_150
    return cur_k_mers
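Sketch of the effect when auto K selection is allowed (the input list is illustrative):

k_mers = update_k_mers_in_special_cases([21, 33, 55], 250, log)
# -> options_storage.K_MERS_250, with a warning logged
k_mers = update_k_mers_in_special_cases([21, 33, 55], 100, log, silent=True)
# -> [21, 33, 55], unchanged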
Example no. 37
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one):
    data_dir = os.path.join(cfg.output_dir, "K%d" % K)
    stage = BASE_STAGE
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")
    cfg_file_name = os.path.join(dst_configs, "config.info")

    if options_storage.continue_mode:
        if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and
            (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))):
            log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)")
            return
        if options_storage.restart_from and options_storage.restart_from.find(":") != -1:
            stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:]
        support.continue_from_here(log)

    if stage != BASE_STAGE:
        if not os.path.isdir(saves_dir):
            support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir))
    else:
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.makedirs(data_dir)

        dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False)
        # removing template configs
        for root, dirs, files in os.walk(dst_configs):
            for cfg_file in files:
                cfg_file = os.path.join(root, cfg_file)
                if cfg_file.endswith('.info.template'):
                    if os.path.isfile(cfg_file.split('.template')[0]):
                        os.remove(cfg_file)
                    else:
                        os.rename(cfg_file, cfg_file.split('.template')[0])

    log.info("\n== Running assembler: " + ("K%d" % K) + "\n")
    if prev_K:
        additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta")
        if not os.path.isfile(additional_contigs_fname):
            support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log)
            additional_contigs_fname = None
    else:
        additional_contigs_fname = None
    if "read_buffer_size" in cfg.__dict__:
        construction_cfg_file_name = os.path.join(dst_configs, "construction.info")
        process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log)
    prepare_config_spades(cfg_file_name, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home)

    command = [os.path.join(execution_home, "spades"), cfg_file_name]
    support.sys_call(command, log)
Example no. 38
def update_k_mers_in_special_cases(cur_k_mers, RL, log):
    if not options_storage.k_mers and not options_storage.single_cell:  # kmers were set by default and not SC
        if RL >= 250:
            support.warning(
                "Default k-mer sizes were set to %s because estimated "
                "read length (%d) is equal or great than 250" %
                (str(options_storage.k_mers_250), RL), log)
            return options_storage.k_mers_250
        if RL >= 150:
            support.warning(
                "Default k-mer sizes were set to %s because estimated "
                "read length (%d) is equal or great than 150" %
                (str(options_storage.k_mers_150), RL), log)
            return options_storage.k_mers_150
    return cur_k_mers
Example no. 39
    def from_file(self, input_filename):
        with open(input_filename) as f:
            line_number = 0
            for line in f:
                line = line.strip()
                pos = line.find('#')
                if pos != -1:
                    line = line[:pos]
                if line == '':
                    continue

                if len(line.split(':')) > 2:
                    support.warning(
                        "In parameters file in line number " +
                        str(line_number) +
                        " two ':'. May be error? Everything after second ':' was ignored.")

                identity = line.split(':')[0].strip().lower()
                value = line.split(':')[1].strip()
                if identity == 'output directory':
                    self.output_dir = value
                elif identity == 'resume from':
                    self.resume_dir = value if value.lower() != 'none' else None
                elif identity == 'only models':
                    if value.lower() == 'true':
                        self.only_models = True
                    elif  value.lower() == 'false':
                        self.only_models = False
                    else:
                        self.only_models = None
                elif identity == 'input file':
                    self.input_file = value
                elif identity == 'population labels':
                    self.pop_labels = value if value.lower() != 'none' else None
                elif identity == 'projections':
                    self.ns = value if value.lower() != 'none' else None
                elif identity == 'theta0':
                    self.theta = float(
                        value) if value.lower() != 'none' else None
                elif identity == 'time for generation':
                    self.gen_time = float(
                        value) if value.lower() != 'none' else None
                elif identity == 'multinom':
                    self.multinom = True if value.lower() == 'true' else False
                elif identity == 'initial structure':
                    self.initial_structure = value
                elif identity == 'final structure':
                    self.final_structure = value
                elif identity == 'relative parameters':
                    self.relative_params = value.lower() == 'true'
                elif identity == 'no migrations':
                    self.no_mig = value.lower() == 'true'
                elif identity == 'size of population in ga':
                    self.size_of_generation = int(value)
                elif identity == 'fractions in ga':
                    self.fracs = value
                elif identity == 'mean mutation strength':
                    self.mutation_strength = float(value)
                elif identity == 'mean mutation rate':
                    self.mutation_rate = float(value)
                elif identity == 'const for mutation rate':
                    self.const_for_mut_rate = float(value)
                elif identity == 'const for mutation strength':
                    self.const_for_mut_strength = float(value)
                elif identity == 'epsilon':
                    self.epsilon = float(value)
                elif identity == 'stop iteration':
                    self.stop_iter = int(value)
                elif identity == 'pts':
                    self.dadi_pts = value if value.lower() != 'none' else None
                elif identity == 'use moments or dadi':
                    if value == 'moments':
                        self.moments_scenario = True
                    else:
                        self.moments_scenario = False
                elif identity == 'draw models every n iteration':
                    self.draw_iter = int(value)
                elif identity == "print models' code every n iteration":
                    self.code_iter = int(value)
                elif identity == 'units of time in drawing':
                    if value.lower() == 'years':
                        self.gen_time_units = 1
                    elif value.lower() == 'kya' or value.lower() == 'thousand years':
                        self.gen_time_units = 1000
                    else:
                        support.warning(
                            'Cannot understand units of time in line ' +
                            str(line_number) + ' in the parameters file. Years were assumed.')
                elif identity == 'silence':
                    self.silence = value.lower() == 'true'
                elif identity == 'number of repeats':
                    self.repeats = int(value)
                elif identity == 'number of processes':
                    self.processes = int(value)
                elif identity == 'upper bound of first split':
                    self.split_1_lim = float(
                        value) if value.lower() != 'none' else None
                elif identity == 'upper bound of second split':
                    self.split_2_lim = float(
                        value) if value.lower() != 'none' else None
                elif identity == 'name of local optimization':
                    self.optimize_name = value if value.lower() != 'none' else None
                    names = [
                        'optimize',
                        'optimize_log',
                        'optimize_powell',
                        'optimize_lbfgsb',
                        'optimize_log_lbfgsb',
                        'optimize_log_fmin',
                        'hill_climbing']
                    if value not in names:
                        support.error(
                            "Can't parse name of local search. Acceptable names are: " +
                            ', '.join(names))
                # now extra args
                elif identity == 'min_n':
                    self.min_N = float(value)
                elif identity == 'max_n':
                    self.max_N = float(value)
                elif identity == 'min_t':
                    self.min_T = float(value)
                elif identity == 'max_t':
                    self.max_T = float(value)
                elif identity == 'min_m':
                    self.min_M = float(value)
                elif identity == 'max_m':
                    self.max_M = float(value)
                elif identity == 'verbose':
                    self.ls_verbose = None if value.lower() == 'none' else int(value)
                elif identity == 'flush delay':
                    self.ls_flush_delay = float(value)
                elif identity == 'epsilon for ls':
                    self.ls_epsilon = float(value)
                elif identity == 'gtol':
                    self.ls_gtol = float(value)
                elif identity == 'maxiter':
                    self.ls_maxiter = None if value.lower() == 'none' else int(value)
                elif identity == 'mean mutation rate for hc':
                    self.hc_mutation_rate = None if value.lower() == 'none' else float(value)
                elif identity == 'const for mutation rate for hc':
                    self.hc_const_for_mutation_rate = None if value.lower() == 'none' else float(value)
                elif identity == 'stop iteration for hc':
                    self.hc_stop_iter = None if value.lower() == 'none' else float(value)
                elif identity == 'multinomial mutation':
                    self.multinom_mutation = value.lower() == 'true'
                elif identity == 'multinomial crossing':
                    self.multinom_cross = value.lower() == 'true'
                elif identity == 'random n_a':
                    self.random_N_A = value.lower() == 'true'
                elif identity == 'time to print summary':
                    self.time_for_print = float(value)
                elif identity == 'distribution':
                    self.distribution = value.lower()
                elif identity == 'std':
                    self.std = None if value.lower() == 'none' else float(value)
                elif identity == 'only sudden':
                    self.only_sudden = value.lower() == 'true'
                elif identity == 'custom filename':
                    self.model_func_file = value if value.lower() != 'none' else None
                elif identity == 'lower bounds':
                    self.lower_bound = value if value.lower() != 'none' else None
                elif identity == 'upper bounds':
                    self.upper_bound = value if value.lower() != 'none' else None
                elif identity == 'parameter identifiers':
                    self.p_ids = value if value.lower() != 'none' else None
                elif identity == "linked snp's" or identity == "linked snp":
                    self.linked_snp = value.lower() == 'true'
                elif identity == "unlinked snp's" or identity == "unlinked snp":
                    self.linked_snp = value.lower() == 'false'
                elif identity == 'directory with bootstrap' or identity == 'directory of bootstrap':
                    self.boot_dir = value if value.lower() != 'none' else None
                else:
                    support.error(
                        'Cannot recognize identifier: ' +
                        str(line.split(':')[0].strip()))

                line_number += 1
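An illustrative fragment of the parameter file this parser accepts: keys are matched case-insensitively, '#' starts a comment, and the literal 'none' clears an option (all values below are hypothetical):

# example params file
Output directory : results/run1
Input file       : data/observed.fs
Population labels: pop1, pop2
Theta0           : none
Only models      : false
Use moments or dadi : moments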
Example no. 40
def run_spades(configs_dir, execution_home, cfg, log):
    if not isinstance(cfg.iterative_K, list):
        cfg.iterative_K = [cfg.iterative_K]
    cfg.iterative_K = sorted(cfg.iterative_K)

    bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads")
    if os.path.isdir(bin_reads_dir) and not options_storage.continue_mode:
        shutil.rmtree(bin_reads_dir)

    if len(cfg.iterative_K) == 1:
        run_iteration(configs_dir, execution_home, cfg, log,
                      cfg.iterative_K[0], False, True)
        K = cfg.iterative_K[0]
    else:
        run_iteration(configs_dir, execution_home, cfg, log,
                      cfg.iterative_K[0], False, False)
        RL = get_read_length(cfg.output_dir, cfg.iterative_K[0])
        cfg.iterative_K = update_k_mers_in_special_cases(
            cfg.iterative_K, RL, log)
        if cfg.iterative_K[1] + 1 > RL:
            if cfg.paired_mode:
                support.warning(
                    "Second value of iterative K (%d) exceeded estimated read length (%d). "
                    "Rerunning in paired mode for the first value of K (%d)" %
                    (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log)
                run_iteration(configs_dir, execution_home, cfg, log,
                              cfg.iterative_K[0], False, True)
                K = cfg.iterative_K[0]
        else:
            rest_of_iterative_K = cfg.iterative_K
            rest_of_iterative_K.pop(0)
            count = 0
            for K in rest_of_iterative_K:
                count += 1
                last_one = count == len(
                    cfg.iterative_K) or (rest_of_iterative_K[count] + 1 > RL)
                run_iteration(configs_dir, execution_home, cfg, log, K, True,
                              last_one)
                if last_one:
                    break
            if count < len(cfg.iterative_K):
                support.warning(
                    "Iterations stopped. Value of K (%d) exceeded estimated read length (%d)"
                    % (cfg.iterative_K[count], RL), log)

    latest = os.path.join(cfg.output_dir, "K%d" % K)

    if os.path.isfile(os.path.join(latest, "before_rr.fasta")):
        if not os.path.isfile(
                os.path.join(
                    os.path.dirname(cfg.result_contigs),
                    "before_rr.fasta")) or not options_storage.continue_mode:
            shutil.copyfile(
                os.path.join(latest, "before_rr.fasta"),
                os.path.join(os.path.dirname(cfg.result_contigs),
                             "before_rr.fasta"))
    if os.path.isfile(os.path.join(latest, "final_contigs.fasta")):
        if not os.path.isfile(
                cfg.result_contigs) or not options_storage.continue_mode:
            shutil.copyfile(os.path.join(latest, "final_contigs.fasta"),
                            cfg.result_contigs)
    if cfg.paired_mode:
        if os.path.isfile(os.path.join(latest, "scaffolds.fasta")):
            if not os.path.isfile(
                    cfg.result_scaffolds) or not options_storage.continue_mode:
                shutil.copyfile(os.path.join(latest, "scaffolds.fasta"),
                                cfg.result_scaffolds)

    if cfg.developer_mode:
        # before repeat resolver contigs
        # before_RR_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "simplified_contigs.fasta")
        # shutil.copyfile(os.path.join(latest, "simplified_contigs.fasta"), before_RR_contigs)
        # saves
        saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves")
        # os.path.exists() returns False for a broken link; lexists() returns True
        if os.path.lexists(saves_link):
            os.remove(saves_link)
        os.symlink(os.path.join(latest, "saves"), saves_link)

    #    os.remove(cfg.additional_contigs)

    if os.path.isdir(bin_reads_dir):
        shutil.rmtree(bin_reads_dir)

    return latest
Example no. 41
if make_latest_symlink:
    latest_symlink = 'latest'
    if os.path.islink(latest_symlink):
        os.remove(latest_symlink)
    os.symlink(output_dir, latest_symlink)

datasets_dict = dict()

print("Analyzing datasets")
for dataset in datasets:

    try:
        dataset_data = pyyaml.load(file(dataset, 'r'))
    except pyyaml.YAMLError, exc:
        support.warning('skipping ' + dataset +
                        ': exception caught while parsing YAML file (' +
                        options_storage.dataset_yaml_filename + '):\n' +
                        str(exc))
        continue

    dataset_data = support.correct_dataset(dataset_data)
    for id, library in enumerate(dataset_data):
        print("processing lib#" + str(id) + " of " + dataset)
        basename = os.path.splitext(os.path.basename(dataset))[0]
        cur_key = basename
        i = 1
        while datasets_dict.has_key(cur_key):
            cur_key = basename + "_" + str(i)
            i += 1

        cur_reads = []
        for key, value in library.items():
            if key.endswith('reads'):
Example no. 42
def main(args):
    os.environ["LC_ALL"] = "C"

    if len(args) == 1:
        options_storage.usage(spades_version)
        sys.exit(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    options = args
    cfg, dataset_data = fill_cfg(options, log)

    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(
            os.path.join(options_storage.output_dir, "params.txt"), args[0])
        if not options:
            support.error(
                "failed to parse command line of the previous run! Please restart from the beginning or specify another output directory."
            )
        cfg, dataset_data = fill_cfg(options, log)
        if options_storage.restart_from:
            check_cfg_for_restart_from(cfg)
        options_storage.continue_mode = True

    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info(
            "\n======= SPAdes pipeline continued. Log can be found here: " +
            log_filename + "\n")
        log.info("Restored from " + cmd_line)
        if options_storage.restart_from:
            updated_params = ""
            flag = False
            for v in args[1:]:
                if v == '-o' or v == '--restart-from':
                    flag = True
                    continue
                if flag:
                    flag = False
                    continue
                updated_params += " " + v
            updated_params = updated_params.strip()
            log.info("with updated parameters: " + updated_params)
            cmd_line += " " + updated_params
        log.info("")

    params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='w')
    log.addHandler(params_handler)

    if options_storage.continue_mode:
        log.info(cmd_line)
    else:
        command = "Command line:"
        for v in args:
            command += " " + v
        log.info(command)

    # special case
    if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(
            dataset_data, 'paired-end'):
        support.warning(
            'cannot perform mismatch correction without at least one paired-end library! Skipping this step.',
            log)
        del cfg["mismatch_corrector"]

    print_used_values(cfg, log)
    log.removeHandler(params_handler)

    support.check_single_reads_in_options(options, log)

    if not options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline started. Log can be found here: " +
                 log_filename + "\n")

    # splitting interlaced reads and processing Ns in additional contigs if needed
    if support.dataset_has_interlaced_reads(
            dataset_data) or support.dataset_has_additional_contigs(
                dataset_data):
        dir_for_split_reads = os.path.join(options_storage.output_dir,
                                           'split_input')
        if support.dataset_has_interlaced_reads(dataset_data):
            if not os.path.isdir(dir_for_split_reads):
                os.makedirs(dir_for_split_reads)
            dataset_data = support.split_interlaced_reads(
                dataset_data, dir_for_split_reads, log)
        if support.dataset_has_additional_contigs(dataset_data):
            dataset_data = support.process_Ns_in_additional_contigs(
                dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(
            options_storage.output_dir, "input_dataset.yaml")
        pyyaml.dump(dataset_data,
                    open(options_storage.dataset_yaml_filename, 'w'))
        cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename

    try:
        # copying configs before all computations (to prevent them from changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(
                tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            dir_util.copy_tree(os.path.join(spades_home, "configs"),
                               tmp_configs_dir,
                               preserve_times=False)

        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            STAGE_NAME = "Read error correction"
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            corrected_dataset_yaml_filename = os.path.join(
                bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
                and not options_storage.restart_from == "ec":
                log.info("\n===== Skipping %s (already processed). \n" %
                         STAGE_NAME)
            else:
                support.continue_from_here(log)

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)

                if support.get_lib_ids_by_type(
                        dataset_data, options_storage.LONG_READS_TYPES):
                    not_used_dataset_data = support.get_libs_by_type(
                        dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_data = support.rm_libs_by_type(
                        dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_yaml_filename = os.path.join(
                        bh_cfg.output_dir, "to_correct.yaml")
                    pyyaml.dump(to_correct_dataset_data,
                                open(to_correct_dataset_yaml_filename, 'w'))
                    bh_cfg.__dict__[
                        "dataset_yaml_filename"] = to_correct_dataset_yaml_filename
                else:
                    not_used_dataset_data = None
                    bh_cfg.__dict__["dataset_yaml_filename"] = cfg[
                        "dataset"].yaml_filename

                log.info("\n===== %s started. \n" % STAGE_NAME)
                hammer_logic.run_hammer(corrected_dataset_yaml_filename,
                                        tmp_configs_dir, bin_home, bh_cfg,
                                        not_used_dataset_data,
                                        ext_python_modules_home, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)

        result_contigs_filename = os.path.join(cfg["common"].output_dir,
                                               "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir,
                                                 "scaffolds.fasta")
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir,
                                                  "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(
            misc_dir, "assembled_scaffolds.fasta")
        if "assembly" in cfg:
            STAGE_NAME = "Assembling"
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg and
                                                      os.path.isfile(assembled_contigs_filename)))\
                and not options_storage.restart_from == 'as' \
                and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):

                log.info("\n===== Skipping %s (already processed). \n" %
                         STAGE_NAME)
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(
                    os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error(
                        "failed to continue the previous run! Please restart from previous stages or from the beginning.",
                        log)
            else:
                old_result_files = [
                    result_contigs_filename, result_scaffolds_filename,
                    assembled_contigs_filename, assembled_scaffolds_filename
                ]
                # [:-6] strips the 6-char ".fasta" suffix before appending each extension
                for ext in [".fasta", ".fastg"]:
                    for old_result_file in old_result_files:
                        if os.path.isfile(old_result_file[:-6] + ext):
                            os.remove(old_result_file[:-6] + ext)

                if options_storage.restart_from == 'as':
                    support.continue_from_here(log)

                if os.path.isfile(corrected_dataset_yaml_filename):
                    dataset_data = pyyaml.load(
                        open(corrected_dataset_yaml_filename, 'r'))
                    dataset_data = support.relative2abs_paths(
                        dataset_data,
                        os.path.dirname(corrected_dataset_yaml_filename))
                if spades_cfg.disable_rr:
                    spades_cfg.__dict__["rr_enable"] = False
                else:
                    spades_cfg.__dict__["rr_enable"] = True

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== %s started.\n" % STAGE_NAME)

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir,
                                                "dataset.info")
                if not os.path.isfile(
                        dataset_filename) or not options_storage.continue_mode:
                    dataset_file = open(dataset_filename, 'w')
                    import process_cfg
                    dataset_file.write(
                        "single_cell" + '\t' +
                        process_cfg.bool_to_str(cfg["dataset"].single_cell) +
                        '\n')
                    if os.path.isfile(corrected_dataset_yaml_filename):
                        dataset_file.write(
                            "reads" + '\t' + process_cfg.process_spaces(
                                corrected_dataset_yaml_filename) + '\n')
                    else:
                        dataset_file.write("reads" + '\t' +
                                           process_cfg.process_spaces(
                                               cfg["dataset"].yaml_filename) +
                                           '\n')
                    if spades_cfg.developer_mode and "reference" in cfg[
                            "dataset"].__dict__:
                        dataset_file.write("reference_genome" + '\t')
                        dataset_file.write(
                            process_cfg.process_spaces(
                                cfg["dataset"].reference) + '\n')
                    dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home,
                                                     spades_cfg, dataset_data,
                                                     ext_python_modules_home,
                                                     log)

                if os.path.isdir(
                        misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)

                if options_storage.continue_mode and options_storage.restart_from and options_storage.restart_from.startswith(
                        'k'):
                    k_str = options_storage.restart_from[1:]
                    if k_str.find(":") != -1:
                        k_str = k_str[:k_str.find(":")]
                    support.error(
                        "failed to continue from K=%s because this K was not processed in the original run!"
                        % k_str, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)

            #corrector
            if "mismatch_corrector" in cfg and (
                    os.path.isfile(result_contigs_filename) or
                (options_storage.continue_mode
                 and os.path.isfile(assembled_contigs_filename))):
                STAGE_NAME = "Mismatch correction"
                to_correct = dict()
                to_correct["contigs"] = (result_contigs_filename,
                                         assembled_contigs_filename)
                if os.path.isfile(result_scaffolds_filename) or (
                        options_storage.continue_mode
                        and os.path.isfile(assembled_scaffolds_filename)):
                    to_correct["scaffolds"] = (result_scaffolds_filename,
                                               assembled_scaffolds_filename)

                # moving assembled contigs (scaffolds) to misc dir
                for assembly_type, (old, new) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(new):
                        continue
                    for ext in [".fasta", ".fastg"]:
                        if os.path.isfile(old[:-6] + ext):
                            shutil.move(old[:-6] + ext, new[:-6] + ext)

                if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                    (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
                    and not options_storage.restart_from == 'mc':
                    log.info("\n===== Skipping %s (already processed). \n" %
                             STAGE_NAME)
                else:
                    if options_storage.restart_from == 'mc':
                        support.continue_from_here(log)

                    log.info("\n===== %s started." % STAGE_NAME)
                    # detecting paired-end library with the largest insert size
                    est_params_data = pyyaml.load(
                        open(os.path.join(latest_dir, "final.lib_data"), 'r'))
                    max_IS_library = None
                    for reads_library in est_params_data:
                        if reads_library['type'] == 'paired-end':
                            if not max_IS_library or float(
                                    reads_library["insert size mean"]) > float(
                                        max_IS_library["insert size mean"]):
                                max_IS_library = reads_library
                    if not max_IS_library:
                        support.error(
                            'Mismatch correction cannot be performed without at least one paired-end library!',
                            log)
                    if not max_IS_library["insert size mean"]:
                        support.warning(
                            'Failed to estimate insert size for all paired-end libraries. Starting Mismatch correction'
                            ' based on the first paired-end library and with default insert size.',
                            log)
                    else:
                        cfg["mismatch_corrector"].__dict__[
                            "insert-size"] = round(
                                max_IS_library["insert size mean"])
                    yaml_dirname = os.path.dirname(
                        options_storage.dataset_yaml_filename)
                    cfg["mismatch_corrector"].__dict__["1"] = list(
                        map(lambda x: os.path.join(yaml_dirname, x),
                            max_IS_library['left reads']))
                    cfg["mismatch_corrector"].__dict__["2"] = list(
                        map(lambda x: os.path.join(yaml_dirname, x),
                            max_IS_library['right reads']))
                    #TODO: add reads orientation

                    import corrector
                    corrector_cfg = cfg["mismatch_corrector"]
                    args = []
                    for key, values in corrector_cfg.__dict__.items():
                        if key == "output-dir":
                            continue

                        # for processing list of reads
                        if not isinstance(values, list):
                            values = [values]
                        for value in values:
                            if len(key) == 1:
                                args.append('-' + key)
                            else:
                                args.append('--' + key)
                            if value is not None:
                                args.append(value)

                    # processing contigs and scaffolds (or only contigs)
                    for assembly_type, (corrected,
                                        assembled) in to_correct.items():
                        if options_storage.continue_mode and os.path.isfile(
                                corrected):
                            log.info("\n== Skipping processing of " +
                                     assembly_type + " (already processed)\n")
                            continue

                        support.continue_from_here(log)
                        log.info("\n== Processing of " + assembly_type + "\n")

                        cur_args = args[:]
                        cur_args += ['-c', assembled]
                        tmp_dir_for_corrector = support.get_tmp_dir(
                            prefix="mis_cor_%s_" % assembly_type)
                        cur_args += ['--output-dir', tmp_dir_for_corrector]

                        # correcting
                        corrector.main(cur_args, ext_python_modules_home, log)

                        result_corrected_filename = os.path.join(
                            tmp_dir_for_corrector, "corrected_contigs.fasta")
                        # moving corrected contigs (scaffolds) to SPAdes output dir
                        if os.path.isfile(result_corrected_filename):
                            shutil.move(result_corrected_filename, corrected)

                        if os.path.isdir(tmp_dir_for_corrector):
                            shutil.rmtree(tmp_dir_for_corrector)

                        assembled_fastg = assembled[:-6] + ".fastg"
                        if os.path.isfile(assembled_fastg):
                            support.create_fastg_from_fasta(
                                corrected, assembled_fastg, log)
                    log.info("\n===== %s finished.\n" % STAGE_NAME)

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        #log.info("")
        if "error_correction" in cfg and os.path.isdir(
                os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " + support.process_spaces(
                os.path.dirname(corrected_dataset_yaml_filename) + "/"))
        if "assembly" in cfg and os.path.isfile(result_contigs_filename):
            message = " * Assembled contigs are in " + support.process_spaces(
                result_contigs_filename)
            if os.path.isfile(result_contigs_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(
                    result_contigs_filename[:-6] + ".fastg") + ")"
            log.info(message)
        if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
            message = " * Assembled scaffolds are in " + support.process_spaces(
                result_scaffolds_filename)
            if os.path.isfile(result_scaffolds_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(
                    result_scaffolds_filename[:-6] + ".fastg") + ")"
            log.info(message)
        #log.info("")

        #breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir,
                                                   "broken_scaffolds.fasta")
            if not os.path.isfile(result_broken_scaffolds
                                  ) or not options_storage.continue_mode:
                modified, broken_scaffolds = support.break_scaffolds(
                    result_scaffolds_filename,
                    options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
                if modified:
                    support.write_fasta(result_broken_scaffolds,
                                        broken_scaffolds)
                    #log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
                    # " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished."
                     )  # otherwise it finished WITH WARNINGS

        if options_storage.test_mode:
            for result_filename in [
                    result_contigs_filename, result_scaffolds_filename
            ]:
                if os.path.isfile(result_filename):
                    result_fasta = list(support.read_fasta(result_filename))
                    # correctness check: should be one contig of length 1000 bp
                    correct_number = 1
                    correct_length = 1000
                    if not len(result_fasta):
                        support.error(
                            "TEST FAILED: %s does not contain contigs!" %
                            result_filename)
                    elif len(result_fasta) > correct_number:
                        support.error(
                            "TEST FAILED: %s contains more than %d contig(s) (%d found)!"
                            % (result_filename, correct_number,
                               len(result_fasta)))
                    elif len(result_fasta[0][1]) != correct_length:
                        if len(result_fasta[0][1]) > correct_length:
                            relation = "more"
                        else:
                            relation = "less"
                        support.error(
                            "TEST FAILED: %s contains %s than %d bp (%d bp)!" %
                            (result_filename, relation, correct_length,
                             len(result_fasta[0][1])))
                else:
                    support.error("TEST FAILED: " + result_filename +
                                  " does not exist!")
            log.info("\n========= TEST PASSED CORRECTLY.")

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC:  # Exec format error
                support.error(
                    "It looks like you are using SPAdes binaries for another platform.\n"
                    + support.get_spades_binaries_info_message())
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log)
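The corrector invocation above flattens a config object's __dict__ into a
command-line argument list. A minimal standalone sketch of that pattern;
cfg_to_args and the example keys are illustrative, not part of the SPAdes API:

def cfg_to_args(opts):
    # Turn an option mapping into CLI-style argv chunks, e.g.
    # {'t': 4} -> ['-t', '4'], {'careful': None} -> ['--careful'].
    args = []
    for key, values in opts.items():
        if key == "output-dir":  # handled separately by the caller
            continue
        if not isinstance(values, list):  # normalize scalars to one-element lists
            values = [values]
        for value in values:
            args.append(('-' if len(key) == 1 else '--') + key)
            if value is not None:  # valueless keys become bare flags
                args.append(str(value))
    return args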
Example n. 43
0
def main():
    os.environ["LC_ALL"] = "C"

    if len(sys.argv) == 1:
        options_storage.usage(spades_version)
        sys.exit(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    cfg, dataset_data = fill_cfg(sys.argv, log)

    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(
            os.path.join(options_storage.output_dir, "params.txt"))
        if not options:
            support.error(
                "failed to parse command line of the previous run! Please restart from the beginning."
            )
        cfg, dataset_data = fill_cfg(options, log)
        options_storage.continue_mode = True

    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info(
            "\n======= SPAdes pipeline continued. Log can be found here: " +
            log_filename + "\n")
        log.info("Restored from " + cmd_line)
    else:
        params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
        params_handler = logging.FileHandler(params_filename, mode='w')
        log.addHandler(params_handler)

        command = "Command line:"
        for v in sys.argv:
            command += " " + v
        log.info(command)

        print_used_values(cfg, log)
        log.removeHandler(params_handler)

        log.info("\n======= SPAdes pipeline started. Log can be found here: " +
                 log_filename + "\n")

    # splitting interlaced reads if needed
    if support.dataset_has_interlaced_reads(dataset_data):
        dir_for_split_reads = os.path.join(
            os.path.abspath(options_storage.output_dir), 'split_reads')
        if not os.path.isdir(dir_for_split_reads):
            os.makedirs(dir_for_split_reads)
        dataset_data = support.split_interlaced_reads(dataset_data,
                                                      dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(
            options_storage.output_dir, "input_dataset.yaml")
        pyyaml.dump(dataset_data,
                    open(options_storage.dataset_yaml_filename, 'w'))
        cfg["dataset"].yaml_filename = os.path.abspath(
            options_storage.dataset_yaml_filename)

    try:
        # copying configs before all computations (to prevent their modification at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(
                tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            shutil.copytree(os.path.join(spades_home, "configs"),
                            tmp_configs_dir)

        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            bh_cfg.__dict__["dataset_yaml_filename"] = cfg[
                "dataset"].yaml_filename
            corrected_dataset_yaml_filename = os.path.join(
                bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename
                              ) and options_storage.continue_mode:
                log.info(
                    "\n===== Skipping read error correction (already processed). \n"
                )
            else:
                options_storage.continue_mode = False  # continue from here

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)

                os.makedirs(bh_cfg.output_dir)
                if not os.path.exists(bh_cfg.tmp_dir):
                    os.makedirs(bh_cfg.tmp_dir)

                log.info("\n===== Read error correction started. \n")
                bh_logic.run_bh(corrected_dataset_yaml_filename,
                                tmp_configs_dir, bin_home, bh_cfg,
                                ext_python_modules_home, log)
                log.info("\n===== Read error correction finished. \n")

        result_contigs_filename = os.path.join(cfg["common"].output_dir,
                                               "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir,
                                                 "scaffolds.fasta")
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir,
                                                  "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(
            misc_dir, "assembled_scaffolds.fasta")
        if "assembly" in cfg:
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
            spades_cfg.__dict__["additional_contigs"] = os.path.join(
                spades_cfg.output_dir, "simplified_contigs.fasta")

            if options_storage.continue_mode and (
                    os.path.isfile(spades_cfg.result_contigs) or
                ("mismatch_corrector" in cfg
                 and os.path.isfile(assembled_contigs_filename))):
                log.info("\n===== Skipping assembling (already processed). \n")
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(
                    os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error(
                        "failed to continue the previous run! Please restart from the beginning."
                    )
            else:
                if os.path.isfile(corrected_dataset_yaml_filename):
                    dataset_data = pyyaml.load(
                        open(corrected_dataset_yaml_filename, 'r'))
                    dataset_data = support.relative2abs_paths(
                        dataset_data,
                        os.path.dirname(corrected_dataset_yaml_filename))
                if support.dataset_has_paired_reads(dataset_data):
                    spades_cfg.__dict__["paired_mode"] = True
                else:
                    spades_cfg.__dict__["paired_mode"] = False

                if options_storage.rectangles:
                    spades_cfg.__dict__["resolving_mode"] = "rectangles"

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== Assembling started.\n")

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir,
                                                "dataset.info")
                if not os.path.isfile(
                        dataset_filename) or not options_storage.continue_mode:
                    dataset_file = open(dataset_filename, 'w')
                    import process_cfg
                    dataset_file.write(
                        "single_cell" + '\t' +
                        process_cfg.bool_to_str(cfg["dataset"].single_cell) +
                        '\n')
                    if os.path.isfile(corrected_dataset_yaml_filename):
                        dataset_file.write(
                            "reads" + '\t' + process_cfg.process_spaces(
                                corrected_dataset_yaml_filename) + '\n')
                    else:
                        dataset_file.write("reads" + '\t' +
                                           process_cfg.process_spaces(
                                               cfg["dataset"].yaml_filename) +
                                           '\n')
                    if spades_cfg.developer_mode and "reference" in cfg[
                            "dataset"].__dict__:
                        dataset_file.write("reference_genome" + '\t')
                        dataset_file.write(
                            process_cfg.process_spaces(
                                os.path.abspath(cfg["dataset"].reference)) +
                            '\n')
                    dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home,
                                                     spades_cfg, log)

                #rectangles
                if spades_cfg.paired_mode and options_storage.rectangles:
                    if options_storage.continue_mode:  # TODO: continue mode
                        support.warning(
                            "sorry, --continue doesn't work with --rectangles yet. Skipping repeat resolving."
                        )
                    else:
                        sys.path.append(
                            os.path.join(python_modules_home, "rectangles"))
                        import rrr

                        rrr_input_dir = os.path.join(latest_dir, "saves")
                        rrr_outpath = os.path.join(spades_cfg.output_dir,
                                                   "rectangles")
                        if not os.path.exists(rrr_outpath):
                            os.mkdir(rrr_outpath)

                        rrr_reference_information_file = os.path.join(
                            rrr_input_dir,
                            "late_pair_info_counted_etalon_distance.txt")
                        rrr_test_util = rrr.TestUtils(
                            rrr_reference_information_file,
                            os.path.join(rrr_outpath, "rectangles.log"))
                        rrr.resolve(rrr_input_dir, rrr_outpath, rrr_test_util,
                                    "", cfg["dataset"].single_cell,
                                    spades_cfg.careful)

                        shutil.copyfile(
                            os.path.join(
                                rrr_outpath,
                                "rectangles_extend_before_scaffold.fasta"),
                            spades_cfg.result_contigs)
                        shutil.copyfile(
                            os.path.join(rrr_outpath,
                                         "rectangles_extend.fasta"),
                            spades_cfg.result_scaffolds)

                        if not spades_cfg.developer_mode:
                            if os.path.exists(rrr_input_dir):
                                shutil.rmtree(rrr_input_dir)
                            if os.path.exists(rrr_outpath):
                                shutil.rmtree(rrr_outpath, True)
                            if os.path.exists(rrr_outpath):
                                # fallback in case rmtree(ignore_errors=True) left anything behind
                                os.system('rm -r ' + rrr_outpath)
                                #EOR

                if os.path.isdir(
                        misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)
                    if os.path.isfile(spades_cfg.additional_contigs):
                        shutil.move(spades_cfg.additional_contigs, misc_dir)

                log.info("\n===== Assembling finished. \n")

            #corrector
            if "mismatch_corrector" in cfg and (
                    os.path.isfile(result_contigs_filename) or
                (options_storage.continue_mode
                 and os.path.isfile(assembled_contigs_filename))):
                to_correct = dict()
                to_correct["contigs"] = (result_contigs_filename,
                                         assembled_contigs_filename)
                if os.path.isfile(result_scaffolds_filename) or (
                        options_storage.continue_mode
                        and os.path.isfile(assembled_scaffolds_filename)):
                    to_correct["scaffolds"] = (result_scaffolds_filename,
                                               assembled_scaffolds_filename)

                # moving assembled contigs (scaffolds) to misc dir
                for k, (old, new) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(new):
                        continue
                    shutil.move(old, new)

                if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                    (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)):
                    log.info(
                        "\n===== Skipping mismatch correction (already processed). \n"
                    )
                else:
                    log.info("\n===== Mismatch correction started.")

                    # detecting paired-end library with the largest insert size
                    dataset_data = pyyaml.load(
                        open(options_storage.dataset_yaml_filename, 'r')
                    )  ### initial dataset, i.e. before error correction
                    dataset_data = support.relative2abs_paths(
                        dataset_data,
                        os.path.dirname(options_storage.dataset_yaml_filename))
                    paired_end_libraries_ids = []
                    for id, reads_library in enumerate(dataset_data):
                        if reads_library['type'] == 'paired-end':
                            paired_end_libraries_ids.append(id)
                    if not len(paired_end_libraries_ids):
                        support.error(
                            'Mismatch correction cannot be performed without at least one paired-end library!'
                        )
                    estimated_params = load_config_from_file(
                        os.path.join(latest_dir, "_est_params.info"))
                    max_insert_size = -1
                    target_paired_end_library_id = -1
                    for id in paired_end_libraries_ids:
                        if float(estimated_params.__dict__[
                                "insert_size_" + str(id)]) > max_insert_size:
                            max_insert_size = float(
                                estimated_params.__dict__["insert_size_" +
                                                          str(id)])
                            target_paired_end_library_id = id
                    yaml_dirname = os.path.dirname(
                        options_storage.dataset_yaml_filename)
                    cfg["mismatch_corrector"].__dict__["1"] = list(
                        map(
                            lambda x: os.path.join(yaml_dirname, x),
                            dataset_data[target_paired_end_library_id]
                            ['left reads']))
                    cfg["mismatch_corrector"].__dict__["2"] = list(
                        map(
                            lambda x: os.path.join(yaml_dirname, x),
                            dataset_data[target_paired_end_library_id]
                            ['right reads']))
                    cfg["mismatch_corrector"].__dict__["insert-size"] = round(
                        max_insert_size)
                    #TODO: add reads orientation

                    import corrector
                    corrector_cfg = cfg["mismatch_corrector"]
                    args = []
                    for key, values in corrector_cfg.__dict__.items():
                        if key == "output-dir":
                            continue

                        # for processing list of reads
                        if not isinstance(values, list):
                            values = [values]
                        for value in values:
                            if len(key) == 1:
                                args.append('-' + key)
                            else:
                                args.append('--' + key)
                            if value:
                                args.append(value)

                    # processing contigs and scaffolds (or only contigs)
                    for k, (corrected, assembled) in to_correct.items():
                        if options_storage.continue_mode and os.path.isfile(
                                corrected):
                            log.info("\n== Skipping processing of " + k +
                                     " (already processed)\n")
                            continue

                        options_storage.continue_mode = False
                        log.info("\n== Processing of " + k + "\n")

                        cur_args = args[:]
                        cur_args += ['-c', assembled]
                        tmp_dir_for_corrector = os.path.join(
                            corrector_cfg.__dict__["output-dir"],
                            "mismatch_corrector_" + k)
                        cur_args += ['--output-dir', tmp_dir_for_corrector]

                        # correcting
                        corrector.main(cur_args, ext_python_modules_home, log)

                        result_corrected_filename = os.path.abspath(
                            os.path.join(tmp_dir_for_corrector,
                                         "corrected_contigs.fasta"))
                        # moving corrected contigs (scaffolds) to SPAdes output dir
                        if os.path.isfile(result_corrected_filename):
                            shutil.move(result_corrected_filename, corrected)

                        if os.path.isdir(tmp_dir_for_corrector):
                            shutil.rmtree(tmp_dir_for_corrector)

                    log.info("\n===== Mismatch correction finished.\n")

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        #log.info("")
        if os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " +
                     os.path.dirname(corrected_dataset_yaml_filename) + "/")
        if os.path.isfile(result_contigs_filename):
            log.info(" * Assembled contigs are in " + result_contigs_filename)
        if os.path.isfile(result_scaffolds_filename):
            log.info(" * Assembled scaffolds are in " +
                     result_scaffolds_filename)
        #log.info("")

        #breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir,
                                                   "broken_scaffolds.fasta")
            threshold = 3
            if not os.path.isfile(result_broken_scaffolds
                                  ) or not options_storage.continue_mode:
                support.break_scaffolds(result_scaffolds_filename, threshold,
                                        result_broken_scaffolds)
                #log.info(" * Scaffolds broken by " + str(threshold) + " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished."
                     )  # otherwise it finished WITH WARNINGS

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        _, exc, _ = sys.exc_info()
        log.exception(exc)
        support.error("exception caught", log)
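main() above layers several handlers onto one logger: a console StreamHandler,
a spades.log FileHandler (opened in append mode when --continue is used), and
a transient handler for params.txt. A self-contained sketch of the same setup,
using only the standard library; make_pipeline_logger is an illustrative name:

import logging
import sys

def make_pipeline_logger(log_filename, append=False):
    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)
    # mode 'a' preserves the log of the original run on --continue
    log.addHandler(logging.FileHandler(log_filename, mode='a' if append else 'w'))
    return log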
Example n. 44
0
    def check(self):
        '''
        Check correctness of parameters. Raises an error if a check fails.
        '''
        if self.multinom is None:
            if self.model_func_file is None:
                self.multinom = False
            else:
                self.multinom = True

        if self.pop_labels is not None:
            self.pop_labels = [x.strip() for x in self.pop_labels.split(',')]
        if self.ns is not None:
            self.ns = support.check_comma_sep_list(self.ns)

        self.input_file = support.check_file_existence(self.input_file)
        
        if self.resume_dir is not None:
            self.resume_dir = support.check_dir_existence(self.resume_dir)
        if self.resume_dir is not None and self.output_dir is None:
            self.output_dir = support.ensure_dir_existence(
                self.resume_dir + "_resumed", check_emptiness=True)
        elif self.output_dir is None:
            support.error("Parameter `Output directory` is required")
        else:
            self.output_dir = support.ensure_dir_existence(
                self.output_dir, check_emptiness=True)

        if self.input_file is None:
            support.error(
                "Parameter `Input file` is required")
        if self.theta is None:
            support.warning(
                "`Theta0` is not specified. It will be set to 1.0.")
        if self.gen_time is None:
            support.warning(
                "`Time for one generation` is not specified. Time will be in genetic units.")

        self.input_data, self.ns, self.pop_labels = support.load_spectrum(
                self.input_file, self.ns, self.pop_labels)
        self.ns = np.array(self.ns)
        self.number_of_populations = len(self.ns)

        # Linked or unlinked data
        if not self.linked_snp and self.boot_dir is not None:
            support.warning(
                    "SNPs are marked as unlinked, so the bootstrap directory will be ignored.")
        elif self.linked_snp:
            if self.boot_dir is not None:
                self.boot_dir = support.check_dir_existence(self.boot_dir)
                self.boots = gadma.Inference.load_bootstrap_data_from_dir(self.boot_dir, self.ns, self.pop_labels)

        # Custom model
        if self.model_func_file is not None:
            self.model_func_file = support.check_file_existence(self.model_func_file)
            file_with_model_func = imp.load_source('module', self.model_func_file)
            try:
                self.model_func = file_with_model_func.model_func
            except AttributeError:
                support.error(
                    "File " + self.model_func_file + ' does not contain a function named `model_func`.')

        
        if self.model_func_file is not None:
            if self.p_ids is not None:
                self.p_ids = support.check_comma_sep_list(self.p_ids, is_int=False)
                
        self.fracs = [float(x) for x in self.fracs.split(",")]
        if len(self.fracs) != 3:
            support.error(
                "length of `Fractions` (Parameters of genetic algorithm) must be 3")
        self.frac_of_old_models = self.fracs[0]
        self.frac_of_mutated_models = self.fracs[1]
        self.frac_of_crossed_models = self.fracs[2]

        if self.moments_scenario and self.dadi_pts is not None:
            support.warning(
                "moments doesn't use the --pts argument, so it will be ignored")
        if self.dadi_pts is None:
            max_n = max(self.ns)
            self.dadi_pts = [max_n, max_n + 10, max_n + 20]
        else:
            self.dadi_pts = support.check_comma_sep_list(self.dadi_pts)

        self.put_default_structures()

        self.final_check()
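
check() above splits the comma-separated `Fractions` option into exactly three
floats (fractions of old, mutated and crossed models). A tiny sketch of that
parsing step; parse_fracs is an illustrative name, not part of GADMA:

def parse_fracs(s):
    # "0.2,0.3,0.3" -> [0.2, 0.3, 0.3]
    fracs = [float(x) for x in s.split(',')]
    if len(fracs) != 3:
        raise ValueError('`Fractions` must contain exactly 3 values')
    return fracs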
Example n. 45
0
    def final_check(self):
        # a custom model requires either parameter identifiers or explicit bounds
        if self.model_func_file is not None:
            if self.p_ids is None and (self.lower_bound is None or self.upper_bound is None):
                support.error(
                        "Either parameter identifiers or lower and upper bounds should be specified.")

        if self.model_func_file is not None and self.initial_structure is not None:
            support.warning(
                    "Both structure and custom model are specified. Custom model will be optimized, structure will be ignored.")
        if self.model_func_file is not None and self.only_sudden:
            support.warning(
                    "Both custom model and `Only sudden: True` are specified. `Only sudden` will be ignored.")

        if (self.frac_of_old_models +
                self.frac_of_crossed_models +
                self.frac_of_mutated_models) > 1:
            support.error(
                "Sum of Fractions (Parameters of genetic algorithm) must be less than or equal to 1")
        if (self.frac_of_old_models +
                self.frac_of_crossed_models +
                self.frac_of_mutated_models) == 1:
            support.warning("Fraction of random models is 0")


        # check lengths of bounds and p_ids
        if self.model_func_file is not None:
            if len(self.lower_bound) != len(self.upper_bound):
                support.error(
                        "Lengths of lower and upper bounds should be equal.")
            if self.p_ids is not None:
                if len(self.p_ids) != len(self.lower_bound):
                    print(self.p_ids)
                    print(self.lower_bound)
                    support.error(
                        "Lengths of lower and upper bounds and parameter identifiers should be equal.")


        if self.initial_structure is not None:
            if len(self.initial_structure
                   ) != self.number_of_populations:
                support.error("Wrong length of initial model structure: must be " +
                              str(self.number_of_populations))
            for n in self.initial_structure:
                if n <= 0:
                    support.error('elements in comma-separated list ' + ','.join(
                        str(x) for x in self.initial_structure) +
                        ' must be positive (`Initial structure` parameter)')
        if self.final_structure is not None:
            if len(self.final_structure
                   ) != self.number_of_populations:
                support.error("Wrong length of final model structure: must be " +
                              str(self.number_of_populations))
            for n in self.final_structure:
                if n <= 0:
                    support.error('Elements in comma-separated list ' + ','.join(
                        str(x) for x in self.final_structure) +
                        ' must be positive (`Final structure` parameter)')
            if not (self.final_structure >=
                    self.initial_structure).all():
                support.error(
                    "Final structure must not be less than initial structure")
        if self.split_1_lim is not None and self.split_2_lim is not None and not self.split_1_lim > self.split_2_lim:
            support.error(
                "Upper bound of first split must be greater than upper bound of second split")
        if self.size_of_generation <= 0:
            support.error(
                "Size of population (Parameters of genetic algorithm) must be positive"
            )
        if self.mutation_strength > 1 or self.mutation_strength < 0:
            support.error(
                "Mutation strength (Parameters of genetic algorithm) must be between 0 and 1"
            )
        if self.mutation_rate > 1 or self.mutation_rate < 0:
            support.error(
                "Mutation rate (Parameters of genetic algorithm) must be between 0 and 1"
            )
        if self.const_for_mut_rate < 1 or self.const_for_mut_rate > 2:
            support.error(
                "Const for adaptive mutation rate (Parameters of genetic algorithm) must be between 1 and 2"
            )
        if self.const_for_mut_strength < 1 or self.const_for_mut_strength > 2:
            support.error(
                "Const for adaptive mutation strength (Parameters of genetic algorithm) must be between 1 and 2"
            )
        if self.dadi_pts is not None:
            for n in self.dadi_pts:
                if n <= 0:
                    support.error('elements in comma-separated list ' +
                                  ','.join(str(x) for x in self.dadi_pts) +
                                  ' must be positive (Pts parameter)')
        if self.repeats <= 0:
            support.error("Repeats (Parameters of pipeline) must be positive")
        if self.processes <= 0:
            support.error(
                "Processes (Parameters of pipeline) must be positive")

        if self.number_of_populations < 3 and self.split_2_lim is not None:
            support.warning("There is no second split in case of " +
                            str(self.number_of_populations) +
                            " populations. Upper bound for it will be ignored.")
            self.split_2_lim = None
        if self.number_of_populations < 2 and self.split_1_lim is not None:
            support.warning(
                "There is no first split in the case of one population. Upper bound for it will be ignored.")
            self.split_1_lim = None

        if self.moments_scenario:
            if pkgutil.find_loader('moments') is None:
                if self.model_func_file is not None:
                    support.error("moments is not installed. You tried to use a custom model with moments.")
                if pkgutil.find_loader('dadi') is not None:
                    options_storage.moments_scenario = False
                    support.warning("moments is not installed, dadi with " + str(self.dadi_pts) + " grid size will be used instead.")
                else:
                    support.error("Neither dadi nor moments is installed.")
        else:
            if pkgutil.find_loader('dadi') is None:
                if self.model_func_file is not None:
                    support.error("dadi is not installed. You tried to use a custom model with dadi.")
                if pkgutil.find_loader('moments') is not None:
                    options_storage.moments_scenario = True
                    support.warning("dadi is not installed, moments will be used instead.")
                else:
                    support.error("Neither dadi nor moments is installed.")

        packages = []
        self.matplotlib_available = pkgutil.find_loader('matplotlib') is not None
        if not self.matplotlib_available:
            packages.append('matplotlib')
        
        # If a custom model and dadi are used, we can ignore the absence of PIL
        if self.model_func_file is None or self.moments_scenario:
            self.pil_available = pkgutil.find_loader('PIL') is not None
            if not self.pil_available:
                packages.append('Pillow')
        
        self.moments_available = pkgutil.find_loader('moments') is not None
        if not self.moments_available:
            packages.append('moments')
            
        if not self.matplotlib_available:
            support.warning(
                "To draw models and SFS plots you should install: " +
                ', '.join(packages))
        elif not self.pil_available and self.moments_available:
            support.warning(
                "To draw concatenated plots you should install: Pillow")
        elif not self.moments_available:
            support.warning(
                "To draw models plots you should install: " +
                ', '.join(packages))

        if self.optimize_name == 'optimize_powell' and not self.moments_scenario:
            if not self.moments_available:
                support.warning(
                    "To use Powell optimization, moments must be installed. BFGS (optimize_log) will be used instead.")
                self.optimize_name = 'optimize_log'


        if self.distribution != 'normal' and self.distribution != 'uniform':
            support.error(
                "Distribution in extra parameters must be `normal` or `uniform`.")
        if self.distribution == 'uniform' and self.std is not None:
            support.warning(
                'Std in extra parameters will be ignored because the uniform distribution was chosen.')
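
final_check() above probes optional engines with pkgutil.find_loader and falls
back from the preferred engine to the other one when it is missing. A
self-contained sketch of that fallback, under the assumption that only module
availability matters; pick_engine is an illustrative name:

import pkgutil

def pick_engine(prefer_moments=True):
    # find_loader returns None when the module cannot be found
    have = lambda name: pkgutil.find_loader(name) is not None
    order = ['moments', 'dadi'] if prefer_moments else ['dadi', 'moments']
    for name in order:
        if have(name):
            return name
    raise ImportError('neither dadi nor moments is installed')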
Example n. 46
0
def run_spades(configs_dir, execution_home, cfg, dataset_data,
               ext_python_modules_home, log):
    if not isinstance(cfg.iterative_K, list):
        cfg.iterative_K = [cfg.iterative_K]
    cfg.iterative_K = sorted(cfg.iterative_K)
    used_K = []

    # checking and removing conflicting K-mer directories
    if options_storage.restart_from and (options_storage.restart_k_mers !=
                                         options_storage.original_k_mers):
        processed_K = []
        for k in range(options_storage.MIN_K, options_storage.MAX_K, 2):
            cur_K_dir = os.path.join(cfg.output_dir, "K%d" % k)
            if os.path.isdir(cur_K_dir) and os.path.isfile(
                    os.path.join(cur_K_dir, "final_contigs.fasta")):
                processed_K.append(k)
        if processed_K:
            RL = get_read_length(cfg.output_dir, processed_K[0],
                                 ext_python_modules_home, log)
            needed_K = update_k_mers_in_special_cases(cfg.iterative_K,
                                                      RL,
                                                      log,
                                                      silent=True)
            needed_K = [k for k in needed_K if k < RL]
            original_K = reveal_original_k_mers(RL)

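            # Compare the K values already finished on disk (processed_K) with the
            # series the restarted run will need (needed_K); any divergent suffix
            # must be recomputed, so those K directories are collected for removal.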
            k_to_delete = []
            for id, k in enumerate(needed_K):
                if len(processed_K) == id:
                    if processed_K[-1] == original_K[
                            -1]:  # the last K in the original run was processed in "last_one" mode
                        k_to_delete = [original_K[-1]]
                    break
                if processed_K[id] != k:
                    k_to_delete = processed_K[id:]
                    break
            if not k_to_delete and (len(processed_K) > len(needed_K)):
                k_to_delete = processed_K[len(needed_K) - 1:]
            if k_to_delete:
                log.info(
                    "Restart mode: removing previously processed directories for K=%s "
                    "to avoid conflicts with K specified with --restart-from" %
                    (str(k_to_delete)))
                for k in k_to_delete:
                    shutil.rmtree(os.path.join(cfg.output_dir, "K%d" % k))

    bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads")
    if os.path.isdir(bin_reads_dir) and not options_storage.continue_mode:
        shutil.rmtree(bin_reads_dir)
    cfg.tmp_dir = support.get_tmp_dir(prefix="spades_")

    finished_on_stop_after = False
    K = cfg.iterative_K[0]
    if len(cfg.iterative_K) == 1:
        run_iteration(configs_dir, execution_home, cfg, log, K, None, True)
        used_K.append(K)
    else:
        run_iteration(configs_dir, execution_home, cfg, log, K, None, False)
        used_K.append(K)
        if options_storage.stop_after == "k%d" % K:
            finished_on_stop_after = True
        else:
            prev_K = K
            RL = get_read_length(cfg.output_dir, K, ext_python_modules_home,
                                 log)
            cfg.iterative_K = update_k_mers_in_special_cases(
                cfg.iterative_K, RL, log)
            if len(cfg.iterative_K) < 2 or cfg.iterative_K[1] + 1 > RL:
                if cfg.rr_enable:
                    if len(cfg.iterative_K) < 2:
                        log.info(
                            "== Rerunning for the first value of K (%d) with Repeat Resolving"
                            % cfg.iterative_K[0])
                    else:
                        support.warning(
                            "Second value of iterative K (%d) exceeded estimated read length (%d). "
                            "Rerunning for the first value of K (%d) with Repeat Resolving"
                            % (cfg.iterative_K[1], RL, cfg.iterative_K[0]),
                            log)
                    run_iteration(configs_dir, execution_home, cfg, log,
                                  cfg.iterative_K[0], None, True)
                    used_K.append(cfg.iterative_K[0])
                    K = cfg.iterative_K[0]
            else:
                rest_of_iterative_K = cfg.iterative_K
                rest_of_iterative_K.pop(0)  # note: this aliases cfg.iterative_K, so both lose the first K
                count = 0
                for K in rest_of_iterative_K:
                    count += 1
                    last_one = count == len(cfg.iterative_K) or (
                        rest_of_iterative_K[count] + 1 > RL)
                    run_iteration(configs_dir, execution_home, cfg, log, K,
                                  prev_K, last_one)
                    used_K.append(K)
                    prev_K = K
                    if last_one:
                        break
                    if options_storage.stop_after == "k%d" % K:
                        finished_on_stop_after = True
                        break
                if count < len(cfg.iterative_K) and not finished_on_stop_after:
                    support.warning(
                        "Iterations stopped. Value of K (%d) exceeded estimated read length (%d)"
                        % (cfg.iterative_K[count], RL), log)

    if options_storage.stop_after and options_storage.stop_after.startswith(
            'k'):
        support.finish_here(log)
    latest = os.path.join(cfg.output_dir, "K%d" % K)

    if cfg.correct_scaffolds and not options_storage.run_completed:
        if options_storage.continue_mode and os.path.isfile(
                os.path.join(cfg.output_dir, "SCC", "corrected_scaffolds.fasta"
                             )) and not options_storage.restart_from == "scc":
            log.info("\n===== Skipping %s (already processed). \n" %
                     "scaffold correction")
        else:
            if options_storage.continue_mode:
                support.continue_from_here(log)
            run_scaffold_correction(configs_dir, execution_home, cfg, log,
                                    latest, 21)
        latest = os.path.join(cfg.output_dir, "SCC", "K21")
        if options_storage.stop_after == 'scc':
            support.finish_here(log)

    if cfg.correct_scaffolds:
        correct_scaffolds_fpath = os.path.join(latest,
                                               "corrected_scaffolds.fasta")
        if os.path.isfile(correct_scaffolds_fpath):
            shutil.copyfile(correct_scaffolds_fpath, cfg.result_scaffolds)
    elif not finished_on_stop_after:  # if interrupted by --stop-after, the final K was not processed, so there is nothing to copy
        if os.path.isfile(os.path.join(latest, "before_rr.fasta")):
            result_before_rr_contigs = os.path.join(
                os.path.dirname(cfg.result_contigs), "before_rr.fasta")
            if (not os.path.isfile(result_before_rr_contigs)
                    or not options_storage.continue_mode):
                shutil.copyfile(os.path.join(latest, "before_rr.fasta"),
                                result_before_rr_contigs)
        if options_storage.rna:
            if os.path.isfile(os.path.join(latest, "transcripts.fasta")):
                if (not os.path.isfile(cfg.result_transcripts)
                        or not options_storage.continue_mode):
                    shutil.copyfile(os.path.join(latest, "transcripts.fasta"),
                                    cfg.result_transcripts)
            if os.path.isfile(os.path.join(latest, "transcripts.paths")):
                if (not os.path.isfile(cfg.result_transcripts_paths)
                        or not options_storage.continue_mode):
                    shutil.copyfile(os.path.join(latest, "transcripts.paths"),
                                    cfg.result_transcripts_paths)
            for filtering_type in options_storage.filtering_types:
                prefix = filtering_type + "_filtered_"
                result_filtered_transcripts = os.path.join(
                    cfg.output_dir, prefix + options_storage.transcripts_name)
                latest_filtered_transcripts = os.path.join(
                    latest, prefix + "final_paths.fasta")
                if os.path.isfile(latest_filtered_transcripts):
                    if (not os.path.isfile(result_filtered_transcripts)
                            or not options_storage.continue_mode):
                        shutil.copyfile(latest_filtered_transcripts,
                                        result_filtered_transcripts)
        else:
            if os.path.isfile(os.path.join(latest, "final_contigs.fasta")):
                if (not os.path.isfile(cfg.result_contigs)
                        or not options_storage.continue_mode):
                    shutil.copyfile(
                        os.path.join(latest, "final_contigs.fasta"),
                        cfg.result_contigs)
            if os.path.isfile(os.path.join(latest, "first_pe_contigs.fasta")):
                result_first_pe_contigs = os.path.join(
                    os.path.dirname(cfg.result_contigs),
                    "first_pe_contigs.fasta")
                if (not os.path.isfile(result_first_pe_contigs)
                        or not options_storage.continue_mode):
                    shutil.copyfile(
                        os.path.join(latest, "first_pe_contigs.fasta"),
                        result_first_pe_contigs)
            if cfg.rr_enable:
                if os.path.isfile(os.path.join(latest, "scaffolds.fasta")):
                    if (not os.path.isfile(cfg.result_scaffolds)
                            or not options_storage.continue_mode):
                        shutil.copyfile(
                            os.path.join(latest, "scaffolds.fasta"),
                            cfg.result_scaffolds)
                if os.path.isfile(os.path.join(latest, "scaffolds.paths")):
                    if (not os.path.isfile(cfg.result_scaffolds_paths)
                            or not options_storage.continue_mode):
                        shutil.copyfile(
                            os.path.join(latest, "scaffolds.paths"),
                            cfg.result_scaffolds_paths)
            if os.path.isfile(
                    os.path.join(latest, "assembly_graph_with_scaffolds.gfa")):
                if (not os.path.isfile(cfg.result_graph_gfa)
                        or not options_storage.continue_mode):
                    shutil.copyfile(
                        os.path.join(latest,
                                     "assembly_graph_with_scaffolds.gfa"),
                        cfg.result_graph_gfa)
            if os.path.isfile(os.path.join(latest, "assembly_graph.fastg")):
                if (not os.path.isfile(cfg.result_graph)
                        or not options_storage.continue_mode):
                    shutil.copyfile(
                        os.path.join(latest, "assembly_graph.fastg"),
                        cfg.result_graph)
            if os.path.isfile(os.path.join(latest, "final_contigs.paths")):
                if (not os.path.isfile(cfg.result_contigs_paths)
                        or not options_storage.continue_mode):
                    shutil.copyfile(
                        os.path.join(latest, "final_contigs.paths"),
                        cfg.result_contigs_paths)

    if cfg.developer_mode:
        # link the saves directory of the last iteration next to the final results
        saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves")
        if os.path.lexists(saves_link):  # exists() returns False for broken links; lexists() returns True
            os.remove(saves_link)
        os.symlink(os.path.join(latest, "saves"), saves_link)

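    # Clean up binary reads and the temporary directory.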
    if os.path.isdir(bin_reads_dir):
        shutil.rmtree(bin_reads_dir)
    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)

    return used_K
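
# A minimal standalone sketch (not part of SPAdes; the helper name is hypothetical)
# of the cutoff rule applied by the loop above: the first K is always run, and each
# later K is used only while K + 1 <= the estimated read length RL.
def effective_k_values(iterative_K, read_length):
    ks = sorted(iterative_K)
    used = ks[:1]  # the first (smallest) K is always iterated
    for k in ks[1:]:
        if k + 1 > read_length:
            break  # this K would reach the read length; the previous one was final
        used.append(k)
    return used

# e.g. effective_k_values([21, 33, 55, 77], 76) -> [21, 33, 55];
# 77 is dropped because 77 + 1 > 76.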
Example n. 47
0
def run_spades(configs_dir, execution_home, cfg, dataset_data,
               ext_python_modules_home, log):
    if not isinstance(cfg.iterative_K, list):
        cfg.iterative_K = [cfg.iterative_K]
    cfg.iterative_K = sorted(cfg.iterative_K)

    # checking and removing conflicting K-mer directories
    if options_storage.restart_from:
        processed_K = []
        for k in range(options_storage.MIN_K, options_storage.MAX_K, 2):
            cur_K_dir = os.path.join(cfg.output_dir, "K%d" % k)
            if os.path.isdir(cur_K_dir) and os.path.isfile(
                    os.path.join(cur_K_dir, "final_contigs.fasta")):
                processed_K.append(k)
        if processed_K:
            RL = get_read_length(cfg.output_dir, processed_K[0],
                                 ext_python_modules_home, log)
            needed_K = update_k_mers_in_special_cases(cfg.iterative_K,
                                                      RL,
                                                      log,
                                                      silent=True)
            needed_K = [k for k in needed_K if k < RL]
            original_K = reveal_original_k_mers(RL)

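            # Find previously processed K directories that conflict with the K values
            # needed for this restart; they must be removed before rerunning.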
            k_to_delete = []
            for idx, k in enumerate(needed_K):
                if len(processed_K) == idx:
                    # the last K in the original run was processed in "last_one" mode
                    if processed_K[-1] == original_K[-1]:
                        k_to_delete = [original_K[-1]]
                    break
                if processed_K[idx] != k:
                    k_to_delete = processed_K[idx:]
                    break
            if not k_to_delete and (len(processed_K) > len(needed_K)):
                k_to_delete = processed_K[len(needed_K) - 1:]
            if k_to_delete:
                log.info(
                    "Restart mode: removing previously processed directories for K=%s "
                    "to avoid conflicts with K specified with --restart-from" %
                    (str(k_to_delete)))
                for k in k_to_delete:
                    shutil.rmtree(os.path.join(cfg.output_dir, "K%d" % k))

    bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads")
    if os.path.isdir(bin_reads_dir) and not options_storage.continue_mode:
        shutil.rmtree(bin_reads_dir)
    cfg.tmp_dir = support.get_tmp_dir(prefix="spades_")

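    # Same iteration scheme as in the previous example: a single K runs once as the
    # final iteration; multiple K values iterate until the next K would reach the
    # estimated read length.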
    if len(cfg.iterative_K) == 1:
        run_iteration(configs_dir, execution_home, cfg, log,
                      cfg.iterative_K[0], None, True)
        K = cfg.iterative_K[0]
    else:
        run_iteration(configs_dir, execution_home, cfg, log,
                      cfg.iterative_K[0], None, False)
        prev_K = cfg.iterative_K[0]
        RL = get_read_length(cfg.output_dir, cfg.iterative_K[0],
                             ext_python_modules_home, log)
        cfg.iterative_K = update_k_mers_in_special_cases(
            cfg.iterative_K, RL, log)
        if cfg.iterative_K[1] + 1 > RL:
            if cfg.rr_enable:
                support.warning(
                    "Second value of iterative K (%d) exceeded estimated read length (%d). "
                    "Rerunning for the first value of K (%d) with Repeat Resolving"
                    % (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log)
                run_iteration(configs_dir, execution_home, cfg, log,
                              cfg.iterative_K[0], None, True)
                K = cfg.iterative_K[0]
        else:
            rest_of_iterative_K = cfg.iterative_K
            rest_of_iterative_K.pop(0)
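            # NB: rest_of_iterative_K aliases cfg.iterative_K (no copy is made),
            # so this pop(0) shortens both.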
            count = 0
            for K in rest_of_iterative_K:
                count += 1
                last_one = (count == len(cfg.iterative_K)
                            or rest_of_iterative_K[count] + 1 > RL)
                run_iteration(configs_dir, execution_home, cfg, log, K, prev_K,
                              last_one)
                prev_K = K
                if last_one:
                    break
            if count < len(cfg.iterative_K):
                support.warning(
                    "Iterations stopped. Value of K (%d) exceeded estimated read length (%d)"
                    % (cfg.iterative_K[count], RL), log)

    latest = os.path.join(cfg.output_dir, "K%d" % K)

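    # Copy before_rr, final_contigs and scaffolds results in both FASTA and FASTG
    # formats; [:-6] strips the ".fasta" suffix from the configured result path so
    # the matching extension can be appended.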
    for ext in [".fasta", ".fastg"]:  # renamed from "format" to avoid shadowing the builtin
        if os.path.isfile(os.path.join(latest, "before_rr" + ext)):
            result_before_rr_contigs = os.path.join(
                os.path.dirname(cfg.result_contigs), "before_rr" + ext)
            if (not os.path.isfile(result_before_rr_contigs)
                    or not options_storage.continue_mode):
                shutil.copyfile(os.path.join(latest, "before_rr" + ext),
                                result_before_rr_contigs)
        if os.path.isfile(os.path.join(latest, "final_contigs" + ext)):
            if (not os.path.isfile(cfg.result_contigs[:-6] + ext)
                    or not options_storage.continue_mode):
                shutil.copyfile(os.path.join(latest, "final_contigs" + ext),
                                cfg.result_contigs[:-6] + ext)
        if cfg.rr_enable:
            if os.path.isfile(os.path.join(latest, "scaffolds" + ext)):
                if (not os.path.isfile(cfg.result_scaffolds[:-6] + ext)
                        or not options_storage.continue_mode):
                    shutil.copyfile(os.path.join(latest, "scaffolds" + ext),
                                    cfg.result_scaffolds[:-6] + ext)

    if cfg.developer_mode:
        # link the saves directory of the last iteration next to the final results
        saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves")
        if os.path.lexists(saves_link):  # exists() returns False for broken links; lexists() returns True
            os.remove(saves_link)
        os.symlink(os.path.join(latest, "saves"), saves_link)

    if os.path.isdir(bin_reads_dir):
        shutil.rmtree(bin_reads_dir)
    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)

    return latest
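
# A hedged sketch (not part of SPAdes; the helper name is hypothetical) of the copy
# guard repeated throughout both functions above: a result file is copied out of
# the iteration directory unless --continue mode is active and the destination
# already exists.
import os
import shutil

def copy_result_if_needed(src, dst, continue_mode):
    if not os.path.isfile(src):
        return False  # this result was not produced
    if os.path.isfile(dst) and continue_mode:
        return False  # keep the file left by the earlier, continued run
    shutil.copyfile(src, dst)
    return True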