Ejemplo n.º 1
0
def split_fasta_file(input_file_path,
                     dest_dir,
                     prefix='part',
                     number_of_sequences_per_file=20000):
    """Split a FASTA file into consecutively numbered part files.

    Records are read through SequenceSource and written as a '>id' line
    followed by a sequence line. A new file named '<prefix>-<8-digit number>'
    is started in dest_dir whenever (pos - 1) is a multiple of
    number_of_sequences_per_file.

    Returns the list of part file paths in creation order (empty when the
    source yields no records).
    """
    debug('%s; src: %s, dest dir: %s' % (my_name(), input_file_path, dest_dir))

    # Named 'source' rather than 'input' so the builtin is not shadowed.
    source = SequenceSource(input_file_path)

    parts = []
    next_part = 1
    part_obj = None

    try:
        while source.next():
            # presumably pos is 1-based, so the very first record opens
            # part 1 -- confirm against SequenceSource.
            if (source.pos - 1) % number_of_sequences_per_file == 0:
                sys.stderr.write('\rCreating part: ~ %s' % (pp(next_part)))
                sys.stderr.flush()

                if part_obj:
                    part_obj.close()
                file_path = os.path.join(dest_dir, prefix + '-%08d' % next_part)
                parts.append(file_path)
                next_part += 1
                part_obj = open(file_path, 'w')

            part_obj.write('>%s\n%s\n' % (source.id, source.seq))
    finally:
        # Close the current part even when reading or writing fails, so the
        # handle is not leaked and buffered records are flushed.
        if part_obj:
            part_obj.close()

    sys.stderr.write('\n')
    return parts
Ejemplo n.º 2
0
def copy_file(source_file, dest_file):
    """Copy source_file to dest_file with shutil.copyfile.

    Returns whatever shutil.copyfile returns. An IOError raised by the copy
    is re-raised as UtilsError carrying the reason and both paths.
    """
    # Labels follow the argument order: source first, destination second.
    debug('%s; src: "%s", dest: "%s"' % (my_name(), source_file, dest_file))
    try:
        return shutil.copyfile(source_file, dest_file)
    except IOError as e:
        raise UtilsError("copy failed due to the following reason: '%s' (src: %s, dst: %s)"
                         % (e, source_file, dest_file))
Ejemplo n.º 3
0
def store_ids_from_b6_output(source_b6_output, dest_file):
    """Open source_b6_output as a B6Source, reporting failures as UtilsError.

    NOTE(review): the body visible here stops after opening the source;
    nothing is written to dest_file in this excerpt -- confirm against the
    complete function.

    Raises UtilsError when B6Source raises IOError.
    """
    debug('%s; dest: %s' % (my_name(), dest_file))
    try:
        b6 = B6Source(source_b6_output)
    except IOError as e:
        raise UtilsError("open failed due to the following reason: '%s' (src: %s)"
                         % (e, source_b6_output))
Ejemplo n.º 4
0
def store_ids_from_b6_output(source_b6_output, dest_file):
    """Open source_b6_output as a B6Source, reporting failures as UtilsError.

    NOTE(review): the body visible here stops after opening the source;
    nothing is written to dest_file in this excerpt -- confirm against the
    complete function.

    Raises UtilsError when B6Source raises IOError.
    """
    debug('%s; dest: %s' % (my_name(), dest_file))
    try:
        b6 = B6Source(source_b6_output)
    except IOError as e:
        raise UtilsError("open failed due to the following reason: '%s' (src: %s)"
                         % (e, source_b6_output))
Ejemplo n.º 5
0
def run_command(cmdline):
    """Run cmdline through the shell and wait for it to finish.

    Raises UtilsError when the process is terminated by a signal (negative
    return code from subprocess.call) or when the call raises OSError.
    Positive exit codes are not treated as errors here. Returns None.
    """
    debug('%s; cmd: %s' % (my_name(), cmdline))
    try:
        # NOTE(review): shell=True hands cmdline to the shell as one string;
        # callers must not build it from untrusted text.
        retcode = subprocess.call(cmdline, shell=True)
    except OSError as e:
        raise UtilsError("command was failed for the following reason: '%s' ('%s')" % (e, cmdline))

    # The return code has to be kept in a variable: the signal number
    # reported below is its negation.
    if retcode < 0:
        raise UtilsError("command was terminated by signal: %d" % (-retcode))
Ejemplo n.º 6
0
def split_fasta_file(input_file_path, dest_dir, prefix='part', number_of_sequences_per_file=20000):
    """Split a FASTA file into consecutively numbered part files.

    Records are read through SequenceSource and written as a '>id' line
    followed by a sequence line. A new file named '<prefix>-<8-digit number>'
    is started in dest_dir whenever (pos - 1) is a multiple of
    number_of_sequences_per_file.

    Returns the list of part file paths in creation order (empty when the
    source yields no records).
    """
    debug('%s; src: %s, dest dir: %s' % (my_name(), input_file_path, dest_dir))

    # Named 'source' rather than 'input' so the builtin is not shadowed.
    source = SequenceSource(input_file_path)

    parts = []
    next_part = 1
    part_obj = None

    try:
        while source.next():
            # presumably pos is 1-based, so the very first record opens
            # part 1 -- confirm against SequenceSource.
            if (source.pos - 1) % number_of_sequences_per_file == 0:
                sys.stderr.write('\rCreating part: ~ %s' % (pp(next_part)))
                sys.stderr.flush()

                if part_obj:
                    part_obj.close()
                file_path = os.path.join(dest_dir, prefix + '-%08d' % next_part)
                parts.append(file_path)
                next_part += 1
                part_obj = open(file_path, 'w')

            part_obj.write('>%s\n%s\n' % (source.id, source.seq))
    finally:
        # Close the current part even when reading or writing fails, so the
        # handle is not leaked and buffered records are flushed.
        if part_obj:
            part_obj.close()

    sys.stderr.write('\n')
    return parts
Ejemplo n.º 7
0
def copy_file(source_file, dest_file):
    """Copy source_file to dest_file with shutil.copyfile.

    Returns whatever shutil.copyfile returns. An IOError raised by the copy
    is re-raised as UtilsError carrying the reason and both paths.
    """
    # Labels follow the argument order: source first, destination second.
    debug('%s; src: "%s", dest: "%s"' % (my_name(), source_file, dest_file))
    try:
        return shutil.copyfile(source_file, dest_file)
    except IOError as e:
        raise UtilsError("copy failed due to the following reason: '%s' (src: %s, dst: %s)"
                         % (e, source_file, dest_file))
Ejemplo n.º 8
0
 def init_modules(self):
     """Load every 'mod_*.py' file found in the modules directory.

     Each matching file is loaded with imp.load_source and stored in
     self.modules under its file name without the 'mod_' prefix and the
     '.py' suffix.
     """
     modules_dir = self.constants.dirs['modules']
     for entry in os.listdir(modules_dir):
         # Skip anything that is not named like a module file.
         if not (entry.startswith('mod_') and entry.endswith('.py')):
             continue
         mod_name = entry[4:-3]
         source_path = os.path.join(modules_dir, entry)
         self.modules[mod_name] = imp.load_source(mod_name, source_path)
         debug('module "%s" found' % mod_name)
Ejemplo n.º 9
0
def concatenate_files(dest_file, file_list):
    """Write the contents of every file in file_list, in order, to dest_file.

    dest_file is opened in 'w' mode, so any existing content is replaced.
    Returns None.
    """
    debug('%s; dest: "%s"' % (my_name(), dest_file))
    with open(dest_file, 'w') as dest_file_obj:
        for chunk_path in file_list:
            # Close each chunk as soon as it has been copied instead of
            # leaving the handle to the garbage collector.
            with open(chunk_path) as chunk:
                for line in chunk:
                    dest_file_obj.write(line)
Ejemplo n.º 10
0
def refine_b6(source_file, dest_file, params):
    """Open source_file as a B6Source, reporting failures as UtilsError.

    NOTE(review): the body visible here stops after opening the source;
    neither dest_file nor params is used beyond the debug line in this
    excerpt -- confirm against the complete function.

    Raises UtilsError when B6Source raises IOError.
    """
    # FIXME: check if source_file is a valid m8 output.
    debug('%s; dest: %s' % (my_name(), dest_file))
    try:
        b6 = B6Source(source_file)
    except IOError as e:
        raise UtilsError("open failed due to the following reason: '%s' (src: %s)"
                         % (e, source_file))
Ejemplo n.º 11
0
def concatenate_files(dest_file, file_list):
    """Write the contents of every file in file_list, in order, to dest_file.

    dest_file is opened in 'w' mode, so any existing content is replaced.
    Returns None.
    """
    debug('%s; dest: "%s"' % (my_name(), dest_file))
    with open(dest_file, 'w') as dest_file_obj:
        for chunk_path in file_list:
            # Each chunk is closed right after copying; the destination is
            # closed by the outer 'with' even if a chunk cannot be read.
            with open(chunk_path) as chunk:
                dest_file_obj.writelines(chunk)
Ejemplo n.º 12
0
 def init_modules(self):
     """Populate self.modules from the 'mod_*.py' files in the modules dir.

     The key for each loaded module is the file name stripped of its
     'mod_' prefix and '.py' suffix; loading is done by imp.load_source.
     """
     prefix, suffix = 'mod_', '.py'
     modules_dir = self.constants.dirs['modules']
     for entry in os.listdir(modules_dir):
         is_module_file = entry.startswith(prefix) and entry.endswith(suffix)
         if is_module_file:
             mod_name = entry[len(prefix):-len(suffix)]
             module_path = os.path.join(modules_dir, entry)
             self.modules[mod_name] = imp.load_source(mod_name, module_path)
             debug('module "%s" found' % mod_name)
Ejemplo n.º 13
0
def refine_b6(source_file, dest_file, params):
    """Open source_file as a B6Source, reporting failures as UtilsError.

    NOTE(review): the body visible here stops after opening the source;
    neither dest_file nor params is used beyond the debug line in this
    excerpt -- confirm against the complete function.

    Raises UtilsError when B6Source raises IOError.
    """
    # FIXME: check if source_file is a valid m8 output.
    debug('%s; dest: %s' % (my_name(), dest_file))
    try:
        b6 = B6Source(source_file)
    except IOError as e:
        raise UtilsError("open failed due to the following reason: '%s' (src: %s)"
                         % (e, source_file))
Ejemplo n.º 14
0
def split_file(ids_file,
               source_file,
               filtered_dest_file,
               survived_dest_file,
               type='fasta'):
    """Split the reads of source_file into two files based on ids_file.

    ids_file holds one read id per line. For every read in source_file:

        if read id is in the id list:
            --> filtered_dest_file
        else:
            --> survived_dest_file

    An id is dropped from the list once it has matched, so a later read
    repeating the same id goes to survived_dest_file.

    Only type 'fasta' is implemented. Returns True on success.

    Raises FilterError when ids_file cannot be read and UtilsError for any
    other type.
    """
    debug('%s; src: "%s" (%s), filtered_dest: "%s", survived_dest: "%s"'
          % (my_name(), source_file, type, filtered_dest_file, survived_dest_file))

    try:
        with open(ids_file) as ids_obj:
            ids_to_filter = set(line.strip() for line in ids_obj)
    except IOError:
        # Report the path: the id set is still unbound when open() fails.
        raise FilterError('Hit IDs file missing ("%s").' % (ids_file,))

    if type != 'fasta':
        raise UtilsError("type '%s' is not implemented" % (type,))

    source = SequenceSource(source_file)
    filtered_count, survived_count = 0, 0

    # Nested 'with' blocks close both outputs even if the loop raises.
    with open(filtered_dest_file, 'w') as filtered_output:
        with open(survived_dest_file, 'w') as survived_output:
            while source.next():
                if source.pos % 10000 == 0 or source.pos == 1:
                    sys.stderr.write('\rSplitting FASTA file: ~ %s' %
                                     (pp(source.pos)))
                    sys.stderr.flush()

                record = '>%s\n%s\n' % (source.id, source.seq)
                if source.id in ids_to_filter:
                    ids_to_filter.remove(source.id)
                    filtered_output.write(record)
                    filtered_count += 1
                else:
                    survived_output.write(record)
                    survived_count += 1

            sys.stderr.write('\n')

    debug('%s; done. of %s total reads, filtered: %s, survived: %s.'
          % (my_name(), pp(filtered_count + survived_count),
             pp(filtered_count), pp(survived_count)))

    return True
Ejemplo n.º 15
0
def run_command(cmdline):
    """Run cmdline through the shell and wait for it to finish.

    Raises UtilsError when the process is terminated by a signal (negative
    return code from subprocess.call) or when the call raises OSError.
    Positive exit codes are not treated as errors here. Returns None.
    """
    debug('%s; cmd: %s' % (my_name(), cmdline))
    try:
        # NOTE(review): shell=True hands cmdline to the shell as one string;
        # callers must not build it from untrusted text.
        retcode = subprocess.call(cmdline, shell=True)
    except OSError as e:
        raise UtilsError("command was failed for the following reason: '%s' ('%s')" % (
            e, cmdline))

    # The return code has to be kept in a variable: the signal number
    # reported below is its negation.
    if retcode < 0:
        raise UtilsError("command was terminated by signal: %d" % (-retcode))
def search(m):
    """Run the search command on every part file and merge the outputs.

    For each path in m.files['parts'] the SEARCH_COMMAND template is filled
    with the part path as input, '<part>.b6' as output, m.target_db as
    target, '<part>.log' as log and the space-joined m.cmdparams, then run
    through utils.run_command. The per-part '.b6' files are finally
    concatenated into m.files['search_output'].
    """
    parts = m.files['parts']
    total = len(parts)
    # enumerate gives the running position directly; looking it up with
    # parts.index() rescans the list on every iteration.
    for position, part in enumerate(parts):
        params = {'input': part, 'output': part + '.b6', 'target': m.target_db,
                  'log': part + '.log', 'cmdparams': ' '.join(m.cmdparams)}
        debug('searching part %d/%d (log: %s)' % (position + 1, total, params['log']))
        cmdline = SEARCH_COMMAND % params
        utils.run_command(cmdline)

    dest_file = m.files['search_output']
    utils.concatenate_files(dest_file, [part + '.b6' for part in m.files['parts']])
Ejemplo n.º 17
0
def split_file(ids_file, source_file, filtered_dest_file, survived_dest_file, type='fasta'):
    """Split the reads of source_file into two files based on ids_file.

    ids_file holds one read id per line. For every read in source_file:

        if read id is in the id list:
            --> filtered_dest_file
        else:
            --> survived_dest_file

    An id is dropped from the list once it has matched, so a later read
    repeating the same id goes to survived_dest_file.

    Only type 'fasta' is implemented. Returns True on success.

    Raises FilterError when ids_file cannot be read and UtilsError for any
    other type.
    """
    debug('%s; src: "%s" (%s), filtered_dest: "%s", survived_dest: "%s"'
          % (my_name(), source_file, type, filtered_dest_file, survived_dest_file))

    try:
        with open(ids_file) as ids_obj:
            ids_to_filter = set(line.strip() for line in ids_obj)
    except IOError:
        # Report the path: the id set is still unbound when open() fails.
        raise FilterError('Hit IDs file missing ("%s").' % (ids_file,))

    if type != 'fasta':
        raise UtilsError("type '%s' is not implemented" % (type,))

    source = SequenceSource(source_file)
    filtered_count, survived_count = 0, 0

    # Nested 'with' blocks close both outputs even if the loop raises.
    with open(filtered_dest_file, 'w') as filtered_output:
        with open(survived_dest_file, 'w') as survived_output:
            while source.next():
                if source.pos % 10000 == 0 or source.pos == 1:
                    sys.stderr.write('\rSplitting FASTA file: ~ %s' % (pp(source.pos)))
                    sys.stderr.flush()

                record = '>%s\n%s\n' % (source.id, source.seq)
                if source.id in ids_to_filter:
                    ids_to_filter.remove(source.id)
                    filtered_output.write(record)
                    filtered_count += 1
                else:
                    survived_output.write(record)
                    survived_count += 1

            sys.stderr.write('\n')

    debug('%s; done. of %s total reads, filtered: %s, survived: %s.'
          % (my_name(), pp(filtered_count + survived_count),
             pp(filtered_count), pp(survived_count)))

    return True
Ejemplo n.º 18
0
def search(m):
    """Run the search command on every part file and merge the outputs.

    For each path in m.files["parts"] the SEARCH_COMMAND template is filled
    with the part path as input, "<part>.b6" as output, m.target_db as
    target, "<part>.log" as log and the space-joined m.cmdparams, then run
    through utils.run_command. The per-part ".b6" files are finally
    concatenated into m.files["search_output"].
    """
    parts = m.files["parts"]
    total = len(parts)
    # enumerate gives the running position directly; looking it up with
    # parts.index() rescans the list on every iteration.
    for position, part in enumerate(parts):
        params = {
            "input": part,
            "output": part + ".b6",
            "target": m.target_db,
            "log": part + ".log",
            "cmdparams": " ".join(m.cmdparams),
        }
        debug("searching part %d/%d (log: %s)" % (position + 1, total, params["log"]))
        cmdline = SEARCH_COMMAND % params
        utils.run_command(cmdline)

    dest_file = m.files["search_output"]
    utils.concatenate_files(dest_file, [part + ".b6" for part in m.files["parts"]])
Ejemplo n.º 19
0
    def __init__(self, args, constants):
        """Build the configuration from parsed arguments and constants.

        Nothing is initialized when args is falsy. Otherwise the work
        directory and dataset name are stored with spaces replaced by
        underscores, and modules, essential files/directories, the filters
        config and the chain of filters are initialized in that order.
        """
        if not args:
            return

        self.args = args
        self.constants = constants
        self.base_work_dir = self.args.base_work_dir.replace(' ', '_')
        self.dataset_name = self.args.dataset_name.replace(' ', '_')
        self.input = self.args.input

        self.dataset_root_dir = os.path.join(self.base_work_dir, self.dataset_name)
        self.filters = []
        self.modules = {}

        debug('Initializing configuration')
        self.init_modules()
        self.init_essential_files_and_directories()
        self.init_filters_config(args.filters_config)
        self.init_chain_of_filters()

        counts = (len(self.modules), len(self.filters))
        debug('Config class is initialized with %d modules and %d filters' % counts)
Ejemplo n.º 20
0
    def __init__(self, args, constants):
        """Set up the configuration object from args and constants.

        With falsy args the instance is left untouched. Otherwise paths are
        derived from the arguments (spaces become underscores) and the
        init_* steps run in a fixed order before a summary is logged.
        """
        if not args:
            return

        self.args = args
        self.constants = constants

        work_dir = self.args.base_work_dir
        self.base_work_dir = work_dir.replace(' ', '_')
        dataset = self.args.dataset_name
        self.dataset_name = dataset.replace(' ', '_')
        self.input = self.args.input

        root_dir = os.path.join(self.base_work_dir, self.dataset_name)
        self.dataset_root_dir = root_dir
        self.filters = []
        self.modules = {}

        debug('Initializing configuration')
        self.init_modules()
        self.init_essential_files_and_directories()
        self.init_filters_config(args.filters_config)
        self.init_chain_of_filters()

        module_count = len(self.modules)
        filter_count = len(self.filters)
        debug('Config class is initialized with %d modules and %d filters'
              % (module_count, filter_count))
Ejemplo n.º 21
0
def delete_files_in_dir(dir):
    """Unlink every entry that os.listdir reports for dir.

    Entries are removed with os.unlink; the directory itself is kept.
    """
    debug('%s; removing content of "%s"' % (my_name(), dir))
    entries = os.listdir(dir)
    for name in entries:
        path = os.path.join(dir, name)
        os.unlink(path)
Ejemplo n.º 22
0
    def init_filters_config(self, config_file_path):
        """Create a Filter for every section of the filters config file.

        Each section name is the path of a target database. For every
        section this validates the target path and the module, optionally
        validates a custom 'execute' order, collects the 'cmdparam.*' and
        'rfnparam.*' options, sets the output paths and appends the filter
        to self.filters.

        Raises ConfigError for an unreadable target, an unknown module,
        unknown, repeated or misordered functions in 'execute', or an
        unknown refinement parameter.
        """
        filters_config = ConfigParserWrapper(config_file_path)
        filters_config.read(config_file_path)
        for section in filters_config.sections():
            # 'flt' keeps the builtin filter() usable inside this method.
            flt = Filter(section)
            flt.name = filters_config.get(section,
                                          'filter_name').replace(' ', '_')

            # check if the target database, which happens to be the section
            # name, exists
            if not (os.path.exists(section) and os.access(section, os.R_OK)):
                raise ConfigError('Bad target (file not found / no read permission): "%s"' % section)

            # assign module
            module_from_config = filters_config.get(section, 'module')
            if module_from_config not in self.modules:
                raise ConfigError('Unknown module for filter "%s": "%s".\nAvailable modules:\n%s'
                                  % (flt.name, module_from_config, ', '.join(self.modules.keys())))
            flt.module = self.modules[module_from_config]

            # check the availability of the functions and the execution
            # order, if the default behavior has been changed manually in
            # the config file
            if filters_config.has_option(section, 'execute'):
                functions_order = flt.module.FUNCTIONS_ORDER
                execute_list_from_config = [
                    e.strip()
                    for e in filters_config.get(section, 'execute').split(',')
                ]
                for item in execute_list_from_config:
                    if item not in functions_order:
                        raise ConfigError('Unknown function for module "%s" in "%s": "%s".\nAvailable functions: %s'
                                          % (module_from_config, flt.name, item, ', '.join(functions_order)))
                if len(execute_list_from_config) != len(set(execute_list_from_config)):
                    raise ConfigError('Functions cannot be executed more than once: %s'
                                      % (', '.join(execute_list_from_config)))

                # make sure the order is right: every function must come
                # later in FUNCTIONS_ORDER than the one listed before it.
                positions = [functions_order.index(i)
                             for i in execute_list_from_config]
                if any(later <= earlier
                       for earlier, later in zip(positions, positions[1:])):
                    raise ConfigError('Order of functions to be executed is not correct: %s\nFunctions should follow this order: %s'
                                      % (', '.join(execute_list_from_config), ', '.join(functions_order)))

                flt.execution_order = execute_list_from_config

                debug(
                    'filter module functions execution order has been set: "%s"'
                    % (flt.execution_order,))

            # store command line parameters from the config file
            for option in filters_config.options(section):
                if not option.startswith('cmdparam.'):
                    continue
                # everything after the first dot is the parameter name
                param = option.split('.', 1)[1]
                opt = filters_config.get(section, option)
                flt.cmdparams.append('%s %s' % (param, opt))

            debug('command line params for filter "%s": %s ' %
                  (flt.name, flt.cmdparams))

            # store post-search refinement filters from the config file
            for option in filters_config.options(section):
                if not option.startswith('rfnparam.'):
                    continue
                param = option.split('.', 1)[1]
                opt = filters_config.get(section, option)
                if param not in flt.get_refinement_params():
                    raise ConfigError('Unknown refinement parameter for filter "%s": "%s"'
                                      % (flt.name, param))
                # ALLOWED_RFNPARAMS maps the parameter name to a callable
                # that is applied to the raw config value.
                flt.rfnparams[param] = flt.module.ALLOWED_RFNPARAMS[param](opt)

            debug('refinement line params for filter "%s": %s ' %
                  (flt.name, flt.rfnparams))

            # take care of file paths and directories
            output_dir = os.path.join(self.dataset_root_dir, flt.name)
            flt.dirs['output'] = output_dir
            flt.dirs['parts'] = os.path.join(output_dir, 'parts')
            flt.files['search_output'] = os.path.join(output_dir, '01_raw_hits.txt')
            flt.files['refined_search_output'] = os.path.join(output_dir, '02_refined_hits.txt')
            flt.files['hit_ids'] = os.path.join(output_dir, '03_hits.ids')
            flt.files['filtered_reads'] = os.path.join(output_dir, '04_filtered.fa')
            flt.files['survived_reads'] = os.path.join(output_dir, '05_survived.fa')

            self.filters.append(flt)
Ejemplo n.º 23
0
def delete_files_in_dir(dir):
    """Remove, via os.unlink, each entry listed directly inside dir.

    The directory itself is left in place.
    """
    debug('%s; removing content of "%s"' % (my_name(), dir))
    for path in [os.path.join(dir, name) for name in os.listdir(dir)]:
        os.unlink(path)
Ejemplo n.º 24
0
    def init_filters_config(self, config_file_path):
        """Create a Filter for every section of the filters config file.

        Each section name is the path of a target database. For every
        section this validates the target path and the module, optionally
        validates a custom 'execute' order, collects the 'cmdparam.*' and
        'rfnparam.*' options, sets the output paths and appends the filter
        to self.filters.

        Raises ConfigError for an unreadable target, an unknown module,
        unknown, repeated or misordered functions in 'execute', or an
        unknown refinement parameter.
        """
        filters_config = ConfigParserWrapper(config_file_path)
        filters_config.read(config_file_path)
        for section in filters_config.sections():
            # 'flt' keeps the builtin filter() usable inside this method.
            flt = Filter(section)
            flt.name = filters_config.get(section, 'filter_name').replace(' ', '_')

            # check if the target database, which happens to be the section
            # name, exists
            if not (os.path.exists(section) and os.access(section, os.R_OK)):
                raise ConfigError('Bad target (file not found / no read permission): "%s"' % section)

            # assign module
            module_from_config = filters_config.get(section, 'module')
            if module_from_config not in self.modules:
                raise ConfigError('Unknown module for filter "%s": "%s".\nAvailable modules:\n%s'
                                  % (flt.name, module_from_config, ', '.join(self.modules.keys())))
            flt.module = self.modules[module_from_config]

            # check the availability of the functions and the execution
            # order, if the default behavior has been changed manually in
            # the config file
            if filters_config.has_option(section, 'execute'):
                functions_order = flt.module.FUNCTIONS_ORDER
                execute_list_from_config = [e.strip() for e in filters_config.get(section, 'execute').split(',')]
                for item in execute_list_from_config:
                    if item not in functions_order:
                        raise ConfigError('Unknown function for module "%s" in "%s": "%s".\nAvailable functions: %s'
                                          % (module_from_config, flt.name, item, ', '.join(functions_order)))
                if len(execute_list_from_config) != len(set(execute_list_from_config)):
                    raise ConfigError('Functions cannot be executed more than once: %s'
                                      % (', '.join(execute_list_from_config)))

                # make sure the order is right: every function must come
                # later in FUNCTIONS_ORDER than the one listed before it.
                positions = [functions_order.index(i) for i in execute_list_from_config]
                if any(later <= earlier for earlier, later in zip(positions, positions[1:])):
                    raise ConfigError('Order of functions to be executed is not correct: %s\nFunctions should follow this order: %s'
                                      % (', '.join(execute_list_from_config), ', '.join(functions_order)))

                flt.execution_order = execute_list_from_config

                debug('filter module functions execution order has been set: "%s"' % (flt.execution_order,))

            # store command line parameters from the config file
            for option in filters_config.options(section):
                if not option.startswith('cmdparam.'):
                    continue
                # everything after the first dot is the parameter name
                param = option.split('.', 1)[1]
                opt = filters_config.get(section, option)
                flt.cmdparams.append('%s %s' % (param, opt))

            debug('command line params for filter "%s": %s ' % (flt.name, flt.cmdparams))

            # store post-search refinement filters from the config file
            for option in filters_config.options(section):
                if not option.startswith('rfnparam.'):
                    continue
                param = option.split('.', 1)[1]
                opt = filters_config.get(section, option)
                if param not in flt.get_refinement_params():
                    raise ConfigError('Unknown refinement parameter for filter "%s": "%s"'
                                      % (flt.name, param))
                # ALLOWED_RFNPARAMS maps the parameter name to a callable
                # that is applied to the raw config value.
                flt.rfnparams[param] = flt.module.ALLOWED_RFNPARAMS[param](opt)

            debug('refinement line params for filter "%s": %s ' % (flt.name, flt.rfnparams))

            # take care of file paths and directories
            output_dir = os.path.join(self.dataset_root_dir, flt.name)
            flt.dirs['output'] = output_dir
            flt.dirs['parts'] = os.path.join(output_dir, 'parts')
            flt.files['search_output'] = os.path.join(output_dir, '01_raw_hits.txt')
            flt.files['refined_search_output'] = os.path.join(output_dir, '02_refined_hits.txt')
            flt.files['hit_ids'] = os.path.join(output_dir, '03_hits.ids')
            flt.files['filtered_reads'] = os.path.join(output_dir, '04_filtered.fa')
            flt.files['survived_reads'] = os.path.join(output_dir, '05_survived.fa')

            self.filters.append(flt)