Ejemplo n.º 1
0
    def __init__(self, ratios, args):
        """Set up a run from an input matrix and a configuration mapping.

        ratios -- input matrix; its num_rows and num_columns attributes are
                  logged here
        args   -- configuration mapping, kept as self.config_params.
                  Keys read here: 'resume', 'out_database' (only when
                  resuming), 'num_clusters', 'memb.clusters_per_row',
                  'memb.clusters_per_col', 'use_operons' and
                  args['MEME']['version']
        """
        self.__membership = None
        self.__organism = None
        self.config_params = args
        self.ratios = ratios
        if args['resume']:
            # resuming: both seeders are created from the output database
            self.row_seeder = memb.make_db_row_seeder(args['out_database'])
            self.column_seeder = memb.make_db_column_seeder(args['out_database'])
        else:
            # fresh run: k-means row seeder and the microarray column seeder
            self.row_seeder = memb.make_kmeans_row_seeder(args['num_clusters'])
            self.column_seeder = microarray.seed_column_members
        self.__conn = None

        logging.info('Input matrix has # rows: %d, # columns: %d',
                     ratios.num_rows, ratios.num_columns)
        logging.info("# clusters/row: %d", args['memb.clusters_per_row'])
        logging.info("# clusters/column: %d", args['memb.clusters_per_col'])
        logging.info("# CLUSTERS: %d", args['num_clusters'])
        logging.info("use operons: %d", args['use_operons'])

        # a falsy MEME version entry is reported as an error, but
        # initialization still completes
        if args['MEME']['version']:
            logging.info('using MEME version %s', args['MEME']['version'])
        else:
            logging.error('MEME not detected - please check')
Ejemplo n.º 2
0
    def __init__(self, ratios, args_in):
        """Set up a run from an input matrix and a configuration mapping.

        ratios  -- input matrix; its num_rows and num_columns attributes are
                   logged here
        args_in -- configuration mapping, kept as self.config_params.
                   Keys read here: 'resume', 'out_database' and
                   'new_data_file' (both only when resuming),
                   'num_clusters', 'memb.clusters_per_row',
                   'memb.clusters_per_col', 'use_operons' and
                   args_in['MEME']['version']
        """
        self.__membership = None
        self.__organism = None
        self.config_params = args_in
        self.ratios = ratios
        if args_in['resume']:
            self.row_seeder = memb.make_db_row_seeder(args_in['out_database'])

            # The data file has changed: use the regular column seeder
            # instead of the one created from the database.
            # The explicit comparison is kept on purpose: only a value equal
            # to True selects this branch, other truthy values do not.
            if args_in['new_data_file'] == True:
                self.column_seeder = microarray.seed_column_members
            else:
                self.column_seeder = memb.make_db_column_seeder(args_in['out_database'])
        else:
            # fresh run: k-means row seeder and the microarray column seeder
            self.row_seeder = memb.make_kmeans_row_seeder(args_in['num_clusters'])
            self.column_seeder = microarray.seed_column_members
        self.__conn = None

        logging.info('Input matrix has # rows: %d, # columns: %d',
                     ratios.num_rows, ratios.num_columns)
        logging.info("# clusters/row: %d", args_in['memb.clusters_per_row'])
        logging.info("# clusters/column: %d", args_in['memb.clusters_per_col'])
        logging.info("# CLUSTERS: %d", args_in['num_clusters'])
        logging.info("use operons: %d", args_in['use_operons'])

        # a falsy MEME version entry is reported as an error, but
        # initialization still completes
        if args_in['MEME']['version']:
            logging.info('using MEME version %s', args_in['MEME']['version'])
        else:
            logging.error('MEME not detected - please check')
Ejemplo n.º 3
0
    def __init__(self, organism_code, ratio_matrix,
                 string_file=None,
                 num_clusters=None):
        """Configure logging and fill in the default run parameters.

        Parameters are stored through item assignment on self; the item
        access methods are defined outside this block.

        organism_code -- stored under 'organism_code' and embedded in the
                         checkpoint base name
        ratio_matrix  -- input matrix; the result of its
                         sorted_by_row_name() is kept as self.ratio_matrix
        string_file   -- stored under 'string_file' (default None)
        num_clusters  -- number of clusters; when None it is derived as
                         round(num_rows * clusters_per_row / 20)
        """
        logging.basicConfig(format=LOG_FORMAT,
                            datefmt='%Y-%m-%d %H:%M:%S',
                            level=logging.DEBUG)
        self.__membership = None
        self.__organism = None
        self.config_params = {}
        self.ratio_matrix = ratio_matrix.sorted_by_row_name()

        # membership update default parameters
        # these come first, since a lot depends on clustering numbers
        self['memb.clusters_per_row'] = 2
        if num_clusters is None:
            num_clusters = int(round(self.ratio_matrix.num_rows *
                                     self['memb.clusters_per_row'] / 20.0))
        # half of the clusters per column
        self['memb.clusters_per_col'] = int(round(num_clusters / 2.0))
        self['memb.prob_row_change'] = 0.5
        self['memb.prob_col_change'] = 1.0
        self['memb.max_changes_per_row'] = 1
        self['memb.max_changes_per_col'] = 5

        self['organism_code'] = organism_code
        self['num_clusters'] = num_clusters
        logging.info("# CLUSTERS: %d", self['num_clusters'])

        # defaults
        self.row_seeder = memb.make_kmeans_row_seeder(num_clusters)
        self.column_seeder = microarray.seed_column_members
        self['row_scaling'] = 6.0
        # set once from the argument; nothing reads it in between
        self['string_file'] = string_file
        self['cache_dir'] = CACHE_DIR
        self['output_dir'] = 'out'
        self['start_iteration'] = 1
        self['num_iterations'] = 2000
        self['multiprocessing'] = True
        # NOTE: quantile normalization is off by default in cMonkey-R,
        # but it is switched on here
        self['quantile_normalize'] = True

        # used to select sequences and MEME
        self['sequence_types'] = ['upstream']
        self['search_distances'] = {'upstream': (-20, 150)}
        # used for background distribution and MAST
        self['scan_distances'] = {'upstream': (-30, 250)}

        # membership default parameters
        self['memb.min_cluster_rows_allowed'] = 3
        self['memb.max_cluster_rows_allowed'] = 70
        # the output database path is derived from the output directory
        self['out_database'] = self['output_dir'] + '/cmonkey_run.db'

        today = date.today()
        self.CHECKPOINT_INTERVAL = None
        # base name: organism code followed by today's year, month and day
        # (the numbers are not zero-padded)
        self.__checkpoint_basename = "cmonkey-checkpoint-%s-%d%d%d" % (
            organism_code, today.year, today.month, today.day)
Ejemplo n.º 4
0
    def __init__(self, organism_code, ratio_matrix, num_clusters=None):
        """Fill in the default run parameters for a bare-bones instance.

        Parameters are stored through item assignment on self; the item
        access methods are defined outside this block.

        organism_code -- stored under 'organism_code' and embedded in the
                         checkpoint base name
        ratio_matrix  -- input matrix; the result of its
                         sorted_by_row_name() is kept as self.ratio_matrix
        num_clusters  -- number of clusters; when None it is derived as
                         round(num_rows() * clusters_per_row / 20)
        """
        self.__membership = None
        self.__organism = None
        self.config_params = {}
        self.ratio_matrix = ratio_matrix.sorted_by_row_name()

        # membership update default parameters
        # these come first, since a lot depends on clustering numbers
        self['memb.clusters_per_row'] = 2
        if num_clusters is None:
            # num_rows is called as a method on this matrix type
            num_clusters = int(round(self.ratio_matrix.num_rows() *
                                     self['memb.clusters_per_row'] / 20.0))
        # two thirds of the clusters per column
        self['memb.clusters_per_col'] = int(round(num_clusters * 2.0 / 3.0))
        self['memb.prob_row_change'] = 0.5
        self['memb.prob_col_change'] = 1.0
        self['memb.max_changes_per_row'] = 1
        self['memb.max_changes_per_col'] = 5

        self['organism_code'] = organism_code
        self['num_clusters'] = num_clusters

        # defaults
        self.row_seeder = memb.make_kmeans_row_seeder(num_clusters)
        self.column_seeder = microarray.seed_column_members
        self['row_scaling'] = 6.0
        self['string_file'] = None
        self['cache_dir'] = CACHE_DIR
        self['output_dir'] = 'out'
        self['start_iteration'] = 1
        self['num_iterations'] = 2000
        self['multiprocessing'] = True

        # used to select sequences and MEME
        self['sequence_types'] = ['upstream']
        self['search_distances'] = {'upstream': (-20, 150)}
        # used for background distribution and MAST
        self['scan_distances'] = {'upstream': (-30, 250)}

        # membership default parameters
        self['memb.min_cluster_rows_allowed'] = 3
        self['memb.max_cluster_rows_allowed'] = 70

        today = date.today()
        self.CHECKPOINT_INTERVAL = None
        # base name: organism code followed by today's year, month and day
        # (the numbers are not zero-padded)
        self.__checkpoint_basename = "cmonkey-checkpoint-%s-%d%d%d" % (
            organism_code, today.year, today.month, today.day)
        # parenthesized single argument: prints the same text whether print
        # is the Python 2 statement or the print function
        print("inited the bare bones main cMonkey instance")
Ejemplo n.º 5
0
    def __init__(self, organism_code, ratio_matrix, num_clusters=None):
        """Run the CMonkeyRun initializer, then apply this class's defaults.

        Parameters are stored through item assignment on self; the item
        access methods are defined outside this block.

        organism_code -- stored under 'organism_code' and embedded in the
                         checkpoint base name
        ratio_matrix  -- input matrix; the result of its
                         sorted_by_row_name() is kept as self.ratio_matrix
        num_clusters  -- number of clusters; when None it is derived as
                         round(num_rows() * clusters_per_row / 20)
        """
        cmonkey_run.CMonkeyRun.__init__(self, organism_code, ratio_matrix, num_clusters)

        # NOTE(review): double-underscore attributes are name-mangled with
        # the enclosing class name, so if this class is a subclass of
        # CMonkeyRun these are separate from the ones CMonkeyRun sets
        # - confirm that is intended.
        self.__membership = None
        self.__organism = None
        # NOTE(review): config_params is replaced with a fresh dict here, so
        # any entries the CMonkeyRun initializer stored in it are discarded
        # - presumably intentional, since all defaults are set again below;
        # confirm.
        self.config_params = {}
        self.ratio_matrix = ratio_matrix.sorted_by_row_name()

        # membership update default parameters
        # these come first, since a lot depends on clustering numbers
        self['memb.clusters_per_row'] = 2
        if num_clusters is None:
            # num_rows is called as a method on this matrix type
            num_clusters = int(round(self.ratio_matrix.num_rows() *
                                     self['memb.clusters_per_row'] / 20.0))
        # two thirds of the clusters per column
        self['memb.clusters_per_col'] = int(round(num_clusters * 2.0 / 3.0))
        self['memb.prob_row_change'] = 0.5
        self['memb.prob_col_change'] = 1.0
        self['memb.max_changes_per_row'] = 1
        self['memb.max_changes_per_col'] = 5

        self['organism_code'] = organism_code
        self['num_clusters'] = num_clusters
        # the \x1b[...m sequences are ANSI escape codes wrapped around the
        # "Main:" prefix
        logging.info("\x1b[31mMain:\t\x1b[0m# CLUSTERS: %d", self['num_clusters'])

        # defaults
        self.row_seeder = memb.make_kmeans_row_seeder(num_clusters)
        self.column_seeder = microarray.seed_column_members
        self['row_scaling'] = 6.0
        self['string_file'] = None
        self['cache_dir'] = 'cache'
        self['output_dir'] = 'out'
        self['start_iteration'] = 1
        self['num_iterations'] = 2000
        self['multiprocessing'] = True

        # membership default parameters
        self['memb.min_cluster_rows_allowed'] = 10
        self['memb.max_cluster_rows_allowed'] = 70

        # two sequence types, each with its own search and scan distances
        self['sequence_types'] = ['Promoter', '3pUTR']
        self['search_distances'] = {'Promoter': (-1000, 200), '3pUTR': (0, 500)}
        # used for background distribution and MAST
        self['scan_distances'] = {'Promoter': (-2000, 750), '3pUTR': (0, 750)}
        logging.info("\x1b[31mMain:\t\x1b[0mcM object initialized")

        today = date.today()
        self.CHECKPOINT_INTERVAL = None
        # base name: organism code followed by today's year, month and day
        # (the numbers are not zero-padded)
        self.__checkpoint_basename = "cmonkey-checkpoint-%s-%d%d%d" % (
            organism_code, today.year, today.month, today.day)
Ejemplo n.º 6
0
    def __init__(self,
                 organism_code,
                 ratio_matrix,
                 string_file=None,
                 num_clusters=None,
                 rsat_organism=None,
                 log_filename=None,
                 remap_network_nodes=False,
                 ncbi_code=None,
                 operon_file=None,
                 rsat_dir=None):
        """Configure logging and store the run parameters for this instance.

        Parameters are stored through item assignment on self; the item
        access methods are defined outside this block.

        organism_code       -- stored under 'organism_code'
        ratio_matrix        -- input matrix; num_rows and num_columns are
                               read to size the clustering
        string_file         -- stored under 'string_file'
        num_clusters        -- number of clusters; when None it is derived
                               as round(num_rows * clusters_per_row / 20)
        rsat_organism       -- stored under 'rsat_organism'
        log_filename        -- passed to logging.basicConfig as filename
        remap_network_nodes -- stored under 'remap_network_nodes'
        ncbi_code           -- stored under 'ncbi_code'
        operon_file         -- stored under 'operon_file'
        rsat_dir            -- stored under 'rsat_dir'
        """
        logging.basicConfig(level=logging.DEBUG,
                            filename=log_filename,
                            format=LOG_FORMAT,
                            datefmt='%Y-%m-%d %H:%M:%S')
        self.__membership = None
        self.__organism = None
        self.config_params = {}
        self.ratio_matrix = ratio_matrix

        # Clustering sizes are settled first because later settings
        # depend on them.
        self['memb.clusters_per_row'] = 2
        if num_clusters is None:
            num_clusters = int(round(
                self.ratio_matrix.num_rows * self['memb.clusters_per_row'] / 20.0))

        # 60 or more columns: half of the clusters per column,
        # otherwise two thirds
        per_column = (num_clusters / 2.0
                      if ratio_matrix.num_columns >= 60
                      else num_clusters * 2.0 / 3.0)
        self['memb.clusters_per_col'] = int(round(per_column))
        logging.info("# clusters/row: %d", self['memb.clusters_per_row'])
        logging.info("# clusters/column: %d", self['memb.clusters_per_col'])

        self['organism_code'] = organism_code
        self['num_clusters'] = num_clusters
        self['use_operons'] = True
        self['use_string'] = True
        self['global_background'] = True
        self['ncbi_code'] = ncbi_code
        self['remap_network_nodes'] = remap_network_nodes
        logging.info("# CLUSTERS: %d", self['num_clusters'])
        logging.info("use operons: %d", self['use_operons'])

        # default seeders
        self.row_seeder = memb.make_kmeans_row_seeder(num_clusters)
        self.column_seeder = microarray.seed_column_members

        # caller-supplied file overrides
        self['string_file'] = string_file
        self['operon_file'] = operon_file

        self['rsat_organism'] = rsat_organism
        self['rsat_dir'] = rsat_dir

        # both scoring functions are active by default
        self['donetworks'] = True
        self['domotifs'] = True

        # the checkpoint base name carries today's year, month and day
        # (not zero-padded)
        now = date.today()
        self.__checkpoint_basename = "cmonkey-checkpoint-%d%d%d" % (
            now.year, now.month, now.day)

        # a falsy MEME version is reported as an error, but initialization
        # continues
        self['meme_version'] = meme.check_meme_version()
        if self['meme_version']:
            logging.info('using MEME version %s', self['meme_version'])
        else:
            logging.error('MEME not detected - please check')

        # the pipeline is loaded from the user's default pipeline file
        # only when that file exists
        if os.path.exists(USER_DEFAULT_PIPELINE_PATH):
            with open(USER_DEFAULT_PIPELINE_PATH) as pipeline_file:
                self['pipeline'] = json.load(pipeline_file)
Ejemplo n.º 7
0
    def __init__(self, organism_code, ratio_matrix,
                 string_file=None,
                 num_clusters=None,
                 rsat_organism=None,
                 log_filename=None,
                 remap_network_nodes=False,
                 ncbi_code=None,
                 operon_file=None,
                 rsat_dir=None):
        """Initialize logging and record the configuration of this run.

        Settings are written through item assignment on self; the item
        access methods live outside this block.

        organism_code       -- kept under 'organism_code'
        ratio_matrix        -- input matrix; its num_rows and num_columns
                               drive the cluster counts
        string_file         -- kept under 'string_file'
        num_clusters        -- cluster count; if None, computed as
                               round(num_rows * clusters_per_row / 20)
        rsat_organism       -- kept under 'rsat_organism'
        log_filename        -- handed to logging.basicConfig as filename
        remap_network_nodes -- kept under 'remap_network_nodes'
        ncbi_code           -- kept under 'ncbi_code'
        operon_file         -- kept under 'operon_file'
        rsat_dir            -- kept under 'rsat_dir'
        """
        logging.basicConfig(filename=log_filename,
                            level=logging.DEBUG,
                            datefmt='%Y-%m-%d %H:%M:%S',
                            format=LOG_FORMAT)
        self.__membership = None
        self.__organism = None
        self.config_params = {}
        self.ratio_matrix = ratio_matrix

        # cluster counts go first: the remaining setup builds on them
        self['memb.clusters_per_row'] = 2
        if num_clusters is None:
            scaled_rows = (self.ratio_matrix.num_rows *
                           self['memb.clusters_per_row'] / 20.0)
            num_clusters = int(round(scaled_rows))

        # wide matrices (60+ columns) get half of the clusters per column,
        # narrower ones two thirds
        if ratio_matrix.num_columns >= 60:
            column_share = num_clusters / 2.0
        else:
            column_share = num_clusters * 2.0 / 3.0
        self['memb.clusters_per_col'] = int(round(column_share))
        logging.info("# clusters/row: %d", self['memb.clusters_per_row'])
        logging.info("# clusters/column: %d", self['memb.clusters_per_col'])

        self['organism_code'] = organism_code
        self['num_clusters'] = num_clusters
        self['use_operons'] = True
        self['use_string'] = True
        self['global_background'] = True
        self['ncbi_code'] = ncbi_code
        self['remap_network_nodes'] = remap_network_nodes
        logging.info("# CLUSTERS: %d", self['num_clusters'])
        logging.info("use operons: %d", self['use_operons'])

        # seeders used unless replaced later
        self.row_seeder = memb.make_kmeans_row_seeder(num_clusters)
        self.column_seeder = microarray.seed_column_members

        # files supplied by the caller
        self['string_file'] = string_file
        self['operon_file'] = operon_file

        self['rsat_organism'] = rsat_organism
        self['rsat_dir'] = rsat_dir

        # network and motif scoring are both enabled
        self['donetworks'] = True
        self['domotifs'] = True

        # checkpoint base name: today's year, month and day, not zero-padded
        current_day = date.today()
        date_parts = (current_day.year, current_day.month, current_day.day)
        self.__checkpoint_basename = "cmonkey-checkpoint-%d%d%d" % date_parts

        # report the detected MEME version; a falsy result is only logged
        # as an error
        self['meme_version'] = meme.check_meme_version()
        if self['meme_version']:
            logging.info('using MEME version %s', self['meme_version'])
        else:
            logging.error('MEME not detected - please check')

        # read the user's default pipeline definition if the file is there
        if os.path.exists(USER_DEFAULT_PIPELINE_PATH):
            with open(USER_DEFAULT_PIPELINE_PATH) as handle:
                self['pipeline'] = json.load(handle)