def module_dbg_logger(msg: str) -> None:
    """Emit *msg* at DEBUG level if debug logging is enabled for this module.

    The message is logged only when ``CONFIG_GLOBAL["LOG_DBG_MODULES"]``
    contains this module's simple name or the wildcard ``"*"`` AND
    ``CONFIG_GLOBAL["LOG_LEVEL"]`` equals ``"debug"``. Otherwise the call is
    a no-op.

    The logger returned by ``getLogger()`` is temporarily switched to DEBUG
    so the message passes the level filter. Its previous level is restored
    afterwards, even when emitting the message raises, instead of being
    forced to INFO.

    Args:
        msg: Text to log; it is prefixed with ``"<module_name_simple>: "``.
    """
    enabled_modules = set(CONFIG_GLOBAL["LOG_DBG_MODULES"])
    if not ({module_name_simple, "*"} & enabled_modules):
        return
    if CONFIG_GLOBAL["LOG_LEVEL"] != "debug":
        return

    logger = getLogger()
    # Remember the configured level so that it is not clobbered below.
    previous_level = logger.level
    logger.setLevel(logging.DEBUG)
    try:
        debug(module_name_simple + ": " + msg)
    finally:
        logger.setLevel(previous_level)
def setup(self, args):
    """Populate this settings object from the environment, config and CLI.

    Steps, in order: load ``.env`` from the current working directory, read
    the config file named by ``args.config_file``, merge the arguments,
    ``os.environ`` and the config via ``build_settings``, normalise the path
    and log-level attributes, configure ``colorlog``, and finally fetch the
    definitions, pattern and schema for ``args.schema_path``.

    Args:
        args: Parsed arguments; ``config_file`` and ``schema_path`` are read
            here, and the whole object is handed to ``build_settings``.

    Raises:
        ValueError: If the schema path has fewer than three components, as
            they are unpacked into SIMULATION_ROUND, PRODUCT and SECTOR.
    """
    # Set up the environment from the .env file in the working directory.
    load_dotenv(Path.cwd() / '.env')

    # Read the config file, then combine args, os.environ and config.
    config = self.read_config(args.config_file)
    self.build_settings(args, os.environ, config)

    # Turn configured locations into expanded Path objects; only the
    # unchecked path falls back to a default (the working directory).
    if self.UNCHECKED_PATH is None:
        self.UNCHECKED_PATH = Path.cwd()
    else:
        self.UNCHECKED_PATH = Path(self.UNCHECKED_PATH).expanduser()

    if self.CHECKED_PATH is not None:
        self.CHECKED_PATH = Path(self.CHECKED_PATH).expanduser()

    self.LOG_LEVEL = self.LOG_LEVEL.upper()

    if self.LOG_PATH is not None:
        self.LOG_PATH = Path(self.LOG_PATH).expanduser()

    # Configure logging.
    colorlog.basicConfig(
        level=self.LOG_LEVEL,
        format=' %(log_color)s%(levelname)-8s : %(message)s%(reset)s',
    )

    # The first three components of the schema path name the simulation
    # round, the product and the sector.
    self.SCHEMA_PATH = Path(args.schema_path)
    simulation_round, product, sector = self.SCHEMA_PATH.parts[:3]
    self.SIMULATION_ROUND = simulation_round
    self.PRODUCT = product
    self.SECTOR = sector

    # Fetch definitions, pattern and schema from the protocol locations.
    self.DEFINITIONS = fetch_definitions(
        self.PROTOCOL_LOCATIONS.split(), self.SCHEMA_PATH
    )
    self.PATTERN = fetch_pattern(
        self.PROTOCOL_LOCATIONS.split(), self.SCHEMA_PATH
    )
    self.SCHEMA = fetch_schema(
        self.PROTOCOL_LOCATIONS.split(), self.SCHEMA_PATH
    )

    # Log the resulting settings.
    colorlog.debug(self)
def init(self):
    """Precompute the per-drug lookups and dummy variables used by ANOVA.

    Builds, from ``self.ic50`` and the tissue/MSI/media factors:

    * ``self.ic50_dict``: for each drug, a dict with ``'Y'`` (the non-NA
      IC50 values), ``'indices'`` (the index labels of ``self.ic50.df``
      where an IC50 was measured; these are COSMIC identifiers, not
      positions) and ``'real_indices'`` (the matching 0-based row positions
      in ``self.ic50.df``).
    * ``self.msi_dict``, ``self.media_dict``, ``self.tissue_dict``: the
      factor values restricted to each drug's measured samples.
    * ``self._tissue_dummies``: dummy columns for the OLS computation
      (intercept, tissue, optional media, plus placeholder ``C(msi)[T.1]``
      and ``feature`` columns filled with ones).

    It also resets ``self.individual_anova`` and sets ``self._init_called``.
    """
    # Some preprocessing to speed up data access in ANOVA: stack the IC50
    # matrix into one series and drop missing measurements once for all.
    ic50_parse = self.ic50.df.copy().unstack().dropna()

    self.ic50_dict = {}
    for drug in self.ic50.drugIds:
        measured = ic50_parse.loc[drug]
        self.ic50_dict[drug] = {
            'indices': measured.index,
            'Y': measured.values,
        }

    # Map each identifier to its first row position so that the lookup is
    # O(1) per sample; setdefault mirrors list.index() on duplicates.
    row_position = {}
    for position, cosmic_id in enumerate(self.ic50.df.index):
        row_position.setdefault(cosmic_id, position)
    for entry in self.ic50_dict.values():
        entry['real_indices'] = [
            row_position[cosmic_id] for cosmic_id in entry['indices']
        ]

    # Save the tissues, the MSI (Microsatellite instability) status of the
    # samples, and the (growth) media factor.
    self._autoset_tissue_factor()
    self._autoset_msi_factor()
    self._autoset_media_factor()

    # Dictionaries to speed up code, filled for each drug once for all.
    self.msi_dict = {}
    self.tissue_dict = {}
    self.media_dict = {}
    for drug_name in self.ic50.drugIds:
        # NOTE: indices are actually cosmic ids (not indices from 0 to N).
        indices = self.ic50_dict[drug_name]['indices']
        # MSI, media and tissue are not large and can be stored entirely.
        if self.features.found_msi:
            self.msi_dict[drug_name] = self.msi_factor.loc[indices]
        if self.settings.include_media_factor:
            self.media_dict[drug_name] = self.media_factor.loc[indices]
        self.tissue_dict[drug_name] = self.tissue_factor.loc[indices]

    # Some preprocessing for the OLS computation: create the tissue dummies
    # once for all. To agree with the R convention (a < B == b < c instead
    # of Python's A < B < C < a < b < c) the columns are re-sorted case
    # insensitively. The data columns are reordered BEFORE being renamed so
    # that every label stays attached to its own indicator column.
    tissue_dummies = pd.get_dummies(self.tissue_factor)
    ordered = sorted(tissue_dummies.columns, key=lambda s: s.lower())
    tissue_dummies = tissue_dummies[ordered].copy()
    tissue_dummies.columns = ['C(tissue)[T.' + x + ']' for x in ordered]
    self._tissue_dummies = tissue_dummies

    if self.settings.include_media_factor:
        self._media_dummies = pd.get_dummies(self.media_factor)
        media_columns = [
            'C(media)[T.' + x + ']' for x in self._media_dummies.columns
        ]
        self._media_dummies.columns = media_columns
        for col in media_columns:
            self._tissue_dummies[col] = self._media_dummies[col]

    # Placeholder columns filled with ones.
    self._tissue_dummies['C(msi)[T.1]'] = 1
    self._tissue_dummies['feature'] = 1
    self._tissue_dummies.insert(0, 'Intercept', 1)

    # NOTE: the first tissue (and media) category, which seems to be used
    # as a reference in the regression, is deliberately not dropped here.

    # Reset the buffer.
    self.individual_anova = {}

    # Report the included factors only on the first call.
    if self.verbose and self._init_called is False:
        mode = self._get_analysis_mode()
        for this in ['tissue', 'media', 'msi', 'feature']:
            if this in mode:
                logger.debug(this.upper() + " FACTOR : included")
            else:
                logger.debug(this.upper() + " FACTOR : NOT included")
    self._init_called = True
def main():
    """Emit one debug-level probe message through ``colorlog``."""
    probe = "Which logging format do we have?"
    colorlog.debug(probe)