def to_df_by_assay(self, assay: str) -> pd.DataFrame:
    """Load the aggregated results table of one assay as a DataFrame.

    The archive bytes are obtained from ``cache("raw", ...)``, which is
    handed a callable that invokes ``self.download(assay)``. The first
    archive member whose name ends with ``aggregrated.txt`` is parsed as a
    tab-separated file.

    Args:
        assay: Assay identifier. It names the cache entry
            (``tox21/<assay>.zip``) and is forwarded to ``self.download``.

    Returns:
        The table read from the matching member (``index_col=False``).

    Raises:
        FileNotFoundError: If no member of the archive ends with
            ``aggregrated.txt``. Previously this case silently returned
            ``None`` despite the ``pd.DataFrame`` return annotation.
    """
    raw = cache(
        "raw",
        Path("tox21") / f"{assay}.zip",
        lambda: self.download(assay),
    )

    # NOTE(review): the "aggregrated" spelling presumably mirrors the member
    # names used inside the upstream archives -- confirm before "fixing" it.
    wanted_suffix = "aggregrated.txt"

    with ZipFile(BytesIO(raw)) as zip_fd:
        for inner_filename in zip_fd.namelist():
            if inner_filename.endswith(wanted_suffix):
                with zip_fd.open(inner_filename) as inner_fd:
                    return pd.read_csv(inner_fd, sep="\t", index_col=False)

    # Fail loudly instead of implicitly returning None to the caller.
    raise FileNotFoundError(
        f"No member ending with {wanted_suffix!r} in archive for assay {assay!r}"
    )
def __init__(self, mode="bow"):
    """Obtain the model variant selected by ``mode`` through ``cache``.

    Args:
        mode: ``"default"``, ``"bow"``, ``"lda"`` or ``"doc2vec"``. For
            ``"default"`` the bound ``self.train`` is handed to ``cache``
            as-is; for the other modes ``cache`` receives a callable that
            runs ``self.train`` with hyperparameters built by
            ``ChemicalLanguageHyperparameters.from_dict``.

    Raises:
        ValueError: If ``mode`` is a string other than the names above.
    """
    model_file = f"{self.filename}_{mode}.pkz"

    # Hyperparameter settings per mode; None means "train with no arguments".
    settings_by_mode = {
        "default": None,
        "bow": {"vector_algo": "bow", "max_vocab": 5000, "max_ngram": 4},
        "lda": {"vector_algo": "lda", "topics": 1000},
        "doc2vec": {"doc_epochs": 300, "vec_dims": 512},
    }
    if mode not in settings_by_mode:
        raise ValueError("Invalid mode: " + mode)

    settings = settings_by_mode[mode]
    if settings is None:
        trainer = self.train
    else:
        def trainer():
            # Pass a fresh dict on every invocation, as a dict literal would.
            return self.train(
                ChemicalLanguageHyperparameters.from_dict(dict(settings))
            )

    self.model = cache("model", model_file, trainer)
def __init__(self, mode="prod"):
    """Set up the preprocessor and the safety/feasibility/bbbp models.

    ``mode`` is matched by substring, so several keywords can be combined
    in a single value.

    Args:
        mode: Selects the ``ChemicalLanguage`` variant: ``"doc2vec"`` if
            that text occurs in ``mode``, otherwise ``"lda"`` if that
            occurs, otherwise ``"bow"``. If ``mode`` contains ``"test"``,
            ``self.train`` is called directly with ``score=True`` and
            ``task_duration=12000``; otherwise the result comes from
            ``cache("model", <filename>_<mode>.pkz, self.train)``.
    """
    model_file = f"{self.filename}_{mode}.pkz"

    # The first keyword found in ``mode`` wins; "bow" is the fallback.
    algo = next((name for name in ("doc2vec", "lda") if name in mode), "bow")
    self.preprocessor = ChemicalLanguage(algo)

    if "test" in mode:
        trained = self.train(score=True, task_duration=12000)
    else:
        trained = cache("model", model_file, self.train)

    # Both paths are expected to yield exactly three values.
    self.safety, self.feasibility, self.bbbp = trained
def __init__(self):
    """Create the ``"bow"`` preprocessor and obtain the model via ``cache``.

    The preprocessor is assigned before ``cache`` is called, so it is
    already set when ``self.train`` is handed over.
    """
    self.preprocessor = ChemicalLanguage("bow")

    # NOTE(review): the other cache() calls in this file pass a category
    # string, a file name and a callable; this one passes a path and a
    # callable -- confirm that this two-argument form is supported.
    model_path = MODEL_ROOT / self.filename
    self.model = cache(model_path, self.train)
def to_df(self) -> pd.DataFrame:
    """Return the result of ``cache`` for this dataset's raw parquet entry.

    The entry is keyed by the ``"raw"`` category and the name
    ``<self.filename>.parquet``; ``self.download`` is passed along as the
    callable.
    """
    return cache("raw", self.filename + ".parquet", self.download)
def to_df(self) -> pd.DataFrame:
    """Return the result of ``cache`` for this dataset's constructed entry.

    The entry is keyed by the ``"constructed"`` category and the name
    ``<self.filename>.parquet``; ``self.construct`` is passed along as the
    callable.
    """
    return cache("constructed", self.filename + ".parquet", self.construct)