def train_tars(mode="prod"):
    """Train the TARS chemical-understanding model.

    NOTE(review): `mode` is currently unused — presumably kept for
    signature parity with the other task entry points; confirm.
    """
    # Redirect the Transformers cache into the managed stage BEFORE the
    # transformers-backed actor module is imported, so the library picks
    # up the override.
    # TODO: Make this cleaner somehow
    import os
    from keter.stage import FileSystemStage, get_path

    with FileSystemStage():
        cache_dir = get_path("external") / "transformers"
        os.environ["TRANSFORMERS_CACHE"] = str(cache_dir)

        from keter.actors.flair import ChemicalUnderstandingTARS

        ChemicalUnderstandingTARS()
def drug_discovery_on_moses(mode="prod"):
    """Analyze the MOSES SMILES dataset in blocks and write the results.

    Runs the configured ``Analyzer`` over the dataset in fixed-size
    blocks, concatenates the per-block frames into
    ``<output>/moses_drugs.parquet``, then rebuilds the static site.

    Args:
        mode: "prod" (default analyzer), "doc2vec", or "lda".

    Raises:
        ValueError: if ``mode`` is not one of the supported values.
    """
    from tqdm.auto import tqdm
    import pandas as pd
    from keter.stage import FileSystemStage, get_path
    from keter.actors.sklearn import Analyzer
    from keter.datasets.raw import Moses
    from keter.interfaces.chemistry import create_jamstack

    with FileSystemStage():
        # "prod" uses the default analyzer; the other modes name their model.
        if mode == "prod":
            analyzer = Analyzer()
        elif mode in ("doc2vec", "lda"):
            analyzer = Analyzer(mode)
        else:
            raise ValueError(f"Invalid mode: {mode}")

        moses = Moses().to_df()["SMILES"].tolist()
        block_size = 107609
        blocks = []

        output_dir = get_path("output")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Ceil-divide so the progress-bar total includes the final partial
        # block (the original floor division undercounted by one).
        total_blocks = -(-len(moses) // block_size)
        for i in tqdm(
            range(0, len(moses), block_size),
            total=total_blocks,
            unit="block",
        ):
            blocks.append(analyzer.analyze(moses[i : i + block_size]))

        pd.concat(blocks).reset_index(drop=True).to_parquet(
            output_dir / "moses_drugs.parquet"
        )
        create_jamstack()
def train(self):
    """Fine-tune a TARS classifier on the Tox21 corpus and store it on self.model."""
    corpus = FlairTox21().to_corpus()
    self.model = TARSClassifier(
        task_name="Toxicity",
        label_dictionary=corpus.make_label_dictionary(),
        document_embeddings="distilbert-base-uncased",
    )
    ModelTrainer(self.model, corpus).train(
        base_path=get_path("model") / self.filename,
        learning_rate=0.02,
        mini_batch_size=1,
        max_epochs=10,
    )
def to_csv(self) -> Sequence[str]:
    """Yield the constructed infection-record CSV lines.

    Streams from the xz-compressed cache file when it already exists;
    otherwise builds the records from the CoronaDeathsUSA dataset,
    writing the cache file as each record is yielded.

    Yields:
        One CSV record (str) at a time.
    """
    constructed_data_root = get_path("constructed")
    csv_file = (constructed_data_root / self.filename).with_suffix(".csv.xz")

    # Fast path: stream the previously-built cache.
    if csv_file.exists():
        with lzma.open(csv_file, "rt") as fd:
            for line in fd:
                yield line.rstrip()
        return

    corona_deaths = CoronaDeathsUSA().to_df()
    # Rename the date-like columns (those containing "/") to integer
    # Unix timestamps so they are distinguishable by type below.
    corona_deaths = corona_deaths.rename(
        columns={
            column: int(parse(column).timestamp())
            for column in corona_deaths.columns
            if "/" in column
        }
    )
    timestamp_columns = [
        column for column in corona_deaths.columns if isinstance(column, int)
    ]
    # Cumulative counts -> day-over-day deltas; drop the NaN first column.
    corona_deaths[timestamp_columns] = corona_deaths[timestamp_columns].diff(axis=1)
    corona_deaths = corona_deaths.dropna(axis=1)

    constructed_data_root.mkdir(parents=True, exist_ok=True)
    # Context manager so the cache file is closed even if the consumer
    # abandons the generator or record construction raises — the
    # original bare open/close leaked the handle on early exit.
    with lzma.open(csv_file, "wt") as fd:
        for _, series in corona_deaths.iterrows():
            for column, val in series.items():
                if isinstance(column, int):
                    for record in construct_infection_records(
                        column, val, series.Lat, series.Long_
                    ):
                        fd.write(record + "\n")
                        yield record
def download(self, assay: str):
    """Download a Tox21 assay archive and return its raw bytes.

    Args:
        assay: Name of the assay; must be listed in ``self.tox21_assays``.

    Returns:
        The zip archive contents as ``bytes``.

    Raises:
        ValueError: if ``assay`` is not a known Tox21 assay.
    """
    if assay not in self.tox21_assays:
        raise ValueError(f"Not a valid Tox21 assay: {assay}")
    # (Removed the unused `raw_dir` local — nothing was ever written there.)
    raw_url = f"https://tripod.nih.gov/tox21/assays/download/{assay}.zip"
    # Context manager closes the HTTP response deterministically instead
    # of leaking the connection as the original one-liner did.
    with urlopen(raw_url) as response:
        return response.read()
def create_jamstack():
    """Freeze the Flask app into static HTML under the output path."""
    destination = get_path("output") / "static_html"
    app.config["FREEZER_IGNORE_MIMETYPE_WARNINGS"] = True
    app.config["FREEZER_DESTINATION"] = destination
    Freezer(app).freeze()
def make_drug_db():
    """Build the drug database into the configured output directory."""
    output_dir = get_path("output")
    db.make_drug_db(output_dir)