import os


def parse_context(orig_dir):
    """Parse the context for each model"""
    bigwig = read_txt(os.path.join(orig_dir, "bigwig.txt"))
    tasks = read_txt(os.path.join(orig_dir, "chip.txt"))
    features = read_txt(os.path.join(orig_dir, "feature.txt"))

    meta_fname = os.path.join(orig_dir, "meta.txt")
    if os.path.exists(meta_fname):
        meta = read_txt(meta_fname)
        n_meta_features = len(meta)
        assert n_meta_features == 8
    else:
        meta = None
        n_meta_features = 0

    needs_gencode = "gencode" in features
    if needs_gencode:
        n_meta_features += 6

    # 4 channels for the one-hot encoded DNA sequence + one per bigwig track
    seq_n_channels = 4 + len(bigwig)
    return {
        "bigwig": bigwig,
        "tasks": tasks,
        "features": features,
        "meta": meta,
        "needs_mappability": "Unique35" in bigwig,
        "needs_rnaseq": "meta" in features,
        "needs_gencode": needs_gencode,
        "needs_cell_line": features != ["bigwig"],
        "needs_meta_features": n_meta_features > 0,
        "seq_n_channels": seq_n_channels,
        "n_meta_features": n_meta_features,
    }

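# Minimal usage sketch, assuming parse_context and a read_txt helper are in
# scope (see the read_txt sketch after test_read_txt below): build a
# throwaway model directory and derive its context. All file contents here
# are invented for illustration.
import os
import tempfile

demo_dir = tempfile.mkdtemp()
for fname, entries in [("bigwig.txt", ["DNase", "Unique35"]),
                       ("chip.txt", ["CTCF"]),
                       ("feature.txt", ["bigwig", "meta", "gencode"]),
                       ("meta.txt", ["meta_%d" % i for i in range(8)])]:
    with open(os.path.join(demo_dir, fname), "w") as f:
        f.write("\n".join(entries) + "\n")

ctx = parse_context(demo_dir)
assert ctx["seq_n_channels"] == 6       # 4 DNA channels + 2 bigwig tracks
assert ctx["needs_mappability"]         # "Unique35" is among the bigwigs
assert ctx["n_meta_features"] == 8 + 6  # 8 meta.txt lines + 6 gencode features
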
def __attrs_post_init__(self):
    """In case conda or pip are filenames pointing to existing files,
    read the files and populate the package names
    """
    if len(self.conda) == 1 and self.conda[0].endswith(".txt") and \
            os.path.exists(self.conda[0]):
        # found a conda txt file
        object.__setattr__(self, "conda", read_txt(self.conda[0]))
    if len(self.pip) == 1 and self.pip[0].endswith(".txt") and \
            os.path.exists(self.pip[0]):
        # found a pip txt file
        object.__setattr__(self, "pip", read_txt(self.pip[0]))

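# object.__setattr__ is needed because the surrounding class is frozen, so
# plain attribute assignment would raise FrozenInstanceError. A minimal,
# self-contained sketch of such a class, assuming a read_txt helper is in
# scope (the class and field names here are invented for illustration):
import os

import attr


@attr.s(frozen=True)
class Dependencies:  # hypothetical name
    conda = attr.ib(default=attr.Factory(list))
    pip = attr.ib(default=attr.Factory(list))

    def __attrs_post_init__(self):
        for field in ("conda", "pip"):
            val = getattr(self, field)
            if len(val) == 1 and val[0].endswith(".txt") and os.path.exists(val[0]):
                # bypass the frozen-instance guard
                object.__setattr__(self, field, read_txt(val[0]))


# A single existing .txt entry gets expanded into the package names it lists:
# Dependencies(conda=["tests/data/conda_requirements.txt"]).conda
#   -> ["conda_dep1", "conda_dep2"]
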
import os


def all_models_to_test(src):
    """Return a list of models to test

    By default, this method returns all the models. If a model group has a
    `test_subset.txt` file present in the group directory, then testing is
    only performed for the models listed in `test_subset.txt`.

    Args:
      src: Model source
    """
    txt_files = list_files_recursively(src.local_path, "test_subset", "txt")

    exclude = []
    include = []
    for x in txt_files:
        d = os.path.dirname(x)
        exclude += [d]
        include += [os.path.join(d, l)
                    for l in read_txt(os.path.join(src.local_path, x))]

    # try to load every extra included model -- will get tested downstream
    # for m in include:
    #     src.get_model_descr(m)

    models = src.list_models().model
    for excl in exclude:
        models = models[~models.str.startswith(excl)]
    return list(models) + include

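# Worked example of the filter, with hypothetical model and group names:
# suppose <src.local_path>/rbp/test_subset.txt contains the single line
# "UPF1". Then every model whose name starts with "rbp" is dropped from
# the full listing and "rbp/UPF1" is re-added via `include`; groups
# without a test_subset.txt are kept unchanged:
#
#   src.list_models().model  -> ["Basset", "rbp/UPF1", "rbp/PUM2"]
#   all_models_to_test(src)  -> ["Basset", "rbp/UPF1"]
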
import json
import os
import shutil
import tempfile


def load_data(vcf_file, gtf_file, fasta_file, batch_size=32,
              num_workers=0, tmpdir='/tmp/KipoiSplice/'):
    """
    Args:
      vcf_file: path to the input vcf file
      gtf_file: path to the GTF file required by the models (Ensembl)
      fasta_file: reference genome fasta file
      batch_size: batch size to use with all the models
      num_workers: number of workers to use for each model
      tmpdir (optional): path to the temporary directory where to store the predictions
    """
    # conservation scores are always included here
    contains_conservation = True
    MODELS = ["MaxEntScan/3prime", "MaxEntScan/5prime", "HAL", "labranchor"]
    features = read_txt(os.path.join(this_path, "features.txt"))

    # Could also be generated on the fly from "MODELS"
    with open(os.path.join(this_path, "model_table_cols.json"), "r") as ifh:
        model_output_col_names = json.load(ifh)

    os.makedirs(tmpdir, exist_ok=True)
    tmpdir = tempfile.mkdtemp(dir=tmpdir)

    # Generate a vcf file for each model
    for model in MODELS:
        # One could even parallelize here, e.g. using joblib
        out_vcf_fpath = os.path.join(tmpdir, model + ".vcf")
        ensure_dirs(out_vcf_fpath)
        dataloader_arguments = {"gtf_file": os.path.abspath(gtf_file),
                                "fasta_file": os.path.abspath(fasta_file)}
        if "rbp_eclip" in model:
            dataloader_arguments["use_linecache"] = True
        sel_scores = ["ref", "alt", "diff"]
        if model == "labranchor":
            sel_scores += ["logit_ref", "logit_alt"]
        score_variants(model,
                       dl_args=dataloader_arguments,
                       input_vcf=os.path.abspath(vcf_file),
                       output_vcf=out_vcf_fpath,
                       scores=sel_scores)

    # Gather the predictions from all the vcf files
    conservation_vcf = vcf_file if contains_conservation else None
    df = gather_vcfs(MODELS, tmpdir, max(num_workers, 1),
                     model_output_col_names,
                     conservation_vcf=conservation_vcf)

    # impute zeros and convert the pandas DataFrame to an array
    X = preproc(df, features).astype(float)

    # best-effort cleanup of the temporary directory
    try:
        shutil.rmtree(tmpdir)
    except OSError:
        pass

    return {
        "inputs": X,
        "metadata": {
            "variant": {
                "id": df["variant_id"].values,                # variant ID
                "chr": df["variant_chr"].values.astype(str),  # chromosome
                "pos": df["variant_pos"].values,              # position
                "ref": df["variant_ref"].values,              # reference allele
                "alt": df["variant_alt"].values,              # alternative allele
            }
        }
    }

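# Hypothetical invocation (file paths invented for illustration):
#
#   batch = load_data(vcf_file="variants.vcf",
#                     gtf_file="Homo_sapiens.GRCh37.gtf",
#                     fasta_file="GRCh37.fa")
#   X = batch["inputs"]                       # imputed feature matrix, one row per variant
#   ids = batch["metadata"]["variant"]["id"]  # variant IDs aligned with the rows of X
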
def test_read_txt():
    lines = read_txt("tests/data/conda_requirements.txt")
    assert lines == ["conda_dep1", "conda_dep2"]

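# The test above pins down the contract of the read_txt helper used
# throughout these snippets: it returns the file's non-empty lines as a
# list of stripped strings. A minimal sketch consistent with that contract
# (the real implementation may differ):
def read_txt(path):
    """Read a text file and return its non-empty lines, stripped."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]
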
import re

import numpy as np
import pandas as pd


def parse_log(path):
    """Parse a tfdragonn log file

    Args:
      path: file path to the log file

    Returns:
      pandas DataFrame
    """
    lines = read_txt(path)

    epochs = []
    auROCs = []
    auPRCs = []
    recalls = []
    num_positives_list = []
    num_negatives_list = []
    balanced_accuracies = []
    best_epoch = None
    arch_file = None
    weights_file = None
    for line in lines:
        if line.startswith("Epoch"):
            epochs.append(int(re.search(r'Epoch (.*):', line).group(1)))
        if line.startswith("Balanced Accuracy: "):
            balanced_accuracies.append(
                float(re.search(r'Balanced Accuracy: (.*)%\tauROC', line).group(1)) / 100)
            auROCs.append(
                float(re.search(r'auROC: (.*)\t auPRC', line).group(1)))
            auPRCs.append(float(re.search(r'auPRC: (.*)$', line).group(1)))
        if line.startswith("Recall at"):
            recalls.append(
                list(map(float,
                         re.search(r'FDR: (.*)%\tNum', line).group(1).split("% | "))))
            num_positives_list.append(
                int(re.search(r'Num Positives: (.*)\t', line).group(1)))
            num_negatives_list.append(
                int(re.search(r'Num Negatives: (.*)$', line).group(1)))
        if line.startswith("The best model's architecture and weights"):
            best_epoch = int(re.search(r'\(from epoch (.*)\)', line).group(1))
            arch_file = re.search(r'were saved to (.*) and', line).group(1)
            weights_file = re.search(r'json and (.*)$', line).group(1)

    # assumes the log contained at least one "Recall at ..." line
    recalls = np.array(recalls)
    if len(epochs) == 0 and len(auROCs) > 0:
        epochs = [None] * len(auROCs)

    # a plain dict keeps the column order on Python 3.7+
    # (pd.DataFrame.from_items was removed in pandas 1.0)
    dfo = pd.DataFrame({
        "path": path,
        "epoch": epochs,
        "best_epoch": best_epoch,
        "balanced_accuracy": balanced_accuracies,
        "auROC": auROCs,
        "auPRC": auPRCs,
        "recall_at_5": recalls[:, 0],
        "recall_at_10": recalls[:, 1],
        "recall_at_25": recalls[:, 2],
        "recall_at_50": recalls[:, 3],
        "num_positives": num_positives_list,
        "num_negatives": num_negatives_list,
        "arch_file": arch_file,
        "weights_file": weights_file,
    })
    if best_epoch is None:
        # fall back to the epoch with the highest auPRC
        dfo['best_epoch'] = dfo.iloc[dfo.auPRC.argmax()].epoch
    return dfo

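# The regexes above imply log lines of roughly the following shape
# (reconstructed from the patterns themselves, not from an actual
# tfdragonn log; "\t" marks a tab character):
#
#   Epoch 3:
#   Balanced Accuracy: 87.5%\tauROC: 0.912\t auPRC: 0.534
#   Recall at 5|10|25|50 FDR: 1.2% | 3.4% | 7.8% | 15.6%\tNum Positives: 1000\tNum Negatives: 50000
#   The best model's architecture and weights (from epoch 3) were saved to model.arch.json and model.weights.h5
#
# df = parse_log("train.log")            # hypothetical path
# df[["epoch", "auPRC", "best_epoch"]]   # one row per logged epoch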