Example #1
def parse_context(orig_dir):
    """Parses the context for each model
    """
    bigwig = read_txt(os.path.join(orig_dir, "bigwig.txt"))
    tasks = read_txt(os.path.join(orig_dir, "chip.txt"))
    features = read_txt(os.path.join(orig_dir, "feature.txt"))
    meta_fname = os.path.join(orig_dir, "meta.txt")
    if os.path.exists(meta_fname):
        meta = read_txt(meta_fname)
        n_meta_features = len(meta)
        assert n_meta_features == 8
    else:
        meta = None
        n_meta_features = 0

    needs_gencode = "gencode" in features
    if needs_gencode:
        n_meta_features += 6
    seq_n_channels = 4 + len(bigwig)

    return {
        "bigwig": bigwig,
        "tasks": tasks,
        "features": features,
        "meta": meta,
        "needs_mappability": "Unique35" in bigwig,
        "needs_rnaseq": "meta" in features,
        "needs_gencode": needs_gencode,
        "needs_cell_line": ["bigwig"] != features,
        "needs_meta_features": n_meta_features > 0,
        "seq_n_channels": seq_n_channels,
        "n_meta_features": n_meta_features,
    }
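
A minimal usage sketch for the function above. The directory layout and the example track names are assumptions made for illustration; the only behaviour taken from the code is that `read_txt` returns the lines of each file as a list.

# Hypothetical layout of orig_dir (file names taken from the function above,
# entries are illustrative):
#
#   orig_dir/bigwig.txt   -> e.g. "DNase", "Unique35"
#   orig_dir/chip.txt     -> ChIP-seq task names
#   orig_dir/feature.txt  -> e.g. "bigwig", "meta", "gencode"
#   orig_dir/meta.txt     -> optional; exactly 8 meta-feature names
ctx = parse_context("orig_dir")
print(ctx["seq_n_channels"])   # 4 DNA channels plus one channel per bigwig track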
Example #2
    def __attrs_post_init__(self):
        """
        In case conda or pip are filenames pointing to existing files,
        read the files and populate the package names
        """
        if len(self.conda) == 1 and self.conda[0].endswith(".txt") and \
           os.path.exists(self.conda[0]):
            # found a conda txt file
            object.__setattr__(self, "conda", read_txt(self.conda[0]))

        if len(self.pip) == 1 and self.pip[0].endswith(".txt") and \
           os.path.exists(self.pip[0]):
            # found a pip txt file
            object.__setattr__(self, "pip", read_txt(self.pip[0]))
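
The `object.__setattr__` calls suggest the class is declared as a frozen `attrs` class, where ordinary attribute assignment raises an error. A minimal, self-contained sketch of that pattern follows; the class name, fields, and the `read_txt` helper are assumptions for illustration, not the project's actual definitions.

import os
import attr


def read_txt(path):
    # assumed helper: one package name per non-empty line
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


@attr.s(frozen=True)
class Dependencies:
    conda = attr.ib(default=attr.Factory(list))
    pip = attr.ib(default=attr.Factory(list))

    def __attrs_post_init__(self):
        # frozen instances forbid `self.conda = ...`, hence object.__setattr__
        if len(self.conda) == 1 and self.conda[0].endswith(".txt") \
                and os.path.exists(self.conda[0]):
            object.__setattr__(self, "conda", read_txt(self.conda[0]))


# deps = Dependencies(conda=["conda-requirements.txt"])  # hypothetical file name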
Example #3
def all_models_to_test(src):
    """Returns a list of models to test

    By default, this method returns all the models. If a model group has a
    `test_subset.txt` file in its directory, testing is performed only for
    the models listed in `test_subset.txt`.

    Args:
      src: Model source
    """
    txt_files = list_files_recursively(src.local_path, "test_subset", "txt")

    exclude = []
    include = []
    for x in txt_files:
        d = os.path.dirname(x)
        exclude += [d]
        include += [os.path.join(d, l)
                    for l in read_txt(os.path.join(src.local_path, x))]

    # try to load every model extra included -- will get tested downstream
    # for m in include:
    #     src.get_model_descr(m)

    models = src.list_models().model
    for excl in exclude:
        models = models[~models.str.startswith(excl)]
    return list(models) + include
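
To make the include/exclude logic concrete, here is a small runnable sketch of the filtering step on a hypothetical model listing; the group and model names are made up, and `src.list_models().model` is assumed to be a pandas Series of model paths.

import pandas as pd

# Hypothetical listing: GroupA contains a test_subset.txt naming only "model1"
models = pd.Series(["GroupA/model1", "GroupA/model2", "GroupB/model3"])
exclude = ["GroupA"]                  # every group that has a test_subset.txt
include = ["GroupA/model1"]           # models listed inside that file

for excl in exclude:
    models = models[~models.str.startswith(excl)]

print(list(models) + include)         # ['GroupB/model3', 'GroupA/model1']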
Example #4
def load_data(vcf_file, gtf_file, fasta_file,
              batch_size=32,
              num_workers=0,
              tmpdir='/tmp/KipoiSplice/'):
    """
    Args:
      vcf_file: path to the input VCF file
      gtf_file: path to the GTF file required by the models (Ensembl annotation)
      fasta_file: reference genome FASTA file
      batch_size: batch size to use with all the models
      num_workers: number of workers to use for each model
      tmpdir (optional): path to the temporary directory in which to store the predictions
    """
    # contains_conservation is not optional here
    contains_conservation = True

    MODELS = ["MaxEntScan/3prime", "MaxEntScan/5prime", "HAL", "labranchor"]
    features = read_txt(os.path.join(this_path, "features.txt"))

    # Could also be generated on the fly from "MODELS"
    with open(os.path.join(this_path, "model_table_cols.json"), "r") as ifh:
        model_output_col_names = json.load(ifh)

    os.makedirs(tmpdir, exist_ok=True)
    tmpdir = tempfile.mkdtemp(dir=tmpdir)

    # Generate a vcf file for each model
    for model in MODELS:
        # One could even parallelize here using joblib for example
        out_vcf_fpath = os.path.join(tmpdir, model + ".vcf")
        ensure_dirs(out_vcf_fpath)
        dataloader_arguments = {"gtf_file": os.path.abspath(gtf_file),
                                "fasta_file": os.path.abspath(fasta_file)}
        if "rbp_eclip" in model:
            dataloader_arguments["use_linecache"] = True
        sel_scores = ["ref", "alt", "diff"]
        if model == "labranchor":
            sel_scores += ["logit_ref", "logit_alt"]
        score_variants(model,
                       dl_args=dataloader_arguments,
                       input_vcf=os.path.abspath(vcf_file),
                       output_vcf=out_vcf_fpath,
                       scores=sel_scores)

    # Gather the predictions from all the vcf files
    conservation_vcf = None
    if contains_conservation:
        conservation_vcf = vcf_file
    df = gather_vcfs(MODELS, tmpdir, max(num_workers, 1),
                     model_output_col_names,
                     conservation_vcf=conservation_vcf)

    # impute zeros, convert the pandas dataframe to the array
    X = preproc(df, features).astype(float)

    try:
        shutil.rmtree(tmpdir)
    except OSError:
        # best-effort cleanup of the temporary directory
        pass

    return {
        "inputs": X,
        "metadata": {
            "variant": {
                "id": df["variant_id"].values,  # have the variant ID
                "chr": df["variant_chr"].values.astype(str),  # get the chromosome
                "pos": df["variant_pos"].values,  # get the position
                "ref": df["variant_ref"].values,  # get the reference allele
                "alt": df["variant_alt"].values,  # get the alternative allele
            }
        }
    }
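
A hypothetical invocation of the dataloader above. The paths are placeholders, and the per-model `score_variants` step requires the listed Kipoi models and their dependencies to be installed.

batch = load_data(vcf_file="variants.vcf",      # placeholder paths
                  gtf_file="annotation.gtf",
                  fasta_file="genome.fa",
                  num_workers=4)
print(batch["inputs"].shape)                    # (n_variants, n_features)
print(batch["metadata"]["variant"]["id"][:5])   # first few variant identifiers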
Example #5
def test_read_txt():
    lines = read_txt("tests/data/conda_requirements.txt")
    assert lines == ["conda_dep1", "conda_dep2"]
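
`read_txt` itself is not shown in any of these examples. A minimal implementation consistent with the test above and with the callers (one entry per non-empty line, whitespace stripped) could look like the sketch below; it is an assumption, not the project's actual helper.

def read_txt(path):
    """Read a text file and return its non-empty lines, stripped."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]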
Example #6
def parse_log(path):
    """Parse tfdragonn log file

    Args:
      path: file path to the log file

    Returns:
      pandas DataFrame
    """
    lines = read_txt(path)
    epochs = []
    auROCs = []
    auPRCs = []
    recalls = []
    num_positives_list = []
    num_negatives_list = []
    balanced_accuracies = []
    best_epoch = None
    arch_file = None
    weights_file = None
    for line in lines:
        if line.startswith("Epoch"):
            epochs.append(int(re.search('Epoch (.*):', line).group(1)))
        if line.startswith("Balanced Accuracy: "):
            balanced_accuracies.append(
                float(
                    re.search('Balanced Accuracy: (.*)%\tauROC',
                              line).group(1)) / 100)
            auROCs.append(
                float(re.search('auROC: (.*)\t auPRC', line).group(1)))
            auPRCs.append(float(re.search('auPRC: (.*)$', line).group(1)))
        if line.startswith("Recall at"):
            recalls.append(
                list(
                    map(
                        float,
                        re.search('FDR: (.*)%\tNum',
                                  line).group(1).split("% | "))))
            num_positives_list.append(
                int(re.search('Num Positives: (.*)\t', line).group(1)))
            num_negatives_list.append(
                int(re.search('Num Negatives: (.*)$', line).group(1)))
        if line.startswith("The best model's architecture and weights"):
            best_epoch = int(re.search(r'\(from epoch (.*)\)', line).group(1))
            arch_file = re.search('were saved to (.*) and', line).group(1)
            weights_file = re.search('json and (.*)$', line).group(1)

    recalls = np.array(recalls)
    if len(epochs) == 0 and len(auROCs) > 0:
        epochs = [None] * len(auROCs)

    dfo = pd.DataFrame(dict([  # dict preserves column order (Python >= 3.7)
        ("path", path),
        ("epoch", epochs),
        ("best_epoch", best_epoch),
        ("balanced_accuracy", balanced_accuracies),
        ("auROC", auROCs),
        ("auPRC", auPRCs),
        ("recall_at_5", recalls[:, 0]),
        ("recall_at_10", recalls[:, 1]),
        ("recall_at_25", recalls[:, 2]),
        ("recall_at_50", recalls[:, 3]),
        ("num_positives", num_positives_list),
        ("num_negatives", num_negatives_list),
        ("arch_file", arch_file),
        ("weights_file", weights_file),
    ]))
    if best_epoch is None:
        dfo['best_epoch'] = dfo.iloc[dfo.auPRC.argmax()].epoch
    return dfo
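
The log format itself is not shown; the lines below are reconstructed from the regular expressions in `parse_log` (tab-separated fields, hypothetical values), so the real tfdragonn output may differ in detail.

example_lines = [
    "Epoch 3:",
    "Balanced Accuracy: 87.5%\tauROC: 0.912\t auPRC: 0.845",
    ("Recall at 5/10/25/50% FDR: 10.0% | 20.0% | 35.0% | 50.0%"
     "\tNum Positives: 1200\tNum Negatives: 48000"),
    ("The best model's architecture and weights (from epoch 3) "
     "were saved to model.arch.json and model.weights.h5"),
]
# Written to a file and passed to parse_log, these lines would yield a
# single-row DataFrame with epoch=3, auROC=0.912, auPRC=0.845, the four
# recall-at-FDR columns, and the saved architecture/weights file names.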