Example #1
0
def dendropy_newick_from_dist_matrix(infile, outfile, method):
    """Build a tree from a distance matrix file and save it in newick format.

    The distance matrix is read with dendropy, a tree is made using the
    requested method, and the tree is written to outfile with rooting
    suppressed and all single quote characters removed.

    Args:
        infile: name of the tab-delimited distance matrix file.
        outfile: name of the newick file to write.
        method: tree building method, either "upgma" or "nj".

    Raises:
        ValueError: if method is not "upgma" or "nj". This is only checked
            after the distance matrix has been loaded.
    """
    logging.info("Calculating tree using dendropy")
    logging.info(f"Loading distance matrix file {infile}")
    with utils.open_file(infile) as matrix_handle:
        # The matrix is in the standard "phylip" format written by triphecta:
        # the first line is the number of samples, and there is no line
        # made only of sample names. So discard the first line, then tell
        # dendropy not to treat the next line as column names.
        next(matrix_handle)
        pdm = dendropy.PhylogeneticDistanceMatrix.from_csv(
            src=matrix_handle,
            is_first_row_column_names=False,
            delimiter="\t",
        )

    if method != "upgma" and method != "nj":
        raise ValueError(
            f"Got method {method}, but must be upgma or nj. Cannot continue")

    if method == "upgma":
        logging.info("Calculating upgma tree")
        tree = pdm.upgma_tree()
    else:
        logging.info("Calculating nj tree")
        tree = pdm.nj_tree()

    logging.info(f"Writing tree to file {outfile}")
    newick = tree.as_string("newick", suppress_rooting=True)
    newick_no_quotes = newick.replace("'", "")
    with utils.open_file(outfile, "w") as tree_handle:
        print(newick_no_quotes, end="", file=tree_handle)
Example #2
0
def load_distance_matrix_file(infile):
    """Load sample names and pairwise distances from a distance matrix file.

    The first line of the file must contain only the number of samples.
    Each following line is tab-delimited, starting with the sample name and
    followed by distances to the other samples. Only the lower triangle is
    used: from the line of the k-th sample (counting from zero), the
    distances to samples 0, ..., k-1 are read and the rest is ignored.

    Args:
        infile: name of the distance matrix file.

    Returns:
        Tuple (sample_names, distances). sample_names is the list of names
        in file order. distances is a dict mapping (i, j), with i < j being
        indexes into sample_names, to the distance as a float.

    Raises:
        RuntimeError: if the file is empty, if the first line is not an
            integer, or if the number of sample lines does not match the
            number given on the first line.
    """
    sample_names = []
    distances = {}
    number_of_samples = None

    with utils.open_file(infile) as f:
        for line_number, line in enumerate(f):
            if line_number == 0:
                try:
                    number_of_samples = int(line.rstrip())
                except ValueError as err:
                    raise RuntimeError(
                        f"Expected first line of distance matrix to contain a number only. Got this: {line}"
                    ) from err
                continue

            # Every sample line is split on tabs (including the first one),
            # so that a sample name containing spaces is kept intact no
            # matter which line it is on. Only the first line_number fields
            # are needed, hence the maxsplit.
            fields = line.rstrip().split("\t", maxsplit=line_number)
            sample_names.append(fields[0])
            row = line_number - 1
            for col in range(row):
                # col < row always, so (col, row) is already the sorted key
                distances[(col, row)] = float(fields[col + 1])

    if number_of_samples is None:
        # Without this the length check below would fail with a confusing
        # error about None, instead of reporting the real problem.
        raise RuntimeError(
            f"Distance matrix file {infile} is empty. Expected first line to contain the number of samples"
        )

    if len(sample_names) != number_of_samples:
        raise RuntimeError(
            f"Expected {number_of_samples} samples in distance matrix file, but got {len(sample_names)}"
        )

    return sample_names, distances
Example #3
0
    def _write_triples_names_file(cls, triples, phenos, outfile):
        """Write a TSV file with one line per triple, naming its samples.

        Each line has the 1-based triple id, the case sample, and for each
        of the two controls its sample name, genotype distance and
        phenotype distance. These are followed by three columns per
        phenotype (in sorted phenotype name order) holding the phenotype
        value of the case, control1 and control2.

        Args:
            triples: iterable of triple objects, each having the attributes
                case, control1 and control2.
            phenos: phenotypes object. Its pheno_types attribute supplies
                the phenotype names, and phenos[sample][name] the values.
            outfile: name of the TSV file to write.
        """
        pheno_names = sorted(phenos.pheno_types.keys())

        header = [
            "triple_id",
            "case",
            "control1",
            "geno_dist1",
            "pheno_dist1",
            "control2",
            "geno_dist2",
            "pheno_dist2",
        ]
        for name in pheno_names:
            header.append(f"case_{name}")
            header.append(f"control1_{name}")
            header.append(f"control2_{name}")

        with utils.open_file(outfile, "w") as out_handle:
            print(*header, sep="\t", file=out_handle)

            for triple_id, triple in enumerate(triples, start=1):
                fields = [
                    triple_id,
                    triple.case,
                    triple.control1.sample,
                    triple.control1.geno_dist,
                    triple.control1.pheno_dist,
                    triple.control2.sample,
                    triple.control2.geno_dist,
                    triple.control2.pheno_dist,
                ]
                for name in pheno_names:
                    fields.append(f"{phenos[triple.case][name]}")
                    fields.append(f"{phenos[triple.control1.sample][name]}")
                    fields.append(f"{phenos[triple.control2.sample][name]}")

                print(*fields, sep="\t", file=out_handle)
Example #4
0
def vcf_to_variant_positions_to_mask_from_bed_file(vcf_file, bed_file):
    """Find the records of a VCF file that overlap a mask from a BED file.

    A record covers positions POS-1 to POS-1 + len(REF) - 1 inclusive. It is
    reported if that range overlaps an interval of the mask, where mask
    intervals are handled as inclusive (start, end) pairs.

    Args:
        vcf_file: name of the VCF file.
        bed_file: name of the BED file, loaded with _bed_mask_file_to_dict.

    Returns:
        Dict mapping chromosome name to the set of POS-1 values of the
        records that overlap the mask. Chromosomes with no overlapping
        records are not included.
    """
    mask = _bed_mask_file_to_dict(bed_file)
    active_chrom = None
    cursor = None
    to_mask = {}

    with utils.open_file(vcf_file) as vcf_handle:
        for line in vcf_handle:
            if line.startswith("#"):
                continue

            chrom, pos, _, ref, _ = line.split("\t", maxsplit=4)
            if chrom not in mask:
                continue

            # Start again from the first interval whenever the chromosome
            # changes. Within a chromosome the cursor only moves forwards.
            # NOTE(review): this looks like it relies on the records of a
            # chromosome being in position order - confirm.
            if chrom != active_chrom:
                active_chrom = chrom
                cursor = 0

            intervals = mask[chrom]
            first_base = int(pos) - 1
            last_base = first_base + len(ref) - 1

            # Move past intervals that end before this record begins
            while cursor < len(intervals) and intervals[cursor][1] < first_base:
                cursor += 1

            if cursor >= len(intervals):
                continue

            if intervals[cursor][0] <= last_base:
                to_mask.setdefault(chrom, set()).add(first_base)

    return to_mask
Example #5
0
def test_open_file():
    """Check utils.open_file on a missing file, a plain file and a .gz file.

    Opening a file that does not exist for reading must raise OSError.
    For both a plain and a ".gz" filename, lines written through
    utils.open_file must be read back unchanged.
    """
    tmp_file = "tmp.open_file"
    # Make sure the file is absent. Done with os.unlink instead of running
    # "rm -f" in a shell, so the test does not depend on a unix shell.
    try:
        os.unlink(tmp_file)
    except FileNotFoundError:
        pass

    with pytest.raises(OSError):
        with utils.open_file(tmp_file):
            pass

    for tmp_file in "tmp.open_file", "tmp.open_file.gz":
        try:
            with utils.open_file(tmp_file, "w") as f:
                print("TEST", file=f)
                print("TEST2", file=f)
            assert os.path.exists(tmp_file)
            with utils.open_file(tmp_file) as f:
                lines = [x.rstrip() for x in f]
            assert lines == ["TEST", "TEST2"]
        finally:
            # Clean up even when an assertion fails, so that a failed run
            # does not leave files behind in the working directory.
            if os.path.exists(tmp_file):
                os.unlink(tmp_file)
Example #6
0
    def _load_phenotypes_tsv_file(cls, infile):
        """Load phenotypes of samples from a tab-delimited file.

        The file must have a header line that includes a "sample" column.
        Every other column is a phenotype. Each phenotype value is converted
        from its string using Phenotypes.convert_one_variable_string.

        Args:
            infile: name of the phenotypes TSV file.

        Returns:
            Tuple (phenos, pheno_types). phenos maps sample name to a dict
            of phenotype name -> converted value. pheno_types maps each
            phenotype name to the set of Python types of the converted
            values seen for that phenotype across all samples.

        Raises:
            RuntimeError: if there is no "sample" column (this includes an
                empty file), or if a sample name is in the file more than
                once.
        """
        phenos = {}

        with utils.open_file(infile) as f:
            reader = csv.DictReader(f, delimiter="\t")
            # fieldnames is None when the file has no header line at all.
            # Use an empty list in that case, so that an empty file gets the
            # same error as a file that has no "sample" column.
            fieldnames = reader.fieldnames or []
            if "sample" not in fieldnames:
                raise RuntimeError(
                    f"Must have a 'sample' column in phenotypes file. Not found in file {infile}"
                )
            pheno_types = {x: set() for x in fieldnames if x != "sample"}

            for row in reader:
                sample = row["sample"]
                if sample in phenos:
                    raise RuntimeError(
                        f"Duplicate sample name '{sample}' in phenotypes file {infile}"
                    )

                phenos[sample] = {
                    x: Phenotypes.convert_one_variable_string(row[x])
                    for x in row if x != "sample"
                }
                for p in pheno_types:
                    pheno_types[p].add(type(phenos[sample][p]))

        return phenos, pheno_types
Example #7
0
def load_variant_count_list_from_tsv(infile):
    """Load a list of VariantCounts from a tab-delimited file.

    The header line gives the field names. Every value on each following
    line is converted to an int, and the resulting name -> int pairs are
    passed as keyword arguments to VariantCounts.

    Args:
        infile: name of the TSV file.

    Returns:
        List of VariantCounts, one per line after the header, in file order.
    """
    with utils.open_file(infile) as tsv_handle:
        return [
            VariantCounts(**{name: int(value) for name, value in row.items()})
            for row in csv.DictReader(tsv_handle, delimiter="\t")
        ]
Example #8
0
def sample_names_tsv_from_vcf_file_of_filenames(infile, outfile, threads=1):
    """Makes a TSV file of sample names and VCF files.
    The input is a file of VCF file names, one name per line. The sample
    name of each VCF file is found using sample_name_from_vcf, run over
    a pool of `threads` processes. The output TSV file has the
    columns sample, vcf_file, and one line per input VCF file"""
    with utils.open_file(infile) as names_handle:
        vcf_files = [line.rstrip() for line in names_handle]

    logging.debug(
        f"Getting sample names from {len(vcf_files)} VCF files using {threads} thread(s)"
    )
    with multiprocessing.Pool(processes=threads) as pool:
        sample_names = pool.map(sample_name_from_vcf, vcf_files)

    # Sanity check that one name came back for each VCF file
    assert len(vcf_files) == len(sample_names)
    logging.debug(f"Writing sample/vcf TSV file {outfile}")
    with utils.open_file(outfile, "w") as tsv_handle:
        print("sample", "vcf_file", sep="\t", file=tsv_handle)
        for name_and_file in zip(sample_names, vcf_files):
            print(*name_and_file, sep="\t", file=tsv_handle)
Example #9
0
def _bed_mask_file_to_dict(bed_file):
    """Load the regions of a BED file into a dict of sorted intervals.

    Only the first three tab-delimited columns (chromosome, start, end) of
    each line are used, so BED files with extra columns can be loaded.
    Blank lines, and lines starting with "#", "track " or "browser " are
    skipped.

    Args:
        bed_file: name of the BED file.

    Returns:
        Dict mapping chromosome name to a sorted list of (start, end - 1)
        tuples of ints, where start and end are columns two and three of
        the BED file. In other words the end is stored as an inclusive
        coordinate.

    Raises:
        ValueError: if a line has fewer than three columns, or its start or
            end is not an integer.
    """
    mask = {}
    with utils.open_file(bed_file) as f:
        for line in f:
            line = line.rstrip()
            # The trailing space in "track " and "browser " means that a
            # tab-delimited data line for a chromosome whose name starts
            # with one of those words is not mistaken for a header line.
            if not line or line.startswith(("#", "track ", "browser ")):
                continue

            chrom, start, end = line.split("\t")[:3]
            mask.setdefault(chrom, []).append((int(start), int(end) - 1))

    for intervals in mask.values():
        intervals.sort()

    return mask
Example #10
0
def write_distance_matrix_file(sample_names, distance_matrix, outfile):
    """Write a square distance matrix to a tab-delimited file.

    The first line of the file is the number of samples. Then there is one
    line per sample, which has the sample name followed by its distance to
    every sample, in the order of sample_names. The distance from a sample
    to itself is written as 0.

    Args:
        sample_names: list of sample names.
        distance_matrix: dict of distances with keys (i, j), where i < j
            are indexes into sample_names.
        outfile: name of the file to write.
    """
    with utils.open_file(outfile, "w") as matrix_handle:
        print(len(sample_names), file=matrix_handle)

        for row, name in enumerate(sample_names):
            distances = [
                0 if row == col else distance_matrix[(min(row, col), max(row, col))]
                for col, _ in enumerate(sample_names)
            ]
            print(name, *distances, sep="\t", file=matrix_handle)
Example #11
0
def sample_name_from_vcf(infile):
    """Returns the sample name found in a VCF file.
    The name is taken to be the final tab-delimited column of the first
    line that starts with #CHROM, which assumes the VCF file only
    contains one sample. Raises RuntimeError if there is no such line"""
    logging.debug(f"Getting sample name from VCF file {infile}")
    with utils.open_file(infile) as vcf_handle:
        for line in vcf_handle:
            if not line.startswith("#CHROM"):
                continue

            columns = line.rstrip().split("\t")
            name = columns[-1]
            logging.debug(
                f"Found sample name '{name}' from VCF file {infile}")
            return name

    raise RuntimeError(f"#CHROM line not found in file {infile}")
Example #12
0
    def write_template_constraints_json(self, outfile):
        """Write a template JSON file of constraints, one per phenotype.

        Every phenotype in self.pheno_types gets a constraint that has
        "must_be_same" set to True. A phenotype of type bool gets the method
        "equal" with no parameters. A phenotype of type float gets the
        method "range" with parameters low=0 and high=1.

        Args:
            outfile: name of the JSON file to write.

        Raises:
            TypeError: if the type of a phenotype is not bool or float.
                Nothing is written to outfile in this case.
        """
        constraints = {}
        for pheno, pheno_type in self.pheno_types.items():
            if pheno_type == bool:
                method = "equal"
                params = {}
            elif pheno_type == float:
                method = "range"
                params = {"low": 0, "high": 1}
            else:
                # Say which phenotype is the problem, so the user can find
                # and fix the offending column of their data.
                raise TypeError(
                    f"Cannot make template constraint for phenotype '{pheno}'. Its type is {pheno_type!r}, but must be bool or float"
                )

            constraints[pheno] = {
                "must_be_same": True,
                "method": method,
                "params": params,
            }

        with utils.open_file(outfile, "w") as f:
            json.dump(constraints, f, sort_keys=True, indent=2)
Example #13
0
def _load_sample_distances_file_of_filenames(infile):
    """Loads a TSV file that lists the distance file of each sample.
       The file must have a header line that includes the columns
       "sample" and "distance_file". Any other columns are ignored.
       Returns a tuple (sample_names, distance_files) of two lists, which
       are in the same order as the lines of the file.
       Raises RuntimeError if a required column is missing, which includes
       the case where the file is empty"""
    sample_names = []
    distance_files = []
    expect_cols = {"sample", "distance_file"}
    with utils.open_file(infile) as f:
        reader = csv.DictReader(f, delimiter="\t")
        # fieldnames is None when the file has no header line at all. Use an
        # empty list in that case, so that an empty file is reported with
        # the error below instead of a TypeError.
        fieldnames = reader.fieldnames or []
        if not expect_cols.issubset(fieldnames):
            # Sort the expected names so the message is always the same
            expected = ",".join(sorted(expect_cols))
            got = ",".join(fieldnames)
            raise RuntimeError(
                f"Error reading distances file of filenames {infile}. Expected column names: {expected}. Got column names: {got}"
            )
        for row in reader:
            sample_names.append(row["sample"])
            distance_files.append(row["distance_file"])

    return sample_names, distance_files
Example #14
0
def _load_one_sample_distances_file(filename):
    """Loads a distance file into memory. Returns a list of tuples,
       where each tuple is (sample_name, distance), and distance is a float.
       The file must be tab-delimited with a header line that includes the
       columns "sample" and "distance". Any other columns are ignored.
       Raises RuntimeError if a required column is missing, which includes
       the case where the file is empty"""
    expect_cols = {"sample", "distance"}

    with utils.open_file(filename) as f:
        reader = csv.DictReader(f, delimiter="\t")
        # fieldnames is None when the file has no header line at all. Use an
        # empty list in that case, so that an empty file is reported with
        # the error below instead of a TypeError.
        fieldnames = reader.fieldnames or []
        if not expect_cols.issubset(fieldnames):
            # Sort the expected names so the message is always the same
            expected = ",".join(sorted(expect_cols))
            got = ",".join(fieldnames)
            raise RuntimeError(
                f"Error reading distances file {filename}. Expected column names: {expected}. Got column names: {got}"
            )
        distances = [(row["sample"], float(row["distance"])) for row in reader]

    return distances
Example #15
0
    def write_variants_of_interest_file(self, filename, vcf_records_to_mask=None):
        """Write a TSV file of the variants of interest of this triple.

        Only variants whose index is in self.variant_indexes_of_interest
        are written. Each line has the variant id (index in self.variants
        plus one), whether the variant is in the mask (1 or 0), the CHROM,
        POS + 1, REF and comma-separated ALTS of the variant, and then the
        genotype of the case, control1 and control2 as strings.

        Args:
            filename: name of the TSV file to write.
            vcf_records_to_mask: optional dict of chromosome name ->
                container of positions. A variant is in the mask if its
                POS is in the container of its CHROM. With the default of
                None, no variant is in the mask.
        """
        columns = (
            "variant_id",
            "in_mask",
            "chrom",
            "pos",
            "ref",
            "alt",
            "case",
            "control1",
            "control2",
        )
        to_string = StrainTriple.genotype_to_string

        with utils.open_file(filename, "w") as out_handle:
            print(*columns, sep="\t", file=out_handle)

            for index, variant in enumerate(self.variants):
                if index not in self.variant_indexes_of_interest:
                    continue

                masked = (
                    vcf_records_to_mask is not None
                    and variant.CHROM in vcf_records_to_mask
                    and variant.POS in vcf_records_to_mask[variant.CHROM]
                )

                print(
                    index + 1,
                    1 if masked else 0,
                    variant.CHROM,
                    variant.POS + 1,
                    variant.REF,
                    ",".join(variant.ALTS),
                    to_string(self.variant_calls["case"][index]),
                    to_string(self.variant_calls["control1"][index]),
                    to_string(self.variant_calls["control2"][index]),
                    sep="\t",
                    file=out_handle,
                )
Example #16
0
    def _write_variants_summary_file(cls, triples, outfile, vcf_records_to_mask=None):
        """Write a TSV file summarising every variant across all triples.

        There is one line for each variant of triples[0].variants. It has
        the variant id (its index plus one), whether it is in the mask
        (1 or 0), its CHROM, POS + 1, REF and comma-separated ALTS, then
        the fraction of triples for which the variant is of interest
        (rounded to 4 decimal places), and finally one 1/0 column per
        triple saying if the variant is of interest for that triple.

        Args:
            triples: list of triples. The variants are taken from the first
                triple, and variant_indexes_of_interest is used from each.
            outfile: name of the TSV file to write.
            vcf_records_to_mask: optional dict of chromosome name ->
                container of positions. A variant is in the mask if its
                POS is in the container of its CHROM. With the default of
                None, no variant is in the mask.
        """
        header = ["variant_id", "in_mask", "chrom", "pos", "ref", "alt", "freq"]
        for triple_number in range(1, len(triples) + 1):
            header.append(f"Triple.{triple_number}")

        with utils.open_file(outfile, "w") as out_handle:
            print(*header, sep="\t", file=out_handle)

            for index, variant in enumerate(triples[0].variants):
                masked = (
                    vcf_records_to_mask is not None
                    and variant.CHROM in vcf_records_to_mask
                    and variant.POS in vcf_records_to_mask[variant.CHROM]
                )

                of_interest = []
                for triple in triples:
                    if index in triple.variant_indexes_of_interest:
                        of_interest.append(1)
                    else:
                        of_interest.append(0)

                fields = [
                    index + 1,
                    1 if masked else 0,
                    variant.CHROM,
                    variant.POS + 1,
                    variant.REF,
                    ",".join(variant.ALTS),
                    round(sum(of_interest) / len(of_interest), 4),
                ]
                print(*fields, *of_interest, sep="\t", file=out_handle)
Example #17
0
def load_variant_calls_from_vcf_file(infile, expected_variants=None):
    """Load the genotype call of every record of a VCF file.

    Each non-header line is parsed with vcf_line_to_variant_and_gt. If
    expected_variants is given, the variant of each record must equal the
    entry of expected_variants at the same position in the list. Otherwise
    the list of variants is built from the file.

    Args:
        infile: name of the VCF file.
        expected_variants: optional list of the variants the file must
            contain, in order. If None, no checking is done.

    Returns:
        Tuple (calls, expected_variants). calls is the list of genotypes,
        one per record, in file order. expected_variants is the list that
        was passed in, or if that was None then the list of variants found
        in the file.

    Raises:
        RuntimeError: if the file has no #CHROM header line, or if the
            records do not match expected_variants (too many, too few, or
            a different variant).
    """
    sample_name = None
    calls = []
    checking_variants = expected_variants is not None
    if not checking_variants:
        expected_variants = []

    with utils.open_file(infile) as f:
        for line in f:
            # The column header line of a VCF starts with a single "#".
            # The sample name is its final column.
            if line.startswith("#CHROM"):
                sample_name = line.rstrip().split("\t")[-1]
            elif not line.startswith("#"):
                gt, variant = vcf_line_to_variant_and_gt(line)
                calls.append(gt)

                if not checking_variants:
                    expected_variants.append(variant)
                    continue

                index = len(calls) - 1
                if index >= len(expected_variants):
                    raise RuntimeError(
                        f"Too many variants in VCF file {infile}. Expected {len(expected_variants)} but got at least one more than that, so stopping"
                    )
                if expected_variants[index] != variant:
                    # Report the variant that was actually compared. Using
                    # the next index would report the wrong variant, or
                    # fail with IndexError at the final variant.
                    raise RuntimeError(
                        f"Mismatch in variant calls. Expected to get {expected_variants[index]} but got {variant} in file {infile}. Cannot continue"
                    )

    if sample_name is None:
        raise RuntimeError(
            f"Did not find sample name in VCF file {infile}. Cannot continue"
        )

    if len(expected_variants) != len(calls):
        raise RuntimeError(
            f"Expected {len(expected_variants)} calls in VCF file {infile} but got {len(calls)}"
        )

    return calls, expected_variants
Example #18
0
def save_variant_count_list_to_tsv(var_list, outfile):
    """Save a list of VariantCounts to a tab-delimited file.

    The first line has the field names of VariantCounts. It is followed by
    one line for each element of var_list, with the values in field order.

    Args:
        var_list: iterable of VariantCounts (or any iterables of values).
        outfile: name of the TSV file to write.
    """
    with utils.open_file(outfile, "w") as tsv_handle:
        rows = [VariantCounts._fields]
        rows.extend(var_list)
        for row in rows:
            print("\t".join(map(str, row)), file=tsv_handle)
Example #19
0
def load_vcf_file_for_distance_calc(
    infile,
    only_use_pass=True,
    numeric_filters=None,
    het_to_hom_key="COV",
    het_to_hom_min_pc_depth=90.0,
    mask=None,
):
    """Loads VCF file, returning a numpy array of genotypes, of type uint16.
    0 means unknown genotype. >0 means the allele number (where 1=ref, 2=first alt,
    etc).

    There is one element of the array for each record of the VCF file that
    is not in the mask. The genotype is taken from the GT entry of the
    sample column (the tenth column), using the FORMAT column (the ninth
    column) for the keys. Alleles of GT may be separated by "/" or "|".
    The genotype is set to unknown if: only_use_pass is True and FILTER is
    not PASS; or a numeric filter fails; or the genotype has a "."; or
    the call is heterozygous and _convert_het_to_hom returns None for it.

    Format of numeric_filters is {"key": (bool, N)}.
    eg "GT_CONF": (True, 10) would require a minimum GT_CONF of 10 to use the
    called genotype. Otherwise the genotype is zero. If the bool is False
    then N is instead the maximum allowed value. Keys that are not in the
    record are ignored.

    het_to_hom_key and het_to_hom_min_pc_depth are passed to
    _convert_het_to_hom, which decides if a heterozygous call can be
    changed to a homozygous call.

    mask is a dict of chromosome name -> container of positions. A record
    is skipped (nothing is added to the array or to the counts) if its
    POS minus one is in the container for its chromosome.

    Returns a tuple (genotypes array, VariantCounts), where the counts are
    the number of hom, het, null and het_to_hom records.

    Raises RuntimeError if the FORMAT and sample columns cannot be parsed,
    or have no GT entry."""
    if mask is None:
        mask = {}

    if numeric_filters is None:
        numeric_filters = {}

    data = []
    var_counts = {"hom": 0, "het": 0, "null": 0, "het_to_hom": 0}

    with utils.open_file(infile) as f:
        for line in f:
            if line.startswith("#"):
                continue
            fields = line.rstrip().split("\t")

            if fields[0] in mask and int(fields[1]) - 1 in mask[fields[0]]:
                continue

            if only_use_pass and fields[6] != "PASS":
                var_counts["null"] += 1
                data.append(0)
                continue

            try:
                info = dict(zip(fields[8].split(":"), fields[9].split(":")))
                # Phased genotypes use "|" instead of "/" between alleles.
                # Treat both the same, because the phase does not matter.
                genos = set(info["GT"].replace("|", "/").split("/"))
            except (IndexError, KeyError) as err:
                # IndexError: fewer than 10 columns. KeyError: no GT entry.
                raise RuntimeError(
                    f"Error parsing final two columns of VCF file {infile} at this line:\n{line}"
                ) from err

            fail_filter = False

            for key, (is_minimum, cutoff) in numeric_filters.items():
                if key in info:
                    val = float(info[key])
                    if (is_minimum and val < cutoff) or (not is_minimum
                                                         and val > cutoff):
                        fail_filter = True
                        break

            if fail_filter or "." in genos:
                data.append(0)
                var_counts["null"] += 1
            elif len(genos) > 1:
                hom_allele = _convert_het_to_hom(genos, info, het_to_hom_key,
                                                 het_to_hom_min_pc_depth)
                if hom_allele is None:
                    var_counts["het"] += 1
                    data.append(0)
                else:
                    var_counts["het_to_hom"] += 1
                    data.append(hom_allele + 1)
            else:
                var_counts["hom"] += 1
                # Add one because zero is reserved for unknown genotype
                data.append(int(genos.pop()) + 1)

    logging.debug(f"loaded {infile}")
    var_counts = variant_counts.VariantCounts(**var_counts)
    return np.array(data, dtype=np.uint16), var_counts