Code example #1
File: annotate.py  Project: vembrane/vembrane
def execute(args):
    with open(args.config, "r") as stream:
        try:
            config = yaml.safe_load(stream)
        except yaml.YAMLError as e:
            print(e, file=stderr)
            exit(1)

    # load annotation data
    ann_data = np.genfromtxt(
        config["annotation"]["file"],
        delimiter=config["annotation"].get("delimiter", "\t"),
        names=True,
        dtype=None,
        encoding=None,
    )
    # ann_data = pd.read_csv(config["annotation"]["file"], sep="\t", header=0)
    # ann_data = dict(tuple(ann_data.groupby("chrom")))

    # build expression
    expression = ",".join(
        f'{value["expression"]}'
        for value in map(lambda x: x["value"], config["annotation"]["values"])
    )
    expression = f"({expression})"

    with VariantFile(args.vcf) as vcf:
        # add new info
        for value in config["annotation"]["values"]:
            value = value["value"]
            vcf.header.add_meta(
                "INFO",
                items=[
                    ("ID", value["vcf_name"]),
                    ("Number", value["number"]),
                    ("Type", value["type"]),
                    ("Description", value["description"]),
                ],
            )

        fmt = {"vcf": "", "bcf": "b", "uncompressed-bcf": "u"}[args.output_fmt]
        with VariantFile(
            args.output,
            f"w{fmt}",
            header=vcf.header,
        ) as out:
            variants = annotate_vcf(
                vcf,
                expression,
                args.annotation_key,
                ann_data=ann_data,
                config=config,
            )
            for v in variants:
                out.write(v)
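
The loop above only reads a handful of nested keys from the YAML config. A config equivalent to the following dict would satisfy those accesses; the field names are inferred purely from the snippet, so treat this as an illustrative assumption rather than vembrane's documented schema.

# Hypothetical config structure matching the keys accessed above:
# annotation.file, annotation.delimiter, annotation.values[*].value.{...}
config = {
    "annotation": {
        "file": "annotation.tsv",  # tabular data loaded via np.genfromtxt
        "delimiter": "\t",         # optional; the snippet defaults to a tab
        "values": [
            {
                "value": {
                    "vcf_name": "REGION_SCORE",  # ID of the new INFO field
                    "number": "1",
                    "type": "Float",
                    "description": "Score taken from the annotation table",
                    "expression": "...",  # placeholder: vembrane's expression syntax goes here
                }
            }
        ],
    }
}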
Code example #2
def filter_vcf(
    vcf: VariantFile,
    expression: str,
    ann_key: str,
    keep_unmatched: bool = False,
    preserve_order: bool = False,
    auxiliary: Dict[str, Set[str]] = {},
    overwrite_number: Dict[str, str] = {},
) -> Iterator[VariantRecord]:

    env = Environment(expression, ann_key, vcf.header, auxiliary,
                      overwrite_number)

    events = set()
    info_keys = set(vcf.header.info.keys())

    record: VariantRecord
    for idx, record in enumerate(vcf):
        record, record_has_passed = test_and_update_record(
            env, idx, record, ann_key, keep_unmatched)
        if record_has_passed:
            is_bnd = "SVTYPE" in info_keys and record.info.get("SVTYPE",
                                                               None) == "BND"
            if is_bnd:
                event = record.info.get("EVENT", None)
                events.add(event)
            elif not preserve_order:
                # if preserve_order is True, we will output everything
                # in the second pass instead
                yield record

    if len(events) > 0:
        # perform a second pass if the first pass filtered breakend (BND) events
        # since these are compound events which have to be filtered jointly
        vcf.reset()
        for idx, record in enumerate(vcf):
            is_bnd = "SVTYPE" in info_keys and record.info.get("SVTYPE",
                                                               None) == "BND"
            event = record.info.get("EVENT", None)

            if is_bnd:
                if event not in events:
                    # only BNDs with a valid associated event need to be
                    # considered, so skip the rest
                    continue
            else:
                if not preserve_order:
                    continue
            record, _ = test_and_update_record(env, idx, record, ann_key,
                                               keep_unmatched)
            yield record
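
For context, a minimal driver for this generator could look like the sketch below. The expression string and the "ANN" annotation key are illustrative only; Environment and test_and_update_record are provided by vembrane itself.

from pysam import VariantFile

# Hedged usage sketch: stream records through filter_vcf and write the
# surviving records to a new VCF that reuses the input header unchanged.
with VariantFile("input.vcf") as vcf:
    with VariantFile("filtered.vcf", "w", header=vcf.header) as out:
        for record in filter_vcf(vcf, 'INFO["DP"] > 10', "ANN"):
            out.write(record)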
Code example #3
def execute(args):
    aux = read_auxiliary(args.aux)
    with VariantFile(args.vcf) as vcf:
        header: VariantHeader = vcf.header
        header.add_meta("vembraneVersion", __version__)
        # NOTE: If .modules.filter.execute might be used as a library function
        #       in the future, we should not record sys.argv directly below.
        header.add_meta(
            "vembraneCmd",
            "vembrane " +
            " ".join("'" + arg.replace("'", '"') + '"' if " " in arg else arg
                     for arg in sys.argv[1:]),
        )

        records = filter_vcf(
            vcf,
            args.expression,
            args.annotation_key,
            keep_unmatched=args.keep_unmatched,
            preserve_order=args.preserve_order,
            auxiliary=aux,
            overwrite_number=args.overwrite_number,
        )

        try:
            first_record = list(islice(records, 1))
        except VembraneError as ve:
            print(ve, file=stderr)
            exit(1)

        records = chain(first_record, records)

        if args.statistics is not None:
            records = statistics(records, vcf, args.statistics,
                                 args.annotation_key)

        fmt = {"vcf": "", "bcf": "b", "uncompressed-bcf": "u"}[args.output_fmt]
        with VariantFile(
                args.output,
                f"w{fmt}",
                header=header,
        ) as out:
            try:
                for record in records:
                    out.write(record)

            except VembraneError as ve:
                print(ve, file=stderr)
                exit(1)
Code example #4
def remove_invalid_snv_ids(vcf_file):
    vcf = VariantFile(vcf_file)
    vcf_filename = basename(vcf_file)
    with TemporaryDirectory(dir=".") as wdir:
        temp_file = join(wdir, vcf_filename)
        with VariantFile(temp_file, 'w', header=vcf.header) as out:
            for line in vcf:
                snv_id = line.id
                if snv_id is not None and "_" in snv_id:
                    warning("Invalid SNV id found will be filtered out: %s" %
                            snv_id)
                    continue
                out.write(line)

        safe_rename(temp_file, vcf_file)
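
The names basename, join, TemporaryDirectory, warning and safe_rename are imported elsewhere in the source module. A plausible, self-contained set of imports is sketched below; safe_rename is a project helper, and os.replace is only assumed here to be its closest standard-library equivalent.

from logging import warning
from os import replace as safe_rename  # assumption: stands in for the project's safe_rename helper
from os.path import basename, join
from tempfile import TemporaryDirectory

from pysam import VariantFile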
Code example #5
    def save_samples_to_db(self):
        from vcf_uploading.types import SamplesDict
        from vcf_uploading.utils import are_samples_empty, parse_samples, save_record_to_db

        with transaction.atomic():
            self.saved = True
            self.save()

            logger.info("Trying to read VCF file with pysam")
            vcf: VariantFile = VariantFile(self.file.path)

            with transaction.atomic():
                first_iteration = True

                for i, record in enumerate(vcf.fetch()):
                    if i % 100 == 1:
                        logger.debug("{} records processed", i)

                    if first_iteration:
                        if are_samples_empty(record):
                            break
                        samples: Optional[SamplesDict] = parse_samples(
                            record, self)
                        if not samples:
                            logger.info("No new samples detected. Breaking")
                            break
                        first_iteration = False

                    save_record_to_db(record=record, samples=samples)

                logger.info("File is saved to the database")
                logger.debug("File.saved: {}", self.saved)
Code example #6
    def get_samples(self) -> List[str]:
        vcf_file_path = Path(self.file.path)

        pysam_vcf: VariantFile = VariantFile(vcf_file_path)
        record = next(pysam_vcf.fetch())
        samples: List[str] = list(record.samples.keys())

        return samples
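
If only the sample names are needed, pysam also exposes them on the header, which avoids fetching a record at all and therefore also works on a VCF without variant lines. A small sketch:

from pathlib import Path
from typing import List

from pysam import VariantFile


def get_samples_from_header(path: Path) -> List[str]:
    # the header lists the sample columns declared on the #CHROM line
    with VariantFile(str(path)) as vcf:
        return list(vcf.header.samples)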
Code example #7
    def calculate_statistics(self):
        """Calculate statistics of VCF file

        The following statistics are calculated
        1. Number of samples in file
        2. Number of REF matches
        3. Number of ALTs
        4. Number of missing genotypes

        :return samples_statistics: Dict[str, SampleStatistics]. Keys of the dictionary
          are samples' names. Values are dictionaries with keys:
          * n_refs: int — number of alleles that are identical to a reference
          * n_alts: int — number of alleles that are not identical to a reference
          * n_missing: int — number of alleles with unknown genotype
        """
        from vcf_uploading.types import SampleStatistics

        logger.info("Trying to read VCF file with pysam")
        vcf: VariantFile = VariantFile(self.file.path)

        self.n_refs = 0
        self.n_missing_genotypes = 0
        self.n_alts = 0
        self.n_samples = 0
        self.n_records = 0

        samples_statistics: Dict[str, SampleStatistics] = {}

        for i, record in enumerate(vcf.fetch()):
            for sample in record.samples:
                indices: Tuple[int] = record.samples[
                    sample].allele_indices  # e.g. (0, 1)

                n_refs = indices.count(0)
                n_missing = indices.count(None)
                n_alts = 2 - n_refs - n_missing

                self.n_refs += n_refs
                self.n_missing_genotypes += n_missing
                self.n_alts += n_alts

                if sample in samples_statistics:  # TODO: can we make it a defaultdict?
                    samples_statistics[sample]["n_refs"] += n_refs
                    samples_statistics[sample]["n_alts"] += n_alts
                    samples_statistics[sample]["n_missing"] += n_missing
                else:
                    # initialise with this record's counts so the first
                    # occurrence of each sample is not dropped from the stats
                    samples_statistics[sample] = {
                        "n_refs": n_refs,
                        "n_alts": n_alts,
                        "n_missing": n_missing,
                    }

        if "record" in locals():  # Cycle executed at least once
            self.n_samples = len(record.samples)
            self.n_records = i + 1

        self.save()

        return samples_statistics
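
Regarding the TODO above: a defaultdict does make the per-sample bookkeeping shorter, because both branches collapse into unconditional increments. A sketch of the inner update, with names mirroring the snippet and a plain dict standing in for SampleStatistics:

from collections import defaultdict
from typing import Dict

# defaultdict variant of the per-sample bookkeeping used above
samples_statistics: Dict[str, Dict[str, int]] = defaultdict(
    lambda: {"n_refs": 0, "n_alts": 0, "n_missing": 0}
)

stats = samples_statistics[sample]  # created on first access
stats["n_refs"] += n_refs
stats["n_alts"] += n_alts
stats["n_missing"] += n_missing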
Code example #8
    def find_similar_samples_in_db(self):
        from .utils import get_average_similarities
        logger.info("Trying to find similar samples in the DB for file {}",
                    self.file.name)
        similar_samples: Dict[str, Dict[str, List[float]]] = {}

        samples = self.get_samples()

        for sample in samples:
            similar_samples[sample] = defaultdict(list)

        vcf: VariantFile = VariantFile(self.file.path)
        # TODO: this can be done MUCH faster
        # We can check for a few tens of SNPs from file. If a database
        # sample has different genotypes in most of them, we can exclude it
        # from the further checks
        for i, record in enumerate(vcf):
            if i % 100 == 1:
                logger.info("{} records processed", i)
            snp = SNP.from_record(record)

            for sample_name, sample in record.samples.items():
                if snp is None:
                    for db_sample in Sample.objects.all():
                        similar_samples[sample_name][db_sample.cypher].append(
                            0)
                else:
                    if all(allele is None for allele in sample.alleles):
                        continue
                    db_samples_similarity = SNP.calculate_similarity_to_each_sample(
                        snp, sample.alleles)
                    for db_sample, similarity in db_samples_similarity.items():
                        similar_samples[sample_name][db_sample].append(
                            similarity)

        similarities: Dict[str, Dict[str, float]] = get_average_similarities(
            similar_samples)
        return similarities
Code example #9
    def predict_nationality(self) -> Dict[str, Dict[str, float]]:
        """Predict nationalities for each sample in `self.file`

        :return samples_nationalities: Dict[str, Dict[str, float]] - a dictionary,
            where the keys are the samples, and the values are the prediction of
            nationalities. In the predictions, keys are nationalities, and values
            are their probabilities
        """
        logger.info("Predicting nationality for RawVCF")

        predictions = {}

        for sample in self.get_samples():
            logger.info("Predicting nationality for sample {}", sample)
            sample_vcf: VariantFile = VariantFile(self.file.path)
            sample_vcf.subset_samples([sample])

            predictor = FastNGSAdmixPredictor(sample_vcf)
            predictions[sample] = predictor.predict()

        logger.info("Returning nationality predictions")
        logger.debug("Predictions: {}", predictions)
        return predictions