def test_add_second_vcf_variant_set(self):
    """Load ``test2.vcf`` and check the resulting document counts.

    After ``add_to_database()`` the test expects 3 variant sets,
    2 call sets, 42 calls and 22 variants, and expects both the first
    stored variant and the variant named ``UNION_BC_k31_var_147`` to
    belong to 3 variant sets.
    """
    # This VCF only has one Variant which is not in the first VCF
    # NOTE(review): presumably the first VCF is loaded by the fixture/setUp;
    # that code is not visible here -- confirm.
    second_vcf = VCF(
        f="tests/vcf_tests/test2.vcf",
        reference_set_id=self.reference_set.id,
        method="CORTEX",
    )
    second_vcf.add_to_database()

    variant_set_total = VariantSet.objects().count()
    assert variant_set_total == 3
    call_set_total = VariantCallSet.objects().count()
    assert call_set_total == 2
    call_total = VariantCall.objects().count()
    assert call_total == 42
    variant_total = Variant.objects().count()
    assert variant_total == 22

    first_variant = Variant.objects()[0]
    assert len(first_variant.variant_sets) == 3
    named_variant = Variant.objects.get(names="UNION_BC_k31_var_147")
    assert len(named_variant.variant_sets) == 3
Exemple #2
0
 def _remove_variant_set(self, variant_set_name):
     """Delete a variant set and the documents that depend only on it.

     The variant set is looked up by ``variant_set_name`` within
     ``self.reference_set``. It is detached from every call set that
     references it; call sets left with fewer than two variant sets are
     deleted together with their calls. Variants that reference this set
     and have exactly two variant sets are deleted, then the set's
     metadata and finally the set itself.
     """
     doomed_set = VariantSet.objects.get(
         name=variant_set_name,
         reference_set=self.reference_set,
     )
     for linked_call_set in VariantCallSet.objects(variant_sets=doomed_set):
         linked_call_set.variant_sets.remove(doomed_set)
         linked_call_set.save()
         # Call sets that still belong to two or more variant sets survive.
         if len(linked_call_set.variant_sets) >= 2:
             continue
         # Remove calls from callsets that only had this variantset
         VariantCall.objects(call_set=linked_call_set).delete()
         linked_call_set.delete()
     # Remove variants that are ONLY from this variant set
     # (size 2: presumably this set plus a shared/global one -- confirm).
     Variant.objects(variant_sets=doomed_set, variant_sets__size=2).delete()
     VariantSetMetadata.objects(variant_set=doomed_set).delete()
     doomed_set.delete()
 def test_add_add_variants_and_calls(self):
     """Load ``test.vcf`` and expect 21 variant calls and 21 variants."""
     loaded_vcf = VCF(
         f="tests/vcf_tests/test.vcf",
         reference_set_id=self.reference_set.id,
         method="CORTEX",
     )
     loaded_vcf.add_to_database()

     call_total = VariantCall.objects().count()
     assert call_total == 21
     variant_total = Variant.objects().count()
     assert variant_total == 21
Exemple #4
0
def get_context(pos, kmer):
    """Return the split forms of variants that start near ``pos``.

    Queries ``Variant`` for documents whose ``start`` is strictly between
    ``pos - kmer`` and ``pos + kmer`` and is not ``pos`` itself, then
    flattens the result of ``split()`` on each match into one list.
    """
    nearby_variants = Variant.objects(
        start__ne=pos,
        start__gt=pos - kmer,
        start__lt=pos + kmer,
    )
    return [
        piece
        for nearby in nearby_variants
        for piece in nearby.split()
    ]
 def test_add_new_vcf_variant_set(self):
     """Load ``test.vcf`` and check the variant sets that get created.

     Expects two variant sets in total, the first stored variant to
     belong to two of them, and the first variant set to be named
     ``test.vcf``.
     """
     loaded_vcf = VCF(
         f="tests/vcf_tests/test.vcf",
         reference_set_id=self.reference_set.id,
         method="CORTEX",
     )
     loaded_vcf.add_to_database()

     # We create a global variant set as well as one for the individual VCF
     variant_set_total = VariantSet.objects().count()
     assert variant_set_total == 2
     first_variant_set = VariantSet.objects()[0]
     first_variant = Variant.objects()[0]
     assert len(first_variant.variant_sets) == 2
     assert first_variant_set.name == "test.vcf"
 def test_add_second_vcf_variant_set(self):
     """Load ``test3.vcf`` and check totals and per-type variant counts.

     Expects 2 variant sets, 1 call set, 106 calls and 106 variants, of
     which 89 are SNPs, 17 indels, 8 insertions and 8 deletions, plus a
     ``ph_snps`` count of 1.
     """
     # This VCF only has one Variant which is not in the first VCF
     # NOTE(review): the comment above may be stale for test3.vcf -- confirm.
     third_vcf = VCF(
         f="tests/vcf_tests/test3.vcf",
         reference_set_id=self.reference_set.id,
         method="CORTEX",
     )
     third_vcf.add_to_database()

     # Collection-level totals.
     variant_set_total = VariantSet.objects().count()
     assert variant_set_total == 2
     call_set_total = VariantCallSet.objects().count()
     assert call_set_total == 1
     call_total = VariantCall.objects().count()
     assert call_total == 106
     variant_total = Variant.objects().count()
     assert variant_total == 106

     # Breakdown by variant type.
     snp_total = Variant.snps().count()
     assert snp_total == 89
     indel_total = Variant.indels().count()
     assert indel_total == 17
     insertion_total = Variant.insertions().count()
     assert insertion_total == 8
     deletion_total = Variant.deletions().count()
     assert deletion_total == 8
     # ``ph_snps`` is accessed as an attribute (not called), unlike the
     # helpers above.
     ph_snp_total = Variant.ph_snps.count()
     assert ph_snp_total == 1
Exemple #7
0
def run(parser, args):
    """Generate variant probes and write them to stdout as FASTA-style records.

    Mutations are collected from one of several sources, chosen by ``args``:

    * ``args.vcf`` set: ``run_make_probes_from_vcf_file(args)`` is called and
      no mutations are collected here.
    * ``args.genbank`` set: gene-level mutations are converted to DNA variant
      names via ``GeneAminoAcidChangeToDNAVariants``; they are read either
      from the tab-separated ``args.text_file`` (gene, mutation, alphabet) or
      from ``args.variants`` (``GENE_MUTATION`` strings).
    * otherwise: DNA variants are read from the tab-separated
      ``args.text_file`` (gene_name, pos, ref, alt, alphabet) or taken
      directly from ``args.variants``.

    For every mutation a probe panel is built with ``make_variant_probe``;
    each reference and alternate sequence is written to ``sys.stdout`` with a
    ``>ref-...`` / ``>alt-...`` header line. Mutations whose panel is ``None``
    are reported with a warning.

    Args:
        parser: Not used by this function.
        args: Parsed command-line namespace. Attributes read here:
            ``db_name``, ``reference_filepath``, ``vcf``, ``genbank``,
            ``text_file``, ``variants``, ``kmer``, ``no_backgrounds``.
    """
    DB = connect('mykrobe-%s' % (args.db_name))
    if DB is not None:
        try:
            # Issue a query so an unreachable server is detected up front.
            Variant.objects()
            logger.info("Connected to mykrobe-%s", args.db_name)
        except ServerSelectionTimeoutError:
            DB = None
            logger.warning(
                "Could not connect to database. Continuing without using genetic backgrounds")
    mutations = []
    reference = os.path.basename(args.reference_filepath).split('.fa')[0]
    if args.vcf:
        run_make_probes_from_vcf_file(args)
    elif args.genbank:
        aa2dna = GeneAminoAcidChangeToDNAVariants(
            args.reference_filepath,
            args.genbank)
        if args.text_file:
            with open(args.text_file, 'r') as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    if not row:
                        # Blank lines yield an empty row; skip them instead
                        # of failing on the unpack below.
                        continue
                    gene, mutation_string, alphabet = row
                    protein_coding_var = alphabet != "DNA"
                    for var_name in aa2dna.get_variant_names(
                            gene, mutation_string, protein_coding_var):
                        mutation = Mutation(reference=reference,
                                            var_name=var_name,
                                            gene=aa2dna.get_gene(gene),
                                            mut=mutation_string)
                        mutations.append(mutation)
        else:
            for variant in args.variants:
                gene, mutation = variant.split("_")
                for var_name in aa2dna.get_variant_names(gene, mutation):
                    mutations.append(
                        Mutation(reference=reference,
                                 var_name=var_name,
                                 gene=gene,
                                 mut=mutation))
    else:
        if args.text_file:
            with open(args.text_file, 'r') as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    if not row:
                        # Skip blank lines (see above).
                        continue
                    # The unpack still enforces the five-column layout even
                    # though the last column is not used here.
                    row_label, pos, ref, alt, _alphabet = row
                    if row_label == "ref":
                        var_name = "".join([ref, pos, alt])
                    else:
                        # NOTE(review): the first column is used verbatim as
                        # the variant name in this case -- confirm intended.
                        var_name = row_label
                    mutations.append(
                        Mutation(reference=reference, var_name=var_name))
        else:
            mutations.extend(Mutation(reference=reference, var_name=v)
                             for v in args.variants)
    al = AlleleGenerator(
        reference_filepath=args.reference_filepath,
        kmer=args.kmer)
    total = len(mutations)
    for enum, mut in enumerate(mutations):
        if enum % 100 == 0:
            logger.info(
                "%i of %i - %f%%", enum, total, round(100*enum/total, 2))
        variant_panel = make_variant_probe(
            al, mut.variant, args.kmer, DB=DB, no_backgrounds=args.no_backgrounds)
        if variant_panel is not None:
            # Resolve the gene name once, before either loop: previously it
            # was only bound inside the refs loop, so an empty ``refs`` list
            # left the alts loop with an unbound or stale value.
            try:
                gene_name = mut.gene.name
            except AttributeError:
                gene_name = "NA"

            for i, ref in enumerate(variant_panel.refs):
                sys.stdout.write(
                    ">ref-%s?var_name=%s&num_alts=%i&ref=%s&enum=%i&gene=%s&mut=%s\n" %
                    (mut.mut, mut.variant.var_name, len(
                        variant_panel.alts), mut.reference, i, gene_name, mut.mut))
                sys.stdout.write("%s\n" % ref)

            for i, a in enumerate(variant_panel.alts):
                sys.stdout.write(">alt-%s?var_name=%s&enum=%i&gene=%s&mut=%s\n" %
                                 (mut.mut, mut.variant.var_name, i, gene_name, mut.mut))

                sys.stdout.write("%s\n" % a)
        else:
            logger.warning(
                "All variants failed for %s_%s - %s",
                mut.gene, mut.mut, mut.variant)
Exemple #8
0
def run(parser, args):
    """Generate variant probes and write them to stdout as FASTA-style records.

    Unless ``args.no_backgrounds`` is set, a database connection named
    ``"<DB_PREFIX>-<args.db_name>"`` is opened; if the server cannot be
    reached the run continues without it.

    Mutations are collected from one of several sources, chosen by ``args``:

    * ``args.vcf`` set: ``run_make_probes_from_vcf_file(args)`` is called and
      no mutations are collected here.
    * ``args.genbank`` set: gene-level mutations are converted to DNA variant
      names via ``GeneAminoAcidChangeToDNAVariants``; they are read either
      from the tab-separated ``args.text_file`` (gene, mutation, alphabet) or
      from ``args.variants`` (``GENE_MUTATION`` strings).
    * otherwise: mutations come from ``load_dna_vars_txt_file`` (which also
      returns lineage data, dumped as JSON to ``args.lineage`` when given)
      or directly from ``args.variants``.

    For every mutation a probe panel is built with ``make_variant_probe``;
    each reference and alternate sequence is written to ``sys.stdout`` with a
    ``>ref-...`` / ``>alt-...`` header line. Mutations whose panel is ``None``
    are reported with a warning.

    Args:
        parser: Not used by this function.
        args: Parsed command-line namespace. Attributes read here:
            ``no_backgrounds``, ``db_name``, ``reference_filepath``, ``vcf``,
            ``genbank``, ``text_file``, ``variants``, ``lineage``, ``kmer``.
    """
    # There's no need to try to connect to database if we're not doing backgrounds
    if args.no_backgrounds:
        logger.info(
            "Not connecting to database, because --no-backgrounds option used")
        DB = None
    else:
        DB = connect("%s-%s" % (DB_PREFIX, args.db_name))

    if DB is not None:
        try:
            # Issue a query so an unreachable server is detected up front.
            Variant.objects()
            logger.info("Connected to %s-%s", DB_PREFIX, args.db_name)
        except ServerSelectionTimeoutError:
            DB = None
            logger.warning(
                "Could not connect to database. Continuing without using genetic backgrounds"
            )
    mutations = []
    lineages = set()
    reference = os.path.basename(args.reference_filepath).split(".fa")[0]
    if args.vcf:
        run_make_probes_from_vcf_file(args)
    elif args.genbank:
        aa2dna = GeneAminoAcidChangeToDNAVariants(args.reference_filepath,
                                                  args.genbank)
        if args.text_file:
            with open(args.text_file, "r") as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    if not row:
                        # Blank lines yield an empty row; skip them instead
                        # of failing on the unpack below.
                        continue
                    gene, mutation_string, alphabet = row
                    protein_coding_var = alphabet != "DNA"
                    for var_name in aa2dna.get_variant_names(
                            gene, mutation_string, protein_coding_var):
                        mutation = Mutation(
                            reference=reference,
                            var_name=var_name,
                            gene=aa2dna.get_gene(gene),
                            mut=mutation_string,
                            protein_coding_var=protein_coding_var,
                        )
                        mutations.append(mutation)
        else:
            for variant in args.variants:
                gene, mutation = variant.split("_")
                for var_name in aa2dna.get_variant_names(gene, mutation):
                    mutations.append(
                        Mutation(
                            reference=reference,
                            var_name=var_name,
                            gene=gene,
                            mut=mutation,
                        ))
    else:
        if args.text_file:
            mutations, lineages = load_dna_vars_txt_file(
                args.text_file, reference)
            if args.lineage:
                with open(args.lineage, "w") as f:
                    json.dump(lineages, f, sort_keys=True, indent=2)
        else:
            mutations.extend(
                Mutation(reference=reference, var_name=v)
                for v in args.variants)

    al = AlleleGenerator(reference_filepath=args.reference_filepath,
                         kmer=args.kmer)
    total = len(mutations)
    for enum, mut in enumerate(mutations):
        if enum % 100 == 0:
            logger.info(
                "%i of %i - %f%%", enum, total, round(100 * enum / total, 2))
        variant_panel = make_variant_probe(al,
                                           mut.variant,
                                           args.kmer,
                                           DB=DB,
                                           no_backgrounds=args.no_backgrounds)
        if variant_panel is not None:
            # Resolve the gene name once, before either loop: previously it
            # was only bound inside the refs loop, so an empty ``refs`` list
            # left the alts loop with an unbound or stale value.
            try:
                gene_name = mut.gene.name
            except AttributeError:
                gene_name = "NA"

            for i, ref in enumerate(variant_panel.refs):
                sys.stdout.write(
                    ">ref-%s?var_name=%s&num_alts=%i&ref=%s&enum=%i&gene=%s&mut=%s\n"
                    % (
                        mut.mutation_output_name,
                        mut.variant.var_name,
                        len(variant_panel.alts),
                        mut.reference,
                        i,
                        gene_name,
                        mut.mutation_output_name,
                    ))
                sys.stdout.write("%s\n" % ref)

            for i, a in enumerate(variant_panel.alts):
                sys.stdout.write(
                    ">alt-%s?var_name=%s&enum=%i&gene=%s&mut=%s\n" % (
                        mut.mutation_output_name,
                        mut.variant.var_name,
                        i,
                        gene_name,
                        mut.mutation_output_name,
                    ))

                sys.stdout.write("%s\n" % a)
        else:
            logger.warning("All variants failed for %s_%s - %s",
                           mut.gene, mut.mutation_output_name, mut.variant)