Ejemplo n.º 1
0
    def populate_from_vcf(self):
        """
        Load every record yielded by ``self.vcf_reader`` into the database.

        Records are converted with ``self._prepare_variation`` and queued in
        ``self.var_buffer`` / ``self.var_impacts_buffer``; the queues are
        handed to ``database.insert_variation`` and
        ``database.insert_variation_impacts`` each time ``self.buffer_size``
        variants have accumulated, and once more after the last record.

        Non-empty extra fields are written, one JSON document per line, to
        the extra file named by ``gemini_annotate.get_extra_files``.  The
        header mapping maintained through ``self._update_extra_headers`` is
        dumped to the companion header file; if that mapping ends up empty
        the extra file is removed instead.

        When ``self.args.passonly`` is set, records whose FILTER is neither
        ``None`` nor "." are skipped and tallied in ``self.skipped``.
        Progress lines are written to stderr.
        """
        import gemini_annotate  # avoid circular dependencies

        def log(count, tail):
            # every status line is prefixed with the current process id
            sys.stderr.write("pid " + str(os.getpid()) + ": " +
                             str(count) + tail)

        self.v_id = self._get_vid()
        self.counter = 0
        self.var_buffer = []
        self.var_impacts_buffer = []
        self.skipped = 0
        pending = 0  # variants queued since the last insert
        extra_file, extraheader_file = gemini_annotate.get_extra_files(self.args.db)
        extra_headers = {}
        with open(extra_file, "w") as extra_handle:
            for var in self.vcf_reader:
                if self.args.passonly:
                    # only records without a real FILTER value are kept
                    if var.FILTER is not None and var.FILTER != ".":
                        self.skipped += 1
                        continue
                prepared = self._prepare_variation(var)
                variant, variant_impacts, extra_fields = prepared
                if extra_fields:
                    extra_handle.write("%s\n" % json.dumps(extra_fields))
                    extra_headers = self._update_extra_headers(extra_headers, extra_fields)
                # queue the core variant row plus one row per impact
                # (1 impact per gene/transcript)
                self.var_buffer.append(variant)
                self.var_impacts_buffer.extend(variant_impacts)

                pending += 1
                if pending >= self.buffer_size:
                    # queue is full: flush it and start a new batch
                    log(self.counter, " variants processed.\n")
                    database.insert_variation(self.c, self.var_buffer)
                    database.insert_variation_impacts(self.c,
                                                      self.var_impacts_buffer)
                    self.var_buffer = []
                    self.var_impacts_buffer = []
                    pending = 0
                self.v_id += 1
                self.counter += 1
        if not extra_headers:
            # nothing was collected, so the extra file is not kept
            os.remove(extra_file)
        else:
            with open(extraheader_file, "w") as out_handle:
                out_handle.write(json.dumps(extra_headers))
        # final load to the database; v_id was advanced once past the
        # last loaded variant, so step it back
        self.v_id -= 1
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        log(self.counter, " variants processed.\n")
        if self.args.passonly:
            log(self.skipped, " skipped due to having the "
                              "FILTER field set.\n")
Ejemplo n.º 2
0
    def populate_from_vcf(self):
        """
        Read ``self.vcf_reader`` to the end and store its variants.

        For each record ``self._prepare_variation`` supplies a variant row,
        its impact rows and a mapping of extra fields.  Rows are collected in
        ``self.var_buffer`` and ``self.var_impacts_buffer`` and inserted via
        the ``database`` module whenever ``self.buffer_size`` variants are
        waiting, plus one last time after the loop.  Extra fields, when
        present, are appended as a JSON line to the extra file obtained from
        ``gemini_annotate.get_extra_files``; the accumulated headers are
        written to the header file afterwards, and the extra file is deleted
        when there are none.

        ``self.args.passonly`` makes the loader skip (and count in
        ``self.skipped``) records whose FILTER is set to anything other than
        ``None`` or ".".  Status messages go to stderr.
        """
        import gemini_annotate  # avoid circular dependencies
        self.v_id = self._get_vid()
        self.counter = 0
        self.var_buffer = []
        self.var_impacts_buffer = []
        n_buffered = 0
        self.skipped = 0
        extra_file, extraheader_file = gemini_annotate.get_extra_files(self.args.db)
        extra_headers = {}
        with open(extra_file, "w") as extra_handle:
            # walk the VCF one record at a time
            for var in self.vcf_reader:
                if self.args.passonly and (var.FILTER is not None and var.FILTER != "."):
                    self.skipped += 1
                    continue

                variant, variant_impacts, extra_fields = self._prepare_variation(var)
                if extra_fields:
                    serialized = json.dumps(extra_fields)
                    extra_handle.write("%s\n" % serialized)
                    extra_headers = self._update_extra_headers(extra_headers, extra_fields)

                # core variant info, then its impacts (1 per gene/transcript)
                self.var_buffer.append(variant)
                self.var_impacts_buffer.extend(variant_impacts)
                n_buffered += 1

                if n_buffered >= self.buffer_size:
                    # buffer full - write the batch out and reset
                    progress = "".join(("pid ", str(os.getpid()), ": ",
                                        str(self.counter),
                                        " variants processed.\n"))
                    sys.stderr.write(progress)
                    database.insert_variation(self.c, self.var_buffer)
                    database.insert_variation_impacts(self.c,
                                                      self.var_impacts_buffer)
                    self.var_buffer, self.var_impacts_buffer = [], []
                    n_buffered = 0

                self.v_id += 1
                self.counter += 1

        if extra_headers:
            with open(extraheader_file, "w") as out_handle:
                out_handle.write(json.dumps(extra_headers))
        else:
            # no headers gathered: drop the extra file
            os.remove(extra_file)

        # the loop leaves v_id one past the last variant
        self.v_id -= 1
        # insert whatever is still buffered
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                  str(self.counter),
                                  " variants processed.\n")))
        if self.args.passonly:
            sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                      str(self.skipped),
                                      " skipped due to having the "
                                      "FILTER field set.\n")))
Ejemplo n.º 3
0
    def populate_from_vcf(self):
        """
        Load the records of ``self.vcf_reader`` into the database in batches.

        Records with no ALT are ignored.  The first time a record with more
        than one ALT is met while ``self.seen_multi`` is false,
        ``self._multiple_alts_message()`` is called.  With
        ``self.args.passonly`` set, records whose FILTER is neither ``None``
        nor "." are skipped and counted in ``self.skipped``.

        Every remaining record is converted by ``self._prepare_variation``;
        the variant row is extended with the values of
        ``self._extra_effect_fields`` looked up in the returned extra fields,
        and the rows are inserted through the ``database`` module whenever
        ``self.buffer_size`` variants are buffered and once more at the end.
        Progress is written to stderr.
        """

        def log(count, tail):
            # status lines carry the process id as a prefix
            sys.stderr.write("pid " + str(os.getpid()) + ": " +
                             str(count) + tail)

        self.v_id = self._get_vid()
        self.counter = 0
        self.var_buffer = []
        self.var_impacts_buffer = []
        self.skipped = 0
        # References to the raw records are held until their batch has been
        # inserted; per the original note the objects are only borrowed on
        # the Python side and must stay alive.
        held_records = []

        for var in self.vcf_reader:
            if not var.ALT or len(var.ALT) == 0:
                continue
            if len(var.ALT) > 1 and not self.seen_multi:
                self._multiple_alts_message()

            if self.args.passonly:
                if var.FILTER is not None and var.FILTER != ".":
                    self.skipped += 1
                    continue

            prepared = self._prepare_variation(var)
            variant, variant_impacts, extra_fields = prepared
            # append the extra effect values in their configured order
            for field in self._extra_effect_fields:
                variant.append(extra_fields.get(field))
            held_records.append(var)
            # core variant row, then one row per gene/transcript impact
            self.var_buffer.append(variant)
            self.var_impacts_buffer.extend(variant_impacts)

            if len(self.var_buffer) >= self.buffer_size:
                # buffer full - insert the batch and reset
                log(self.counter, " variants processed.\n")
                database.insert_variation(self.c, self.var_buffer)
                database.insert_variation_impacts(self.c,
                                                  self.var_impacts_buffer)
                held_records = []
                self.var_buffer = []
                self.var_impacts_buffer = []
            self.v_id += 1
            self.counter += 1

        # final load; v_id was advanced one past the last loaded variant
        self.v_id -= 1
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        log(self.counter, " variants processed.\n")
        if self.args.passonly:
            log(self.skipped, " skipped due to having the "
                              "FILTER field set.\n")
Ejemplo n.º 4
0
    def populate_from_vcf(self):
        """
        Iterate ``self.vcf_reader`` and insert its variants into the database.

        Skipped without counting: records that have no ALT.  Skipped and
        counted in ``self.skipped``: records with FILTER set to something
        other than ``None`` or "." while ``self.args.passonly`` is true.
        ``self._multiple_alts_message()`` is invoked for a multi-ALT record
        as long as ``self.seen_multi`` is false.

        The variant row from ``self._prepare_variation`` is extended with one
        value per name in ``self._extra_effect_fields`` (taken from the extra
        fields, ``None`` when absent).  Variant and impact rows are flushed
        to ``database.insert_variation`` /
        ``database.insert_variation_impacts`` once ``self.buffer_size``
        variants are buffered, and again after the loop.  Status lines are
        written to stderr.
        """

        self.v_id = self._get_vid()
        self.counter = 0
        self.var_buffer = []
        self.var_impacts_buffer = []
        self.skipped = 0
        # the original note says the record objects are merely borrowed in
        # python, so a reference is kept for the lifetime of each batch
        alive = []

        # process and load each variant in the VCF file
        for var in self.vcf_reader:
            if not var.ALT or len(var.ALT) == 0:
                continue
            if len(var.ALT) > 1 and not self.seen_multi:
                self._multiple_alts_message()

            if self.args.passonly and (var.FILTER is not None and var.FILTER != "."):
                self.skipped += 1
                continue

            variant, variant_impacts, extra_fields = self._prepare_variation(var)
            variant.extend(extra_fields.get(name)
                           for name in self._extra_effect_fields)
            alive.append(var)

            # buffer the variant and each of its impacts
            # (1 per gene/transcript)
            self.var_buffer.append(variant)
            self.var_impacts_buffer.extend(variant_impacts)

            if len(self.var_buffer) >= self.buffer_size:
                # time to insert the batch into the DB
                sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                          str(self.counter),
                                          " variants processed.\n")))
                database.insert_variation(self.c, self.var_buffer)
                database.insert_variation_impacts(self.c,
                                                  self.var_impacts_buffer)
                # start the next batch from scratch
                alive = []
                self.var_buffer, self.var_impacts_buffer = [], []

            self.v_id += 1
            self.counter += 1

        # v_id sits one past the last variant after the loop
        self.v_id -= 1
        # final load to the database
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                  str(self.counter),
                                  " variants processed.\n")))
        if self.args.passonly:
            sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                      str(self.skipped),
                                      " skipped due to having the "
                                      "FILTER field set.\n")))
Ejemplo n.º 5
0
    def populate_from_vcf(self):
        """
        Insert every record of ``self.vcf_reader`` into the database.

        ``self._prepare_variation`` turns a record into a variant row and a
        collection of impact rows.  Both are buffered on ``self`` and written
        with ``database.insert_variation`` /
        ``database.insert_variation_impacts`` every ``self.buffer_size``
        variants and once more when the reader is exhausted.

        If ``self.args.passonly`` is true, records whose FILTER is neither
        ``None`` nor "." are not loaded; they are counted in
        ``self.skipped``.  Progress is reported on stderr.
        """

        def log(count, tail):
            # prefix each status line with this process's id
            sys.stderr.write("pid " + str(os.getpid()) + ": " +
                             str(count) + tail)

        self.v_id = self._get_vid()
        self.counter = 0
        self.var_buffer = []
        self.var_impacts_buffer = []
        self.skipped = 0
        queued = 0  # variants buffered since the last insert

        for var in self.vcf_reader:
            if self.args.passonly:
                if var.FILTER is not None and var.FILTER != ".":
                    self.skipped += 1
                    continue

            variant, variant_impacts = self._prepare_variation(var)
            # core variant row, then one row per gene/transcript impact
            self.var_buffer.append(variant)
            self.var_impacts_buffer.extend(variant_impacts)
            queued += 1

            if queued >= self.buffer_size:
                # buffer full - insert and begin a new batch
                log(self.counter, " variants processed.\n")
                database.insert_variation(self.c, self.var_buffer)
                database.insert_variation_impacts(self.c,
                                                  self.var_impacts_buffer)
                self.var_buffer = []
                self.var_impacts_buffer = []
                queued = 0
            self.v_id += 1
            self.counter += 1

        # final load; the loop advanced v_id one past the last variant
        self.v_id -= 1
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        log(self.counter, " variants processed.\n")
        if self.args.passonly:
            log(self.skipped, " skipped due to having the "
                              "FILTER field set.\n")
Ejemplo n.º 6
0
    def populate_from_vcf(self):
        """
        Batch-load the variants read from ``self.vcf_reader``.

        Each record is passed to ``self._prepare_variation``, which yields a
        variant row and its impact rows.  The rows are kept in
        ``self.var_buffer`` and ``self.var_impacts_buffer`` until
        ``self.buffer_size`` variants are waiting, then inserted via the
        ``database`` module; leftovers are inserted after the loop.

        Records with a FILTER other than ``None`` or "." are skipped (and
        counted in ``self.skipped``) when ``self.args.passonly`` is true.
        Status lines, prefixed with the process id, are written to stderr.
        """
        self.v_id = self._get_vid()
        self.counter = 0
        self.var_buffer = []
        self.var_impacts_buffer = []
        in_batch = 0
        self.skipped = 0

        # process and load each variant in the VCF file
        for var in self.vcf_reader:
            if self.args.passonly and (var.FILTER is not None and var.FILTER != "."):
                self.skipped += 1
                continue

            variant, variant_impacts = self._prepare_variation(var)
            self.var_buffer.append(variant)
            # one impact row per gene/transcript
            self.var_impacts_buffer.extend(variant_impacts)

            in_batch += 1
            if in_batch >= self.buffer_size:
                # the batch is complete: report, insert, reset
                sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                          str(self.counter),
                                          " variants processed.\n")))
                database.insert_variation(self.c, self.var_buffer)
                database.insert_variation_impacts(self.c,
                                                  self.var_impacts_buffer)
                self.var_buffer, self.var_impacts_buffer = [], []
                in_batch = 0

            self.v_id += 1
            self.counter += 1

        # undo the extra increment made after the last variant
        self.v_id -= 1
        # final load to the database
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                  str(self.counter),
                                  " variants processed.\n")))
        if self.args.passonly:
            sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                      str(self.skipped),
                                      " skipped due to having the "
                                      "FILTER field set.\n")))
Ejemplo n.º 7
0
    def populate_from_vcf(self):
        """
        Load every record of ``self.vcf_reader`` into the database.

        Variant ids start at 1 and ``self.v_id`` is advanced after each
        record.  ``self._prepare_variation`` converts a record into a variant
        row plus its impact rows; these are buffered and written with
        ``database.insert_variation`` / ``database.insert_variation_impacts``
        every ``self.buffer_size`` variants and once more after the last
        record.  Progress is reported on stderr.

        On return ``self.v_id`` is one greater than the number of variants
        processed.
        """
        self.v_id = 1
        self.var_buffer = []
        self.var_impacts_buffer = []
        buffer_count = 0

        # process and load each variant in the VCF file
        for var in self.vcf_reader:
            (variant, variant_impacts) = self._prepare_variation(var)
            # add the core variant info to the variant buffer
            self.var_buffer.append(variant)
            # add each of the impact for this variant (1 per gene/transcript)
            self.var_impacts_buffer.extend(variant_impacts)

            buffer_count += 1
            # buffer full - time to insert into DB
            if buffer_count >= self.buffer_size:
                # v_id has not been advanced for this record yet, so it
                # equals the number of variants seen so far
                sys.stderr.write(str(self.v_id) + " variants processed.\n")
                database.insert_variation(self.c, self.var_buffer)
                database.insert_variation_impacts(self.c,
                                                  self.var_impacts_buffer)
                # reset for the next batch
                self.var_buffer = []
                self.var_impacts_buffer = []
                buffer_count = 0
            self.v_id += 1
        # final load to the database
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        # v_id now sits one past the last variant; report the real count
        sys.stderr.write(str(self.v_id - 1) + " variants processed.\n")
Ejemplo n.º 8
0
    def populate_from_vcf(self):
        """
        Read ``self.vcf_reader`` to the end and store its variants.

        ``self.v_id`` starts at 1 and is incremented after each record, so it
        finishes one past the number of variants processed.  Each record is
        converted by ``self._prepare_variation`` into a variant row and its
        impact rows, which are buffered on ``self`` and inserted through the
        ``database`` module whenever ``self.buffer_size`` variants are
        waiting, and a final time after the loop.  Progress messages are
        written to stderr.
        """
        self.v_id = 1
        self.var_buffer = []
        self.var_impacts_buffer = []
        buffer_count = 0

        # process and load each variant in the VCF file
        for var in self.vcf_reader:
            variant, variant_impacts = self._prepare_variation(var)
            # core variant info, followed by its impacts
            # (1 per gene/transcript)
            self.var_buffer.append(variant)
            self.var_impacts_buffer.extend(variant_impacts)

            buffer_count += 1
            # buffer full - time to insert into DB
            if buffer_count >= self.buffer_size:
                # before the increment below, v_id is the count so far
                sys.stderr.write(str(self.v_id) + " variants processed.\n")
                database.insert_variation(self.c, self.var_buffer)
                database.insert_variation_impacts(self.c,
                                                  self.var_impacts_buffer)
                # reset for the next batch
                self.var_buffer = []
                self.var_impacts_buffer = []
                buffer_count = 0
            self.v_id += 1
        # final load to the database
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        # after the loop v_id is count + 1, so subtract one for the report
        sys.stderr.write(str(self.v_id - 1) + " variants processed.\n")
Ejemplo n.º 9
0
    def populate_from_vcf(self):
        """
        Load all records from ``self.vcf_reader`` into the database.

        The starting variant id comes from ``self._get_vid()``.  Each record
        is converted by ``self._prepare_variation`` into a variant row and
        its impact rows; rows are buffered on ``self`` and inserted with
        ``database.insert_variation`` / ``database.insert_variation_impacts``
        every ``self.buffer_size`` variants and once more after the loop.
        ``self.counter`` tracks the number of variants handled and is
        reported on stderr.
        """

        def report():
            sys.stderr.write(str(self.counter) + " variants processed.\n")

        self.v_id = self._get_vid()
        self.counter = 0
        self.var_buffer = []
        self.var_impacts_buffer = []
        queued = 0  # variants buffered since the last insert

        for var in self.vcf_reader:
            variant, variant_impacts = self._prepare_variation(var)
            # core variant row, then one row per gene/transcript impact
            self.var_buffer.append(variant)
            self.var_impacts_buffer.extend(variant_impacts)
            queued += 1

            if queued >= self.buffer_size:
                # buffer full - insert and begin a new batch
                report()
                database.insert_variation(self.c, self.var_buffer)
                database.insert_variation_impacts(self.c,
                                                  self.var_impacts_buffer)
                self.var_buffer = []
                self.var_impacts_buffer = []
                queued = 0
            self.v_id += 1
            self.counter += 1

        # final load; the loop advanced v_id one past the last variant
        self.v_id -= 1
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        report()
Ejemplo n.º 10
0
    def populate_from_vcf(self):
        """
        Batch-insert the variants produced by ``self.vcf_reader``.

        Starting from the id returned by ``self._get_vid()``, every record is
        run through ``self._prepare_variation``; the resulting variant row
        and impact rows accumulate in ``self.var_buffer`` and
        ``self.var_impacts_buffer``.  Once ``self.buffer_size`` variants are
        waiting they are written through the ``database`` module, and the
        remainder is written after the loop.  The running total
        (``self.counter``) is printed to stderr at each insert.
        """
        self.v_id = self._get_vid()
        self.counter = 0
        self.var_buffer = []
        self.var_impacts_buffer = []
        in_batch = 0

        # process and load each variant in the VCF file
        for var in self.vcf_reader:
            prepared = self._prepare_variation(var)
            variant, variant_impacts = prepared
            self.var_buffer.append(variant)
            # one impact row per gene/transcript
            self.var_impacts_buffer.extend(variant_impacts)

            in_batch += 1
            if in_batch >= self.buffer_size:
                # the batch is complete: report, insert, reset
                progress = str(self.counter) + " variants processed.\n"
                sys.stderr.write(progress)
                database.insert_variation(self.c, self.var_buffer)
                database.insert_variation_impacts(self.c,
                                                  self.var_impacts_buffer)
                self.var_buffer, self.var_impacts_buffer = [], []
                in_batch = 0

            self.v_id += 1
            self.counter += 1

        # undo the extra increment made after the last variant
        self.v_id -= 1
        # final load to the database
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        sys.stderr.write(str(self.counter) + " variants processed.\n")
Ejemplo n.º 11
0
    def populate_from_vcf(self):
        """
        Load the records of ``self.vcf_reader`` into the database and save
        the ones carrying extra fields to a side VCF.

        ``self._multiple_alts_message()`` is called for a record with more
        than one ALT while ``self.seen_multi`` is false.  With
        ``self.args.passonly`` set, records whose FILTER is neither ``None``
        nor "." are skipped and counted in ``self.skipped``.

        For every loaded record ``self._prepare_variation`` returns the
        variant row, its impact rows and a mapping of extra fields.  When
        that mapping is non-empty it is merged into ``var.INFO`` and the
        record is written with ``self.extra_vcf_writer``.  Rows are inserted
        through the ``database`` module whenever ``self.buffer_size``
        variants are buffered and once more at the end.

        The writer's stream is always closed, including when loading raises.
        After a successful load the side VCF is deleted if no extra field
        was seen; otherwise the field names are written, one per line, to a
        file with the same name plus ".fields".
        """
        import gemini_annotate as ga
        extra_vcf_fields = set()

        self.v_id = self._get_vid()
        self.counter = 0
        self.var_buffer = []
        self.var_impacts_buffer = []
        self.skipped = 0

        # we save the vcf in this chunk for extra annotations.
        self.extra_vcf_writer = ga.get_extra_vcf(self.args.db, self.vcf_reader, tempdir=self.args.tempdir)

        try:
            # process and load each variant in the VCF file
            for var in self.vcf_reader:
                if len(var.ALT) > 1 and not self.seen_multi:
                    self._multiple_alts_message()

                if self.args.passonly and (var.FILTER is not None and var.FILTER != "."):
                    self.skipped += 1
                    continue
                (variant, variant_impacts, extra_fields) = self._prepare_variation(var)
                if extra_fields:
                    var.INFO.update(extra_fields)
                    self.extra_vcf_writer.write_record(var)
                    extra_vcf_fields.update(extra_fields.keys())
                # add the core variant info to the variant buffer
                self.var_buffer.append(variant)
                # add each of the impact for this variant
                # (1 per gene/transcript)
                self.var_impacts_buffer.extend(variant_impacts)

                # buffer full - time to insert into DB
                if len(self.var_buffer) >= self.buffer_size:
                    sys.stderr.write("pid " + str(os.getpid()) + ": " +
                                     str(self.counter) + " variants processed.\n")
                    database.insert_variation(self.c, self.var_buffer)
                    database.insert_variation_impacts(self.c,
                                                      self.var_impacts_buffer)
                    # reset for the next batch
                    self.var_buffer = []
                    self.var_impacts_buffer = []
                self.v_id += 1
                self.counter += 1
            # final load to the database; v_id was advanced one past the
            # last loaded variant
            self.v_id -= 1
            database.insert_variation(self.c, self.var_buffer)
            database.insert_variation_impacts(self.c, self.var_impacts_buffer)
            sys.stderr.write("pid " + str(os.getpid()) + ": " +
                             str(self.counter) + " variants processed.\n")
            if self.args.passonly:
                sys.stderr.write("pid " + str(os.getpid()) + ": " +
                                 str(self.skipped) + " skipped due to having the "
                                 "FILTER field set.\n")
        finally:
            # close the side VCF even if loading failed, so the handle is
            # not leaked; the error itself still propagates
            self.extra_vcf_writer.stream.close()

        if not extra_vcf_fields:
            # nothing was annotated, so the side VCF is not kept
            os.unlink(self.extra_vcf_writer.stream.name)
        else:
            with open(self.extra_vcf_writer.stream.name + ".fields", "w") as o:
                o.write("\n".join(extra_vcf_fields))
Ejemplo n.º 12
0
    def populate_from_vcf(self):
        """
        Load the records of ``self.vcf_reader`` into the database in batches.

        Before reading records, the annotation sub-field names are parsed
        from the reader's header descriptions and stored per header key:
        "ANN" (or, failing that, "EFF") when ``self.args.anno_type`` is
        "snpEff" or "all", and "CSQ" when it is "VEP" or "all".  A notice is
        printed if neither snpEff header is present.

        Records with no ALT are ignored.  ``self._multiple_alts_message()``
        is called for a multi-ALT record while ``self.seen_multi`` is false.
        With ``self.args.passonly`` set, records whose FILTER is neither
        ``None`` nor "." are skipped and counted in ``self.skipped``.

        Each remaining record goes through ``self._prepare_variation``; the
        returned extra fields are merged into the variant and into every
        impact.  Rows are inserted through the ``database`` module whenever
        ``self.buffer_size`` variants are buffered, and the remainder (if
        any) after the loop.  Progress is written to stderr.
        """

        def split_header_keys(description):
            # Take the text after the first ":" of a header description and
            # split it on "|" or "(", stripping brackets, quotes and spaces.
            payload = description.split(":", 1)[1].strip()
            return [x.strip(" [])'(\"") for x in re.split(r"\||\(", payload)]

        self.v_id = self._get_vid()
        self.counter = 0
        self.var_buffer = []
        self.var_impacts_buffer = []
        self.skipped = 0
        # need to keep the objects in memory since we just borrow it in python.
        obj_buffer = []
        reader = self.vcf_reader

        anno_keys = {}
        if self.args.anno_type in ("snpEff", "all"):
            if "ANN" in reader:
                desc = reader["ANN"]["Description"]
                parts = [x.strip("\"'") for x in
                         re.split(r"\s*\|\s*", desc.split(":", 1)[1].strip('" '))]
                anno_keys["ANN"] = parts
            elif "EFF" in reader:
                anno_keys["EFF"] = split_header_keys(reader["EFF"]["Description"])
            else:
                # a single parenthesised string prints identically whether
                # print is a statement or a function
                print("snpEff header not found")
        if self.args.anno_type in ("VEP", "all"):
            if "CSQ" in reader:
                anno_keys["CSQ"] = split_header_keys(reader["CSQ"]["Description"])

        # process and load each variant in the VCF file
        for var in self.vcf_reader:
            if not var.ALT or len(var.ALT) == 0:
                continue
            if len(var.ALT) > 1 and not self.seen_multi:
                self._multiple_alts_message()

            if self.args.passonly and (var.FILTER is not None and var.FILTER != "."):
                self.skipped += 1
                continue
            (variant, variant_impacts, extra_fields) = self._prepare_variation(var, anno_keys)
            variant.update(extra_fields)
            # copy the extra fields onto every impact as well
            for impact in variant_impacts:
                impact.update(extra_fields)
            obj_buffer.append(var)
            # add the core variant info to the variant buffer
            self.var_buffer.append(variant)
            # add each of the impact for this variant (1 per gene/transcript)
            self.var_impacts_buffer.extend(variant_impacts)

            # buffer full - time to insert into DB
            if len(self.var_buffer) >= self.buffer_size:
                database.insert_variation(self.c, self.metadata, self.var_buffer)

                sys.stderr.write("pid " + str(os.getpid()) + ": " +
                                 str(self.counter) + " variants processed.\n")
                database.insert_variation_impacts(self.c, self.metadata,
                                                  self.var_impacts_buffer)
                # reset for the next batch
                obj_buffer = []
                self.var_buffer = []
                self.var_impacts_buffer = []
            self.v_id += 1
            self.counter += 1
        # final load to the database; v_id was advanced one past the last
        # loaded variant
        self.v_id -= 1
        if self.var_buffer:
            database.insert_variation(self.c, self.metadata, self.var_buffer)
            database.insert_variation_impacts(self.c, self.metadata, self.var_impacts_buffer)
            sys.stderr.write("pid " + str(os.getpid()) + ": " +
                             str(self.counter) + " variants processed.\n")
        if self.args.passonly:
            sys.stderr.write("pid " + str(os.getpid()) + ": " +
                             str(self.skipped) + " skipped due to having the "
                             "FILTER field set.\n")