Beispiel #1
0
def create_vcf_for_annotation_all_samples(out_dir):
    """
    Write a VCF listing every hotspot variant that has no annotation yet.

    The file is created as ``<out_dir>/all_samples.vcf`` with a single
    sample column named ``ALL``; every written record carries the
    placeholder genotype ``./.``.

    :param out_dir: directory in which the VCF file is created
    :return: path of the written VCF file
    """
    out_vcf = '%s/%s.vcf' % (out_dir, 'all_samples')
    vcf_header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tALL\n"

    hotspot_vars = hotspot_mongo.get_hotspot_vars()

    with open(out_vcf, "w") as out_file:
        out_file.write(vcf_header)
        client, db = mongo.get_connection()
        try:
            for var in hotspot_vars:
                chrom, pos, ref, alt = var['CHROM'], var['POS'], var['REF'], var['ALT']

                # Only variants without an annotation are exported.
                if hotspot_mongo.has_annotation(chrom, pos, ref, alt, db):
                    continue

                # ALT is stored as a sequence of alleles; VCF wants them comma separated.
                fields = [str(chrom), str(pos), '.', str(ref), str(",".join(alt)),
                          '.', '.', '.', 'GT', './.']
                out_file.write("\t".join(fields) + "\n")
        finally:
            # Release the connection even if a lookup or write fails.
            client.close()

    return out_vcf
Beispiel #2
0
    def reconcile_hotspot_and_database(self, hotspot_file):
        """
        Reconcile every record of a hotspot VCF that is not yet a hotspot in the database.

        The project configuration is refreshed first. Each VCF record is
        checked with ``hotspot_mongo.is_hotspot``; records that are not known
        are handed to ``self.__reconcile``.

        :param hotspot_file: path of the hotspot VCF file to read
        :return: None
        """
        self.project_config = config_mongo.get_project_config()

        client, db = mongo.get_connection()
        try:
            # The context manager closes the VCF handle, which was previously leaked.
            with open(hotspot_file, 'r') as vcf_handle:
                for rec in vcf.Reader(vcf_handle):
                    # strip("chr") removes leading/trailing 'c', 'h', 'r' characters;
                    # int() then requires a purely numeric chromosome name.
                    chrom = int(rec.CHROM.strip("chr"))
                    pos = int(rec.POS)
                    ref = rec.REF
                    alt = [str(allele) for allele in rec.ALT]

                    if not hotspot_mongo.is_hotspot(chrom, pos, ref, alt, db):
                        self.__reconcile(chrom, pos, ref, alt, db)
        finally:
            # Release the connection even if parsing or reconciliation fails.
            client.close()
Beispiel #3
0
    def sample_variants_csv(self, sample, type):
        """
        Export the variants of one sample to ``<output_files_dir>/<sample>.csv``.

        Each row combines the sample's variant call with the first annotation
        entry and the ``orig_stats`` of the matching hotspot document.
        ``FREQ`` is the sum of the het, het-alt and hom counts divided by the
        number of loaded samples; ``In_Hotspot`` is "TRUE" when the hotspot's
        ``final_qc_count`` is greater than zero.

        :param sample: name of the sample to export
        :param type: variant type passed through to the variants collection lookups
        :return: path of the written CSV file, or None when the sample is
                 unknown or not loaded for the given type
        """
        if not sampleinfo_mongo.is_sample(sample) or not variants_mongo.is_sample_loaded(sample, type):
            self.__log_sample_doesnt_exist()
            return

        out_path = "%s/%s.csv" % (self.output_files_dir, sample)
        print(out_path)

        header = ['CHROM', 'POS', 'REF', 'ALT', 'GT', 'RSID', 'Gene',
                  'ExonicFunc', 'AAChange', 'FREQ', 'QC_Final', 'QC_Cov', 'QC_AF', 'In_Hotspot']

        # The context manager guarantees the CSV is flushed and closed before
        # the path is handed back to the caller.
        with open(out_path, "w") as out_file:
            csv_writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
            csv_writer.writerow(header)

            client, db = mongo.get_connection()
            try:
                total_loaded_samples = variants_mongo.count_samples()

                for var in variants_mongo.get_sample_vars(sample, type, db):
                    chrom, pos, ref, alt = var['CHROM'], var['POS'], var['REF'], var['ALT']
                    al1, al2 = genotypetools.get_genotype_alleles(ref, alt, var['GT_calc'])

                    hotspot = hotspot_mongo.get_variant(chrom, pos, ref, alt, db)
                    annot = hotspot['ANNOTATION'][0]

                    aa_change = annot['AAChange_refGene']
                    if 'p.' in aa_change:
                        # Keep only the fragment after the first 'p.', cut at the first comma.
                        aa_change = aa_change.split('p.')[1].split(",")[0]

                    zygosity = hotspot['orig_stats']['zygosity']
                    carriers = sum([zygosity['het_count'], zygosity['het_alt_count'], zygosity['hom_count']])
                    freq = carriers / float(total_loaded_samples)

                    in_hotspot = "TRUE" if hotspot['orig_stats']['qc']['final_qc_count'] > 0 else "FALSE"

                    new_variant = {
                        'CHROM': chrom, 'POS': pos, 'REF': ref, 'ALT': ",".join(alt),
                        'GT': "/".join([al1, al2]),
                        'RSID': annot['snp137NonFlagged'],
                        'Gene': annot['Gene_refGene'],
                        'ExonicFunc': annot['ExonicFunc_refGene'],
                        'AAChange': aa_change,
                        'FREQ': freq,
                        'QC_Final': var['FINAL_QC'], 'QC_Cov': var['COV_QC'], 'QC_AF': var['AF_QC'],
                        'In_Hotspot': in_hotspot,
                    }

                    # Every value is written as a string, in header order.
                    csv_writer.writerow([str(new_variant[field]) for field in header])
            finally:
                # The connection was previously never closed.
                client.close()

        return out_path
Beispiel #4
0
    def __load_sample_variants(self, sample, vcf_file):
        """
        Load every record of one sample's VCF file into the database.

        Each record is converted with ``self.get_variant_doc`` and added to
        the variants collection. Depending on ``self.variant_type`` it is also
        added to the hotspot collection as an original ('orig') or a hotspot
        ('hotspot') variant; any other type only stores the variant itself.

        :param sample: name of the sample the VCF belongs to
        :param vcf_file: path of the VCF file to load
        :return: None
        """
        self.__log_loading_new_sample(sample, vcf_file)

        # The context manager closes the VCF handle, which was previously leaked.
        with open(vcf_file, 'r') as vcf_handle:
            vcf_reader = vcf.Reader(vcf_handle)

            client, db = mongo.get_connection()
            try:
                for record in vcf_reader:
                    variant_doc = self.get_variant_doc(record, sample, vcf_file)

                    variants_mongo.add_variant(variant_doc, db)

                    if self.variant_type == 'orig':
                        hotspot_mongo.add_variant(variant_doc, db)
                    elif self.variant_type == 'hotspot':
                        hotspot_mongo.add_hotspot_variant(variant_doc, db)
            finally:
                # Release the connection even if a record fails to load.
                client.close()
Beispiel #5
0
    def add_sample_info(self, sample_info_file):
        """
        Register the samples listed in a whitespace-delimited sample info file.

        The first line supplies the column names. Every following non-blank
        line becomes one sample document, keyed by those column names, with
        the project name from ``self.project_config`` stored under "PROJECT".
        The process exits with status 1 when the file does not exist.

        :param sample_info_file: path of the sample info file
        :return: None
        """
        if not os.path.isfile(sample_info_file):
            self.__log_invalid_file(sample_info_file)
            sys.exit(1)

        client, db = mongo.get_connection()
        try:
            with open(sample_info_file, 'r') as infile:
                header = infile.readline().strip().split()

                for line in infile:
                    # Split once per line instead of once per column.
                    fields = line.strip().split()
                    if not fields:
                        # A blank line would otherwise insert a sample holding only PROJECT.
                        continue

                    # Indexing the header keeps the IndexError for rows that
                    # have more columns than the header declares.
                    new_sample = {header[i]: value for i, value in enumerate(fields)}
                    new_sample["PROJECT"] = self.project_config['project_name']
                    sampleinfo_mongo.add_new_sample(new_sample, db)
        finally:
            # Release the connection even if a row cannot be stored.
            client.close()
Beispiel #6
0
    def create_vcf_files(self, num_processors=10):
        """
        Create a VCF file for every known sample using worker processes.

        One process per sample runs ``vcftools.create_vcf_gt_orig_no_qc``.
        Processes are started in batches of ``num_processors``; each batch is
        joined before the next one starts, and the final (possibly partial)
        batch is joined before the database connection is closed.

        :param num_processors: maximum number of worker processes per batch
        :return: None
        """
        samples = sampleinfo_mongo.get_samples()

        client, db = mongo.get_connection()
        try:
            jobs = []
            for sample in samples:
                p = Process(target=vcftools.create_vcf_gt_orig_no_qc, args=(sample, self.vtools_dir, db))
                jobs.append(p)
                p.start()

                if len(jobs) >= num_processors:
                    for job in jobs:
                        job.join()
                    jobs = []

            # Wait for the last, partially filled batch; previously these
            # workers were left running when the method returned.
            for job in jobs:
                job.join()
        finally:
            client.close()
Beispiel #7
0
    def save_annotations(self, annovar_vcf):
        """
        Save the annotations found in an ANNOVAR VCF file to the database.

        Leading ``##`` lines are skipped; the next line is used as the column
        header, whether or not the file starts directly with ``#CHROM``. Every
        following non-blank line is parsed with ``self.__process_annovar_line``
        and stored through ``annotate_mongo.save_annotation``.

        :param annovar_vcf: path of the ANNOVAR VCF file
        :return: None
        """
        self.__log_saving_annotations()

        self.project_config = config_mongo.get_project_config()

        client, db = mongo.get_connection()
        try:
            with open(annovar_vcf, "r") as annov_in:
                line = annov_in.readline()

                # A "#CHROM" line does not start with "##", so this loop is a
                # no-op when the file has no meta lines. Previously such a
                # file was skipped entirely and no annotation was saved.
                while line.startswith('##'):
                    line = annov_in.readline()
                header = line.strip().strip("#").split("\t")

                for line in annov_in:
                    if not line.strip():
                        # Blank lines (e.g. at the end of the file) carry no record.
                        continue

                    chrom, pos, ref, alt, annotations = self.__process_annovar_line(line, header)
                    annotate_mongo.save_annotation(chrom, pos, ref, alt, annotations, db)
        finally:
            # Release the connection even if a line fails to parse or save.
            client.close()
Beispiel #8
0
    def save_annotations(self, annovar_vcf):
        """
        This will save the annotations of a sample in the database.

        Leading ``##`` lines are skipped; the next line is used as the column
        header. Every following line is stripped, and non-empty lines are
        parsed with ``self.__process_annovar_line`` and stored through
        ``annotate_mongo.save_annotation``.

        :param annovar_vcf: path of the ANNOVAR VCF file
        :return: None
        """
        self.__log_saving_annotations()

        self.project_config = config_mongo.get_project_config()

        client, db = mongo.get_connection()
        try:
            with open(annovar_vcf, "r") as annov_in:
                line = annov_in.readline()

                # Read past the "##" meta lines. A "#CHROM" first line does not
                # start with "##", so files with and without meta lines share
                # this single code path.
                while line.startswith('##'):
                    line = annov_in.readline()
                header = line.strip().strip("#").split("\t")

                for line in annov_in:
                    line = line.strip()
                    if line == "":
                        continue

                    chrom, pos, ref, alt, annotations = self.__process_annovar_line(line, header)
                    annotate_mongo.save_annotation(chrom, pos, ref, alt, annotations, db)
        finally:
            # Release the connection even if a line fails to parse or save.
            client.close()
Beispiel #9
0
def create_vcf_for_annotation(sample, type, out_dir):
    """
    Write a VCF with the sample's variants that have no annotation yet.

    The file is created as ``<out_dir>/<sample>.vcf``. Each record carries
    the variant's read depth as ``DP=<READ_DEPTH>`` in INFO and its
    ``GT_orig`` genotype, or ``./.`` when that genotype is None.

    :param sample: name of the sample; also used as the VCF sample column
    :param type: variant type passed through to the variants collection lookup
    :param out_dir: directory in which the VCF file is created
    :return: path of the written VCF file
    """
    sample_vars = variants_mongo.get_sample_vars(sample, type)

    out_vcf = '%s/%s.vcf' % (out_dir, sample)
    with open(out_vcf, "w") as out_file:
        vcf_header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample
        out_file.write(vcf_header)

        client, db = mongo.get_connection()
        try:
            for var in sample_vars:
                chrom, pos, ref, alt = var['CHROM'], var['POS'], var['REF'], var['ALT']

                # Only variants without an annotation are exported.
                if hotspot_mongo.has_annotation(chrom, pos, ref, alt, db):
                    continue

                gt = var['GT_orig']
                if gt is None:
                    gt = './.'

                # ALT is stored as a sequence of alleles; VCF wants them comma separated.
                variant = [str(chrom), str(pos), '.', str(ref), str(",".join(alt)),
                           '.', '.', 'DP=%s' % var['READ_DEPTH'], 'GT', gt]
                out_file.write("\t".join([str(val) for val in variant]) + "\n")
        finally:
            # Release the connection even if a lookup or write fails.
            client.close()
    return out_vcf
Beispiel #10
0
    def __parallel_process_vcf_files(self, vcf_files, num_processors):
        """
        Load a queue of VCF files into the database using worker processes.

        The variants index is dropped and the hotspot collection indexed
        before loading. One process per queue entry runs
        ``self.__load_sample_variants``; processes are started in batches of
        ``num_processors`` and every batch, including the final partial one,
        is joined before the variants index is rebuilt.

        :param vcf_files: list of ``(sample, vcf_file)`` entries; the list is
                          consumed (emptied) while the entries are processed
        :param num_processors: maximum number of worker processes per batch
        :return: None
        """
        client, db = mongo.get_connection()
        try:
            variants_mongo.drop_variants_index(db)
            hotspot_mongo.index_hotspot(db)

            jobs = []
            while vcf_files:
                entry = vcf_files.pop(0)
                sample, vcf_file = entry[0], entry[1]

                p = Process(target=self.__load_sample_variants, args=(sample, vcf_file))
                jobs.append(p)
                p.start()

                if len(jobs) >= num_processors:
                    for job in jobs:
                        job.join()
                    jobs = []

            # Wait for the last, partially filled batch. Previously the index
            # was rebuilt while these loaders could still be running.
            for job in jobs:
                job.join()

            variants_mongo.index_variants(db)
        finally:
            client.close()
Beispiel #11
0
    def __get_unsaved_hotspot_vcf_files(self):
        """
        Collect the hotspot output VCF files that still have to be loaded.

        Looks at every ``*.vcf`` file in ``<hotspot_dir>/hotspot_output``. The
        sample name is the file's base name up to its first dot. A file is
        queued when that sample is known and not yet loaded for
        ``self.variant_type``; every other file is logged as already loaded.

        :return: list of ``(sample, vcf_file)`` tuples to load
        """
        search_pattern = self.project_config['hotspot_dir'] + "/hotspot_output" + "/*.vcf"
        candidates = glob(search_pattern)

        pending = []

        client, db = mongo.get_connection()
        for path in candidates:
            name = os.path.basename(path).split(".")[0]

            # Unknown samples and already loaded samples take the same log path.
            if not sampleinfo_mongo.is_sample(name, db) or \
                    variants_mongo.is_sample_loaded(name, self.variant_type, db):
                self.__log_hotspot_sample_already_loaded(name)
                continue

            self.__log_adding_hotspot_sample_to_queue(name, path)
            pending.append((name, path))

        client.close()

        return pending
Beispiel #12
0
    def load_all(self, num_processors=10):
        """
        Load all pending VCF files for the configured variant type.

        For 'orig' variants the VCF paths come from the sample info
        collection; every path is validated first (the process exits with
        status 1 on the first missing file) and samples that are already
        loaded are skipped. For 'hotspot' variants the pending files are
        collected by ``self.__get_unsaved_hotspot_vcf_files``. The pending
        files are then loaded in parallel.

        :param num_processors: maximum number of worker processes per batch
        :return: None
        :raises ValueError: when ``self.variant_type`` is neither 'orig' nor 'hotspot'
        """
        if self.variant_type == 'orig':
            pending_vcf_files = []

            client, db = mongo.get_connection()
            try:
                vcf_files = sampleinfo_mongo.get_vcf_files()

                # Check that all VCF files are valid before starting.
                for sample in vcf_files:
                    vcf_file = vcf_files[sample]
                    if not os.path.isfile(vcf_file):
                        self.__log_invalid_vcf_file(vcf_file)
                        sys.exit(1)

                for sample in vcf_files:
                    print(sample)
                    vcf_file = vcf_files[sample]

                    if variants_mongo.is_sample_loaded(sample, self.variant_type, db):
                        self.__log_sample_already_loaded(sample)
                    else:
                        self.__log_adding_sample_to_queue(sample, vcf_file)
                        pending_vcf_files.append((sample, vcf_file))
            finally:
                # Also runs on the sys.exit path above.
                client.close()

        elif self.variant_type == 'hotspot':
            pending_vcf_files = self.__get_unsaved_hotspot_vcf_files()

        else:
            # Previously this fell through to an unbound local variable.
            raise ValueError("Unsupported variant type: %r" % (self.variant_type,))

        self.__parallel_process_vcf_files(pending_vcf_files, num_processors)

        self.__log_successfully_loaded()