Example #1
0
 def to_VarDBCSV(self):
     """Write the variants DB to disk as one CSV file per chromosome.

     The frame is modified in place first: its index is reset, ``POS`` is
     cast to ``int`` and the rows are sorted by ``CHROM`` and ``POS``.
     Each chromosome is then written to
     ``<variantsDBPath without extension><chrom>.csv`` without the index
     and with floats formatted to five decimals.
     """
     logger.info("Writing DB to CSV")
     self.reset_index(inplace=True)
     self["POS"] = self["POS"].astype(int)
     self.sort_values(["CHROM", "POS"], inplace=True)
     # os.path copes with paths that have no directory part or no extension;
     # splitting on "/" and "." by hand would create a directory named after
     # the DB file or cut the path at a dot inside a directory name.
     vdbpath = os.path.splitext(variantsDBPath)[0]
     vdbdir = os.path.dirname(variantsDBPath)
     if vdbdir:
         # makedirs("") raises, so only create a directory when one is given.
         os.makedirs(vdbdir, exist_ok=True)
     for chrom in self["CHROM"].unique():
         self[self["CHROM"] == chrom].to_csv(
             vdbpath + str(chrom) + ".csv",
             index=False,
             float_format="%.5f",
         )
     logger.info("DB construction complete")
Example #2
0
 def to_VarDBCSV(self):
     """Dump the variants DB as a set of per-chromosome CSV files.

     Side effects on the frame itself: the index is reset, ``POS`` becomes
     an ``int`` column and rows are ordered by ``CHROM`` then ``POS``.
     Output files are named ``<variantsDBPath minus extension><chrom>.csv``;
     the index is not written and floats use the ``%.5f`` format.
     """
     logger.info('Writing DB to CSV')
     self.reset_index(inplace=True)
     self['POS'] = self['POS'].astype(int)
     self.sort_values(['CHROM', 'POS'], inplace=True)
     # Use os.path instead of manual rsplit('/') / rsplit('.'): a path with
     # no directory component must not be turned into a directory name, and
     # a dot in a parent directory must not truncate the base path.
     vdbdir, _ = os.path.split(variantsDBPath)
     vdbpath, _ = os.path.splitext(variantsDBPath)
     if vdbdir:
         # An empty directory means "current directory"; nothing to create.
         os.makedirs(vdbdir, exist_ok=True)
     for chrom in self['CHROM'].unique():
         chromrows = self[self['CHROM'] == chrom]
         chromrows.to_csv(vdbpath + str(chrom) + '.csv',
                          index=False,
                          float_format='%.5f')
     logger.info('DB construction complete')
Example #3
0
 def to_VarDBXLS(self):
     """Write the variants DB to disk as one Excel workbook per chromosome.

     The frame is modified in place first: its index is reset, ``POS`` is
     cast to ``int`` and the rows are sorted by ``CHROM`` and ``POS``.
     Each chromosome is then written to
     ``<variantsDBPath without extension><chrom>.xlsx`` in a sheet called
     ``VariantsDB``, without the index, with floats formatted to five
     decimals and with column B (``POS``) shown with thousands separators.

     NOTE(review): the previous code handed a '.csv' file name to
     ``to_excel`` (pandas has no Excel engine for that extension) and never
     wrote to the workbook it had opened. Per-chromosome ``.xlsx`` files
     mirror what ``to_VarDBCSV`` produces; confirm this is the layout the
     Excel reader expects.
     """
     logger.info('Writing DB to Excel')
     self.reset_index(inplace=True)
     self['POS'] = self['POS'].astype(int)
     self.sort_values(['CHROM', 'POS'], inplace=True)
     vdbdir = os.path.dirname(variantsDBPath)
     if vdbdir:
         # makedirs('') raises, so only create a directory when one is given.
         os.makedirs(vdbdir, exist_ok=True)
     vdbpath = os.path.splitext(variantsDBPath)[0]
     sheetname = 'VariantsDB'
     for chrom in self['CHROM'].unique():
         # add_format/set_column are xlsxwriter APIs, so request that engine
         # explicitly; the context manager saves and closes the workbook
         # (ExcelWriter.save() no longer exists in pandas 2.x).
         with pd.ExcelWriter(vdbpath + str(chrom) + '.xlsx',
                             engine='xlsxwriter') as output:
             self[self['CHROM'] == chrom].to_excel(output,
                                                   sheet_name=sheetname,
                                                   index=False,
                                                   float_format='%.5f',
                                                   merge_cells=False)
             formatpos = output.book.add_format(
                 {'num_format': '###,###,###'})
             output.sheets[sheetname].set_column('B:B', 15, formatpos)
     logger.info('Xlsx DB construction complete')
Example #4
0
 def to_VarDBXLS(self):
     """Dump the variants DB as a set of per-chromosome Excel workbooks.

     Side effects on the frame itself: the index is reset, ``POS`` becomes
     an ``int`` column and rows are ordered by ``CHROM`` then ``POS``.
     Output files are named ``<variantsDBPath minus extension><chrom>.xlsx``.
     Each holds a single ``VariantsDB`` sheet written without the index,
     with floats in ``%.5f`` format and column B (``POS``) displayed with
     thousands separators.

     NOTE(review): earlier this method passed a ".csv" file name to
     ``to_excel`` (no Excel engine exists for that extension) and left the
     opened workbook empty. Writing per-chromosome ``.xlsx`` files follows
     the layout of ``to_VarDBCSV``; confirm against the Excel reader.
     """
     logger.info("Writing DB to Excel")
     self.reset_index(inplace=True)
     self["POS"] = self["POS"].astype(int)
     self.sort_values(["CHROM", "POS"], inplace=True)
     vdbdir = os.path.dirname(variantsDBPath)
     if vdbdir:
         # An empty directory means "current directory"; nothing to create.
         os.makedirs(vdbdir, exist_ok=True)
     vdbpath = os.path.splitext(variantsDBPath)[0]
     sheetname = "VariantsDB"
     for chrom in self["CHROM"].unique():
         chromrows = self[self["CHROM"] == chrom]
         # The cell format below is an xlsxwriter feature, hence the
         # explicit engine. Leaving the with-block saves the workbook, which
         # replaces ExcelWriter.save() (removed in pandas 2.0).
         with pd.ExcelWriter(
             vdbpath + str(chrom) + ".xlsx", engine="xlsxwriter"
         ) as output:
             chromrows.to_excel(
                 output,
                 sheet_name=sheetname,
                 index=False,
                 float_format="%.5f",
                 merge_cells=False,
             )
             formatpos = output.book.add_format({"num_format": "###,###,###"})
             output.sheets[sheetname].set_column("B:B", 15, formatpos)
     logger.info("Xlsx DB construction complete")
Example #5
0
    def buildDB(cls):
        """Create the variants DB, or add new patients to an existing one.

        VCF files whose name ends in ``final.vcf`` (case-insensitive) are
        collected from ``patientPath``. If a DB can be loaded from the
        directory of ``variantsDBPath``, only patients whose sample name is
        not already a DB column are parsed; otherwise every collected VCF is
        used to build a new DB. Patients are processed in batches of
        ``cfg["GENERAL"]["cores"]`` files and the DB is written with
        ``to_VarDBCSV`` after each batch.

        Returns:
            The resulting DB, or ``None`` when a new DB had to be built but
            no VCF file was found.

        Raises:
            SystemExit: with status 1 if ``variantsDBPath`` is neither an
                xlsx nor a csv file, or if the existing DB already contains
                every patient found.
        """
        def strip_suffix(name, suffix):
            """Return ``name`` without a trailing ``suffix`` (any case)."""
            # str.strip() removes a *set of characters* from both ends, not
            # a suffix, so it mangles names such as "clara.final.vcf".
            if suffix and name.lower().endswith(suffix.lower()):
                return name[:-len(suffix)]
            return name

        def patientLister(db=None):
            """Return the VCF paths to parse (all, or those missing in db)."""
            vcfspath = []
            for dirpath, dirnames, filenames in os.walk(patientPath):
                for filename in filenames:
                    if filename.lower().endswith("final.vcf"):
                        vcfspath.append(os.path.join(dirpath, filename))
            # When both "<name>.final.vcf" and "<name>_MODApy.final.vcf"
            # exist, keep only the "_MODApy" file.
            final_list = []
            for vcfpath in vcfspath:
                splitfn = os.path.basename(vcfpath)
                if "_MODApy" in splitfn:
                    plainfn = (strip_suffix(splitfn, "_MODApy.final.vcf") +
                               ".final.vcf")
                    final_list = [x for x in final_list if plainfn not in x]
                    if not any(splitfn in kept for kept in final_list):
                        final_list.append(vcfpath)
                elif not any(
                        strip_suffix(splitfn, ".final.vcf") in kept
                        for kept in final_list):
                    final_list.append(vcfpath)
            vcfspath = final_list
            try:
                vcfsnames = [cyvcf2.Reader(x).samples[0] for x in vcfspath]
            except Exception:
                logger.info(
                    "No Sample name in one of the vcfs files. Using File Names Instead"
                )
                vcfsnames = [
                    strip_suffix(os.path.basename(x), ".final.vcf")
                    for x in vcfspath
                ]

            if db is None:
                return vcfspath

            # vcfsnames is aligned with vcfspath, so pair them up instead of
            # substring-matching names against paths (which duplicated a
            # path matched by several names and dropped a path that does
            # not contain its sample name).
            newpatients = [(path, name)
                           for path, name in zip(vcfspath, vcfsnames)
                           if name not in db.columns]
            if not newpatients:
                logger.error("No Patients to Add")
                raise SystemExit(1)
            logger.info("Adding Patients: {}".format(
                [name for _, name in newpatients]))
            return [path for path, _ in newpatients]

        def dbbuilder(patientslist, db=None):
            """Parse ``patientslist`` and merge the result into ``db``."""
            logger.info("Parsing Patients")

            def select_columns(parsed):
                # Add the default before selecting: selecting a missing
                # column raises KeyError, so a later default never ran.
                if "ZIGOSITY" not in parsed.columns:
                    parsed["ZIGOSITY"] = "UNKWN"
                return parsed[[
                    "CHROM",
                    "POS",
                    "REF",
                    "ALT",
                    "ZIGOSITY",
                    "GENE_NAME",
                    "HGVS.C",
                    "HGVS.P",
                ]]

            pvcfs = [
                select_columns(x) for x in ParsedVCF.mp_parser(*patientslist)
            ]
            # Each patient's zygosity column is renamed to the patient name.
            pvcfs = [x.rename(columns={"ZIGOSITY": x.name}) for x in pvcfs]
            logger.info("Merging parsed patients toDB")
            if db is not None:
                # Drop leftovers of earlier reset_index calls, both before
                # and after this one (which may itself add an "index"
                # column).
                db.drop(columns=["level_0", "index"],
                        errors="ignore",
                        inplace=True)
                db = db.reset_index()
                pvcfs.insert(0, db)
                db.drop(columns=["level_0", "index"],
                        inplace=True,
                        errors="ignore")
            pvcfs = [
                x.set_index([
                    "CHROM", "POS", "REF", "ALT", "GENE_NAME", "HGVS.C",
                    "HGVS.P"
                ]) for x in pvcfs
            ]
            # First pass: collect, per variant, every annotation seen in
            # any frame and join them with " | ".
            tempdb1 = pd.concat(pvcfs, axis=1, join="outer")
            tempdb1 = (tempdb1.reset_index().groupby(
                ["CHROM", "POS", "REF", "ALT"]).agg({
                    "GENE_NAME": " | ".join,
                    "HGVS.P": " | ".join,
                    "HGVS.C": " | ".join,
                }).reset_index())
            # Second pass: merge the per-patient columns on the variant key
            # only, next to the joined annotations.
            pvcfs = [
                x.reset_index().drop(columns=["GENE_NAME", "HGVS.C", "HGVS.P"])
                for x in pvcfs
            ]
            pvcfs.insert(0, tempdb1)
            pvcfs = [
                x.set_index(["CHROM", "POS", "REF", "ALT"]) for x in pvcfs
            ]
            db = pd.concat(pvcfs, axis=1, join="outer")
            del tempdb1
            del pvcfs
            colslist = ["GENE_NAME", "HGVS.C", "HGVS.P"]
            for col in colslist:
                # sorted() keeps the de-duplicated output stable between
                # runs; joining a bare set has no defined order.
                db[col] = db[col].apply(
                    lambda x: " | ".join(sorted(set(x.split(" | ")))))
            db = db.reset_index().set_index([
                "CHROM", "POS", "REF", "ALT", "GENE_NAME", "HGVS.C", "HGVS.P"
            ])
            db.drop(columns=["index", "0", "level_0"],
                    inplace=True,
                    errors="ignore")
            db.replace({".": np.nan}, inplace=True)
            db = db.pipe(VariantsDB)
            db = db.calcfreqs()
            return db

        logger.info("Checking DB File")
        dbext = variantsDBPath.rsplit(".")[-1].lower()
        if dbext not in ("xlsx", "csv"):
            logger.error("VariantsDBPath must be a xlsx or csv file")
            raise SystemExit(1)
        dbdir = variantsDBPath.rsplit("/", maxsplit=1)[0]
        try:
            if dbext == "xlsx":
                db = VariantsDB.from_exceldb(dbdir)
            else:
                db = VariantsDB.from_csvdb(dbdir)
        except Exception:
            # Only a failed load falls back to a fresh build. The previous
            # bare "except:" also caught the SystemExit raised for "No
            # Patients to Add", and a stray exit() made this fallback
            # unreachable.
            logger.debug("Could not load existing DB", exc_info=True)
            logger.info("No DB Found, Building new Variants DB")
            db = None
        patientslist = patientLister(db)
        cores = int(cfg["GENERAL"]["cores"])
        for start in range(0, len(patientslist), cores):
            db = dbbuilder(patientslist[start:start + cores], db)
            db.to_VarDBCSV()
        return db
Example #6
0
    def buildDB(cls):
        """Build the variants DB from scratch or extend an existing one.

        Every file under ``patientPath`` whose name ends in ``final.vcf``
        (case-insensitive) is a candidate. When a DB can be loaded from the
        directory of ``variantsDBPath``, only patients whose sample name is
        not yet a DB column are parsed and merged; when loading fails, all
        candidates are used to build a new DB. Parsing happens in batches of
        ``cfg['GENERAL']['cores']`` files.

        Returns:
            The resulting DB, or ``None`` when a new DB had to be built but
            no VCF file was found.

        Raises:
            SystemExit: with status 1 if ``variantsDBPath`` is neither an
                xlsx nor a csv file, or if the existing DB already contains
                every patient found.
        """
        def strip_suffix(name, suffix):
            """Return ``name`` without a trailing ``suffix`` (any case)."""
            # str.strip() treats its argument as a set of characters to
            # remove from both ends, so it cannot be used to cut a suffix.
            if suffix and name.lower().endswith(suffix.lower()):
                return name[:-len(suffix)]
            return name

        def patientLister(db=None):
            """Return the VCF paths to parse (all, or those missing in db)."""
            vcfspath = [
                os.path.join(dirpath, filename)
                for dirpath, dirnames, filenames in os.walk(patientPath)
                for filename in filenames
                if filename.lower().endswith('final.vcf')
            ]
            try:
                vcfsnames = [cyvcf2.Reader(x).samples[0] for x in vcfspath]
            except Exception:
                logger.info(
                    'No Sample name in one of the vcfs files. Using File Names Instead'
                )
                vcfsnames = [
                    strip_suffix(os.path.basename(x), '.final.vcf')
                    for x in vcfspath
                ]

            if db is None:
                return vcfspath

            # Names and paths are index-aligned; pairing them avoids the
            # duplicates and misses of substring-matching names in paths.
            newpatients = [(path, name)
                           for path, name in zip(vcfspath, vcfsnames)
                           if name not in db.columns]
            if not newpatients:
                logger.error('No Patients to Add')
                raise SystemExit(1)
            logger.info('Adding Patients: {}'.format(
                [name for _, name in newpatients]))
            return [path for path, _ in newpatients]

        def dbbuilder(patientslist, db=None):
            """Parse ``patientslist`` and merge the result into ``db``."""
            logger.info('Parsing Patients')

            def select_columns(parsed):
                # The default must be set before the selection, because
                # selecting a missing column raises KeyError.
                if 'ZIGOSITY' not in parsed.columns:
                    parsed['ZIGOSITY'] = 'UNKWN'
                return parsed[[
                    'CHROM', 'POS', 'REF', 'ALT', 'ZIGOSITY', 'GENE_NAME',
                    'HGVS.C', 'HGVS.P'
                ]]

            pvcfs = [
                select_columns(x) for x in ParsedVCF.mp_parser(*patientslist)
            ]
            # Each patient's zygosity column is renamed to the patient name.
            pvcfs = [x.rename(columns={'ZIGOSITY': x.name}) for x in pvcfs]
            logger.info('Merging parsed patients toDB')
            if db is not None:
                db = db.reset_index()
                pvcfs.insert(0, db)
            pvcfs = [
                x.set_index([
                    'CHROM', 'POS', 'REF', 'ALT', 'GENE_NAME', 'HGVS.C',
                    'HGVS.P'
                ]) for x in pvcfs
            ]
            # First pass: per variant, join every annotation seen in any
            # frame with ' | '.
            tempdb1 = pd.concat(pvcfs, axis=1, join='outer')
            tempdb2 = tempdb1.reset_index().groupby(
                ['CHROM', 'POS', 'REF', 'ALT']).agg({
                    'GENE_NAME': ' | '.join,
                    'HGVS.P': ' | '.join,
                    'HGVS.C': ' | '.join
                }).reset_index()
            # Second pass: merge the per-patient columns on the variant key
            # only, next to the joined annotations.
            pvcfs2 = [
                x.reset_index().drop(columns=['GENE_NAME', 'HGVS.C', 'HGVS.P'])
                for x in pvcfs
            ]
            pvcfs2.insert(0, tempdb2)
            pvcfs2 = [
                x.set_index(['CHROM', 'POS', 'REF', 'ALT']) for x in pvcfs2
            ]
            db = pd.concat(pvcfs2, axis=1, join='outer')
            colslist = ['GENE_NAME', 'HGVS.C', 'HGVS.P']
            for col in colslist:
                # sorted() makes the de-duplicated value reproducible;
                # joining a bare set has no defined order.
                db[col] = db[col].apply(
                    lambda x: ' | '.join(sorted(set(x.split(' | ')))))
            db = db.reset_index().set_index([
                'CHROM', 'POS', 'REF', 'ALT', 'GENE_NAME', 'HGVS.C', 'HGVS.P'
            ])
            db.replace({'.': np.nan}, inplace=True)
            db = db.pipe(VariantsDB)
            db = db.calcfreqs()
            return db

        logger.info('Checking DB File')
        dbext = variantsDBPath.rsplit('.')[-1].lower()
        if dbext not in ('xlsx', 'csv'):
            logger.error('VariantsDBPath must be a xlsx or csv file')
            raise SystemExit(1)
        dbdir = variantsDBPath.rsplit('/', maxsplit=1)[0]
        try:
            if dbext == 'xlsx':
                db = VariantsDB.from_exceldb(dbdir)
            else:
                db = VariantsDB.from_csvdb(dbdir)
        except Exception:
            # Only a failed load triggers a fresh build. A bare "except:"
            # followed by exit() made this branch unreachable and also
            # swallowed the SystemExit raised for "No Patients to Add".
            logger.debug('Could not load existing DB', exc_info=True)
            logger.info('No DB Found, Building new Variants DB')
            db = None
        patientslist = patientLister(db)
        cores = int(cfg['GENERAL']['cores'])
        for start in range(0, len(patientslist), cores):
            db = dbbuilder(patientslist[start:start + cores], db)
        return db