def to_VarDBCSV(self):
    """Write the database to disk as one CSV file per chromosome.

    The frame is modified in place first: the index is reset, ``POS`` is
    cast to ``int`` and the rows are sorted by ``CHROM`` and ``POS``.
    Each chromosome is then written to
    ``<variantsDBPath without its extension><chrom>.csv`` (no index column,
    floats formatted with five decimals). The directory part of
    ``variantsDBPath`` is created when it does not exist yet.
    """
    logger.info("Writing DB to CSV")

    # Normalise the frame in place before slicing it per chromosome.
    self.reset_index(inplace=True)
    self["POS"] = self["POS"].astype(int)
    self.sort_values(["CHROM", "POS"], inplace=True)

    # Everything before the last "." is the common prefix of the output
    # files; everything before the last "/" is the directory to create.
    file_prefix = variantsDBPath.rsplit(".", maxsplit=1)[0]
    target_dir = variantsDBPath.rsplit("/", maxsplit=1)[0]
    os.makedirs(target_dir, exist_ok=True)

    chromosomes = self["CHROM"].unique()
    for chromosome in chromosomes:
        rows = self[self["CHROM"] == chromosome]
        csv_path = "".join((file_prefix, str(chromosome), ".csv"))
        rows.to_csv(csv_path, index=False, float_format="%.5f")

    logger.info("DB construction complete")
def to_VarDBCSV(self):
    """Dump the database as a set of per-chromosome CSV files.

    Side effects on ``self``: the index is reset, ``POS`` becomes ``int``
    and rows are ordered by ``CHROM`` then ``POS``. One file is produced
    per distinct ``CHROM`` value, named after ``variantsDBPath`` with its
    extension replaced by ``<chrom>.csv``.
    """
    logger.info('Writing DB to CSV')

    # In-place clean-up so the written files are ordered by position.
    self.reset_index(inplace=True)
    self['POS'] = self['POS'].astype(int)
    self.sort_values(['CHROM', 'POS'], inplace=True)

    # Path pieces: prefix shared by every output file, and its directory.
    out_prefix = variantsDBPath.rsplit('.', maxsplit=1)[0]
    out_dir = variantsDBPath.rsplit('/', maxsplit=1)[0]
    os.makedirs(out_dir, exist_ok=True)

    for chrom_value in self['CHROM'].unique():
        chrom_mask = self['CHROM'] == chrom_value
        destination = ''.join([out_prefix, str(chrom_value), '.csv'])
        self[chrom_mask].to_csv(destination, index=False,
                                float_format='%.5f')

    logger.info('DB construction complete')
def to_VarDBXLS(self):
    """Write the database to disk as one Excel workbook per chromosome.

    The frame is modified in place first: the index is reset, ``POS`` is
    cast to ``int`` and rows are sorted by ``CHROM`` and ``POS``. Each
    chromosome is then written to
    ``<variantsDBPath without its extension><chrom>.xlsx`` in a sheet named
    ``VariantsDB`` (no index column, floats with five decimals), and the
    ``POS`` column (column B) gets a thousands-separated number format.

    Fixes over the previous version:

    * chromosome frames were passed to ``to_excel`` with a ``.csv`` file
      name, for which pandas has no Excel engine, so the call raised
      ``ValueError`` for any non-empty database;
    * the worksheet was created as ``'VariantSDB'`` but registered as
      ``'VariantsDB'``, and the workbook opened at ``variantsDBPath`` never
      received any data;
    * ``ExcelWriter.save()`` is gone in recent pandas; the writer is now
      closed by a ``with`` block.

    NOTE(review): ``add_format``/``set_column`` are XlsxWriter APIs, exactly
    as in the previous version, so the XlsxWriter engine is assumed to be
    the one pandas selects for ``.xlsx`` files -- confirm in deployment.
    NOTE(review): the per-chromosome file layout mirrors ``to_VarDBCSV``;
    confirm that this is what the Excel loader expects.
    """
    logger.info('Writing DB to Excel')
    self.reset_index(inplace=True)
    self['POS'] = self['POS'].astype(int)
    self.sort_values(['CHROM', 'POS'], inplace=True)
    os.makedirs(variantsDBPath.rsplit('/', maxsplit=1)[0], exist_ok=True)
    vdbpath = variantsDBPath.rsplit('.', maxsplit=1)[0]
    for chrom in self['CHROM'].unique():
        # The context manager saves and closes the workbook even on error.
        with pd.ExcelWriter(vdbpath + str(chrom) + '.xlsx') as output:
            self[self['CHROM'] == chrom].to_excel(
                output, sheet_name='VariantsDB', index=False,
                float_format='%.5f', merge_cells=False)
            # Column formats are applied after the data is written; with
            # index=False column B holds POS.
            formatpos = output.book.add_format(
                {'num_format': '###,###,###'})
            output.sheets['VariantsDB'].set_column('B:B', 15, formatpos)
    logger.info('Xlsx DB construction complete')
def to_VarDBXLS(self):
    """Write the database to disk as one Excel workbook per chromosome.

    The frame is modified in place first: the index is reset, ``POS`` is
    cast to ``int`` and rows are sorted by ``CHROM`` and ``POS``. Each
    chromosome is then written to
    ``<variantsDBPath without its extension><chrom>.xlsx`` in a sheet named
    ``VariantsDB`` (no index column, floats with five decimals), and the
    ``POS`` column (column B) gets a thousands-separated number format.

    Fixes over the previous version:

    * chromosome frames were passed to ``to_excel`` with a ``.csv`` file
      name, for which pandas has no Excel engine, so the call raised
      ``ValueError`` for any non-empty database;
    * the worksheet was created as ``"VariantSDB"`` but registered as
      ``"VariantsDB"``, and the workbook opened at ``variantsDBPath`` never
      received any data;
    * ``ExcelWriter.save()`` is gone in recent pandas; the writer is now
      closed by a ``with`` block.

    NOTE(review): ``add_format``/``set_column`` are XlsxWriter APIs, exactly
    as in the previous version, so the XlsxWriter engine is assumed to be
    the one pandas selects for ``.xlsx`` files -- confirm in deployment.
    NOTE(review): the per-chromosome file layout mirrors ``to_VarDBCSV``;
    confirm that this is what the Excel loader expects.
    """
    logger.info("Writing DB to Excel")
    self.reset_index(inplace=True)
    self["POS"] = self["POS"].astype(int)
    self.sort_values(["CHROM", "POS"], inplace=True)
    os.makedirs(variantsDBPath.rsplit("/", maxsplit=1)[0], exist_ok=True)
    vdbpath = variantsDBPath.rsplit(".", maxsplit=1)[0]
    for chrom in self["CHROM"].unique():
        # The context manager saves and closes the workbook even on error.
        with pd.ExcelWriter(vdbpath + str(chrom) + ".xlsx") as output:
            self[self["CHROM"] == chrom].to_excel(
                output,
                sheet_name="VariantsDB",
                index=False,
                float_format="%.5f",
                merge_cells=False,
            )
            # Column formats are applied after the data is written; with
            # index=False column B holds POS.
            formatpos = output.book.add_format(
                {"num_format": "###,###,###"})
            output.sheets["VariantsDB"].set_column("B:B", 15, formatpos)
    logger.info("Xlsx DB construction complete")
def buildDB(cls):
    """Create or extend the variants database from the patients' VCF files.

    The extension of ``variantsDBPath`` selects the loader
    (``VariantsDB.from_exceldb`` for ``xlsx``, ``VariantsDB.from_csvdb`` for
    ``csv``); any other extension logs an error and exits with status 1.
    The loader receives the directory part of ``variantsDBPath``. When it
    raises, a new database is built from every VCF found under
    ``patientPath``; otherwise only patients missing from the loaded
    database are added. Patients are processed in batches of
    ``cfg["GENERAL"]["cores"]`` files and the database is written to CSV
    after every batch.

    Returns:
        The resulting ``VariantsDB``, or ``None`` when no VCF file was
        found and no database could be loaded.

    Fixes over the previous version:

    * ``str.strip(".final.vcf")`` removes *characters*, not a suffix, and
      mangled sample names (``"alice.final.vcf"`` became ``"e"``); the
      suffix is now removed explicitly.
    * ``*_MODApy`` de-duplication compared by substring against whole
      paths; it now compares sample base names exactly.
    * New patients were mapped back to their files by substring search of
      the sample name in the path, which could drop or duplicate files;
      names and paths are now paired positionally.
    * A bare ``except:`` swallowed the ``SystemExit`` raised by the
      deliberate ``exit(1)`` calls and then called ``exit()`` (status 0),
      leaving the "build new DB" fallback unreachable.
    * The ``ZIGOSITY`` fallback ran after the column selection that would
      already have raised ``KeyError``; it now runs before it.
    * Joined annotation strings are sorted so output is deterministic.
    """

    def drop_suffix(text, suffix):
        """Return *text* without a trailing *suffix* (case-insensitive)."""
        if suffix and text.lower().endswith(suffix.lower()):
            return text[:-len(suffix)]
        return text

    def patientLister(db=None):
        """Return the VCF paths to parse (only new patients if *db* given)."""
        found = []
        for dirpath, _dirnames, filenames in os.walk(patientPath):
            for filename in filenames:
                if filename.lower().endswith("final.vcf"):
                    found.append(os.path.join(dirpath, filename))

        # Keep one VCF per sample, preferring "<sample>_MODApy.final.vcf"
        # over the plain "<sample>.final.vcf".
        vcfspath = []      # selected paths, in discovery order
        is_modapy_at = []  # parallel flags for vcfspath
        slot_of = {}       # sample base name -> index in vcfspath
        for path in found:
            stem = drop_suffix(os.path.basename(path), ".final.vcf")
            is_modapy = stem.endswith("_MODApy")
            sample = stem[:-len("_MODApy")] if is_modapy else stem
            slot = slot_of.get(sample)
            if slot is None:
                slot_of[sample] = len(vcfspath)
                vcfspath.append(path)
                is_modapy_at.append(is_modapy)
            elif is_modapy and not is_modapy_at[slot]:
                vcfspath[slot] = path
                is_modapy_at[slot] = True

        try:
            vcfsnames = [cyvcf2.Reader(x).samples[0] for x in vcfspath]
        except Exception:
            logger.info(
                "No Sample name in one of the vcfs files. Using File Names Instead"
            )
            vcfsnames = [
                drop_suffix(os.path.basename(x), ".final.vcf")
                for x in vcfspath
            ]

        if db is None:
            return vcfspath

        # Pair each name with its own file instead of searching the name
        # inside every path.
        missing = [(name, path) for name, path in zip(vcfsnames, vcfspath)
                   if name not in db.columns]
        if not missing:
            logger.error("No Patients to Add")
            exit(1)
        logger.info("Adding Patients: {}".format(
            [name for name, _path in missing]))
        return [path for _name, path in missing]

    def dbbuilder(patientslist, db=None):
        """Parse *patientslist* and merge the result into *db* (if any)."""
        logger.info("Parsing Patients")
        pvcfs = []
        for parsed in ParsedVCF.mp_parser(*patientslist):
            # Fill in the fallback before selecting columns; selecting a
            # missing column would raise KeyError.
            if "ZIGOSITY" not in parsed.columns:
                parsed["ZIGOSITY"] = "UNKWN"
            subset = parsed[[
                "CHROM",
                "POS",
                "REF",
                "ALT",
                "ZIGOSITY",
                "GENE_NAME",
                "HGVS.C",
                "HGVS.P",
            ]]
            # The zygosity column becomes the patient's own column.
            pvcfs.append(subset.rename(columns={"ZIGOSITY": parsed.name}))

        logger.info("Merging parsed patients toDB")
        if db is not None:
            # Drop leftovers of earlier reset_index() calls (to_VarDBCSV
            # resets the index in place between batches).
            db.drop(columns=["level_0", "index"], errors="ignore",
                    inplace=True)
            db = db.reset_index()
            pvcfs.insert(0, db)
            db.drop(columns=["level_0", "index"], inplace=True,
                    errors="ignore")

        pvcfs = [
            x.set_index([
                "CHROM", "POS", "REF", "ALT", "GENE_NAME", "HGVS.C",
                "HGVS.P"
            ]) for x in pvcfs
        ]
        # First pass: collect every annotation seen for each variant.
        tempdb1 = pd.concat(pvcfs, axis=1, join="outer")
        tempdb1 = (tempdb1.reset_index().groupby(
            ["CHROM", "POS", "REF", "ALT"]).agg({
                "GENE_NAME": " | ".join,
                "HGVS.P": " | ".join,
                "HGVS.C": " | ".join,
            }).reset_index())
        # Second pass: join the patient columns on the variant key only,
        # with the merged annotations as the first frame.
        pvcfs = [
            x.reset_index().drop(columns=["GENE_NAME", "HGVS.C", "HGVS.P"])
            for x in pvcfs
        ]
        pvcfs.insert(0, tempdb1)
        pvcfs = [
            x.set_index(["CHROM", "POS", "REF", "ALT"]) for x in pvcfs
        ]
        db = pd.concat(pvcfs, axis=1, join="outer")
        del tempdb1
        del pvcfs

        for col in ["GENE_NAME", "HGVS.C", "HGVS.P"]:
            # De-duplicate joined annotations; sorted() keeps the result
            # independent of set iteration order.
            db[col] = db[col].apply(
                lambda x: " | ".join(sorted(set(x.split(" | ")))))
        db = db.reset_index().set_index([
            "CHROM", "POS", "REF", "ALT", "GENE_NAME", "HGVS.C", "HGVS.P"
        ])
        db.drop(columns=["index", "0", "level_0"], inplace=True,
                errors="ignore")
        db.replace({".": np.nan}, inplace=True)
        db = db.pipe(VariantsDB)
        db = db.calcfreqs()
        return db

    logger.info("Checking DB File")
    extension = variantsDBPath.rsplit(".")[-1].lower()
    if extension == "xlsx":
        loader = VariantsDB.from_exceldb
    elif extension == "csv":
        loader = VariantsDB.from_csvdb
    else:
        logger.error("VariantsDBPath must be a xlsx or csv file")
        exit(1)

    # Only the load is guarded, and only against Exception, so the
    # deliberate exit(1) calls keep their exit status.
    # NOTE(review): the loaders are not visible here; if they call exit()
    # themselves on a missing DB the fallback below is never reached.
    try:
        db = loader(variantsDBPath.rsplit("/", maxsplit=1)[0])
    except Exception:
        logger.info("No DB Found, Building new Variants DB")
        logger.debug("Existing DB could not be loaded", exc_info=True)
        db = None
    patientslist = patientLister(db)

    # At least one file per batch, so a "cores" value of 0 cannot produce a
    # zero range step.
    batch_size = max(1, int(cfg["GENERAL"]["cores"]))
    for start in range(0, len(patientslist), batch_size):
        db = dbbuilder(patientslist[start:start + batch_size], db)
        # Checkpoint after every batch.
        db.to_VarDBCSV()
    return db
def buildDB(cls):
    """Create or extend the variants database from the patients' VCF files.

    The extension of ``variantsDBPath`` selects the loader
    (``VariantsDB.from_exceldb`` for ``xlsx``, ``VariantsDB.from_csvdb`` for
    ``csv``); any other extension logs an error and exits with status 1.
    The loader receives the directory part of ``variantsDBPath``. When it
    raises, a new database is built from every VCF found under
    ``patientPath``; otherwise only patients missing from the loaded
    database are added. Patients are processed in batches of
    ``cfg['GENERAL']['cores']`` files.

    Returns:
        The resulting ``VariantsDB``, or ``None`` when no VCF file was
        found and no database could be loaded.

    Fixes over the previous version:

    * ``str.strip('.final.vcf')`` removes *characters*, not a suffix, and
      mangled sample names (``'alice.final.vcf'`` became ``'e'``); the
      suffix is now removed explicitly.
    * New patients were mapped back to their files by substring search of
      the sample name in the path, which could drop or duplicate files;
      names and paths are now paired positionally.
    * A bare ``except:`` swallowed the ``SystemExit`` raised by the
      deliberate ``exit(1)`` calls and then called ``exit()`` (status 0),
      leaving the "build new DB" fallback unreachable.
    * The ``ZIGOSITY`` fallback ran after the column selection that would
      already have raised ``KeyError``; it now runs before it.
    * Joined annotation strings are sorted so output is deterministic.
    """

    def drop_suffix(text, suffix):
        """Return *text* without a trailing *suffix* (case-insensitive)."""
        if suffix and text.lower().endswith(suffix.lower()):
            return text[:-len(suffix)]
        return text

    def patientLister(db=None):
        """Return the VCF paths to parse (only new patients if *db* given)."""
        vcfspath = []
        for dirpath, _dirnames, filenames in os.walk(patientPath):
            for filename in filenames:
                if filename.lower().endswith('final.vcf'):
                    vcfspath.append(os.path.join(dirpath, filename))

        try:
            vcfsnames = [cyvcf2.Reader(x).samples[0] for x in vcfspath]
        except Exception:
            logger.info(
                'No Sample name in one of the vcfs files. Using File Names Instead'
            )
            vcfsnames = [
                drop_suffix(os.path.basename(x), '.final.vcf')
                for x in vcfspath
            ]

        if db is None:
            return vcfspath

        # Pair each name with its own file instead of searching the name
        # inside every path.
        missing = [(name, path) for name, path in zip(vcfsnames, vcfspath)
                   if name not in db.columns]
        if not missing:
            logger.error('No Patients to Add')
            exit(1)
        logger.info('Adding Patients: {}'.format(
            [name for name, _path in missing]))
        return [path for _name, path in missing]

    def dbbuilder(patientslist, db=None):
        """Parse *patientslist* and merge the result into *db* (if any)."""
        logger.info('Parsing Patients')
        pvcfs = []
        for parsed in ParsedVCF.mp_parser(*patientslist):
            # Fill in the fallback before selecting columns; selecting a
            # missing column would raise KeyError.
            if 'ZIGOSITY' not in parsed.columns:
                parsed['ZIGOSITY'] = 'UNKWN'
            subset = parsed[[
                'CHROM', 'POS', 'REF', 'ALT', 'ZIGOSITY', 'GENE_NAME',
                'HGVS.C', 'HGVS.P'
            ]]
            # The zygosity column becomes the patient's own column.
            pvcfs.append(subset.rename(columns={'ZIGOSITY': parsed.name}))

        logger.info('Merging parsed patients toDB')
        if db is not None:
            db = db.reset_index()
            pvcfs.insert(0, db)

        pvcfs = [
            x.set_index([
                'CHROM', 'POS', 'REF', 'ALT', 'GENE_NAME', 'HGVS.C',
                'HGVS.P'
            ]) for x in pvcfs
        ]
        # First pass: collect every annotation seen for each variant.
        tempdb1 = pd.concat(pvcfs, axis=1, join='outer')
        tempdb2 = tempdb1.reset_index().groupby(
            ['CHROM', 'POS', 'REF', 'ALT']).agg({
                'GENE_NAME': ' | '.join,
                'HGVS.P': ' | '.join,
                'HGVS.C': ' | '.join
            }).reset_index()
        # Second pass: join the patient columns on the variant key only,
        # with the merged annotations as the first frame.
        pvcfs2 = [
            x.reset_index().drop(columns=['GENE_NAME', 'HGVS.C', 'HGVS.P'])
            for x in pvcfs
        ]
        pvcfs2.insert(0, tempdb2)
        pvcfs2 = [
            x.set_index(['CHROM', 'POS', 'REF', 'ALT']) for x in pvcfs2
        ]
        db = pd.concat(pvcfs2, axis=1, join='outer')

        for col in ['GENE_NAME', 'HGVS.C', 'HGVS.P']:
            # De-duplicate joined annotations; sorted() keeps the result
            # independent of set iteration order.
            db[col] = db[col].apply(
                lambda x: ' | '.join(sorted(set(x.split(' | ')))))
        db = db.reset_index().set_index([
            'CHROM', 'POS', 'REF', 'ALT', 'GENE_NAME', 'HGVS.C', 'HGVS.P'
        ])
        db.replace({'.': np.nan}, inplace=True)
        db = db.pipe(VariantsDB)
        db = db.calcfreqs()
        return db

    logger.info('Checking DB File')
    extension = variantsDBPath.rsplit('.')[-1].lower()
    if extension == 'xlsx':
        loader = VariantsDB.from_exceldb
    elif extension == 'csv':
        loader = VariantsDB.from_csvdb
    else:
        logger.error('VariantsDBPath must be a xlsx or csv file')
        exit(1)

    # Only the load is guarded, and only against Exception, so the
    # deliberate exit(1) calls keep their exit status.
    # NOTE(review): the loaders are not visible here; if they call exit()
    # themselves on a missing DB the fallback below is never reached.
    try:
        db = loader(variantsDBPath.rsplit('/', maxsplit=1)[0])
    except Exception:
        logger.info('No DB Found, Building new Variants DB')
        logger.debug('Existing DB could not be loaded', exc_info=True)
        db = None
    patientslist = patientLister(db)

    # At least one file per batch, so a 'cores' value of 0 cannot produce a
    # zero range step.
    batch_size = max(1, int(cfg['GENERAL']['cores']))
    for start in range(0, len(patientslist), batch_size):
        db = dbbuilder(patientslist[start:start + batch_size], db)
    return db