def readMetadata(metaFile):
    """Yield the metadata records of each plate, one plate at a time.

    Loads ``metaFile`` through ``meta.Metadata``, prints how many distinct
    ``Metadata_Plate`` values it contains, and then yields, for every plate,
    the result of ``filterRecords(..., copy=True)`` restricted to that plate.

    Args:
        metaFile: Metadata source handed unchanged to ``meta.Metadata``.

    Yields:
        Whatever ``Metadata.filterRecords`` returns for each plate.
    """
    metadata = meta.Metadata(metaFile)
    plate_ids = metadata.data["Metadata_Plate"].unique()
    print("Total plates:", len(plate_ids))
    for plate_id in plate_ids:
        yield metadata.filterRecords(
            lambda df: (df.Metadata_Plate == plate_id), copy=True
        )
def readPlates(metaFile):
    """Generate one filtered metadata selection per plate.

    The number of distinct ``Metadata_Plate`` values is reported through
    ``utils.logger`` before the first selection is produced.

    Args:
        metaFile: Metadata source handed unchanged to ``meta.Metadata``.

    Yields:
        The result of ``filterRecords(..., copy=True)`` for each plate, in the
        order the plates first appear in the ``Metadata_Plate`` column.
    """
    records = meta.Metadata(metaFile)
    unique_plates = records.data["Metadata_Plate"].unique()
    plate_count = len(unique_plates)
    utils.logger.info("Total plates: " + str(plate_count))

    for current_plate in unique_plates:

        def on_current_plate(df):
            # Row predicate: keep only the records of the plate being visited.
            return df.Metadata_Plate == current_plate

        selection = records.filterRecords(on_current_plate, copy=True)
        yield selection
def readDataset(metaFile, images_dir):
    """Load the metadata, split it by replicate, and wrap it in a dataset.

    Records whose ``Allele_Replicate`` is at most 5 are passed to
    ``splitMetadata`` as the training filter; records with a larger replicate
    number are passed as the validation filter.

    Args:
        metaFile: Metadata source handed to ``meta.Metadata`` (``dtype=None``).
        images_dir: Image root directory forwarded to ``ds.Dataset``.

    Returns:
        The ``ds.Dataset`` built on the ``"Allele"`` column with ``CHANNELS``.
    """
    # Read metadata and split data in training and validation
    metadata = meta.Metadata(metaFile, dtype=None)
    trainingFilter = lambda df: df["Allele_Replicate"] <= 5
    validationFilter = lambda df: df["Allele_Replicate"] > 5
    metadata.splitMetadata(trainingFilter, validationFilter)

    # Create a dataset; each record is keyed as "<plate>/<well>-<site>"
    keyGen = lambda r: "{}/{}-{}".format(
        r["Metadata_Plate"], r["Metadata_Well"], r["Metadata_Site"]
    )
    dataset = ds.Dataset(metadata, "Allele", CHANNELS, images_dir, keyGen)

    # Debug aid: show one sample record. Guarded because indexing position
    # 100 raises IndexError when the metadata holds 100 rows or fewer.
    if len(metadata.data) > 100:
        print(metadata.data.iloc[100])
    return dataset
def processMetadata(plate_maps, barcode_file, csv_list, root):
    """Build the per-image metadata table for a compound-treatment screen.

    Steps performed:
      1. Load the plate maps and replace each distinct treatment
         (``broad_sample`` + "@" + ``mmoles_per_liter``) and each distinct
         compound (``broad_sample``) with an integer label.
      2. Inner-join the image CSV records with the barcode table on the plate
         barcode and convert the per-channel path/file columns with
         ``relativePaths``.
      3. Left-join the plate-map labels on (plate map name, well).
      4. Number the wells of every treatment 1..k in ``Treatment_Replicate``.
      5. Write ``metadata.csv``, ``treatments.csv`` and ``compounds.csv`` to
         the current working directory.

    Args:
        plate_maps: Plate-map source for ``meta.Metadata`` ("multi"/"blanks").
        barcode_file: Barcode table source for ``meta.Metadata`` ("single").
        csv_list: Image CSV source for ``meta.Metadata`` ("multi").
        root: Root argument forwarded to ``relativePaths``.

    Returns:
        None. Results are written to CSV files; progress goes to stdout.
    """
    # Load plate maps data and create labels
    plateMaps = meta.Metadata(plate_maps, "multi", "blanks")
    maps = plateMaps.data
    maps["Treatment"] = (
        maps["broad_sample"] + "@" + maps["mmoles_per_liter"].astype(str)
    )
    maps["Compound"] = 0
    treatments = maps["Treatment"].unique()
    compounds = maps["broad_sample"].unique()

    # Replace every treatment string by its index in `treatments`.
    print("Unique treatments:", len(treatments))
    for i, treatment in enumerate(treatments):
        maps.loc[maps["Treatment"] == treatment, "Treatment"] = i
        utils.printProgress(i + 1, len(treatments), prefix="Treatments")

    # Label every row with the index of its compound in `compounds`.
    print("Unique compounds:", len(compounds))
    for i, compound in enumerate(compounds):
        maps.loc[maps["broad_sample"] == compound, "Compound"] = i
        utils.printProgress(i + 1, len(compounds), prefix="Compounds")

    # Load barcodes and csv files
    barcodes = meta.Metadata(barcode_file, "single")
    load_data = meta.Metadata(csv_list, "multi")

    # Merge two frames: csvs + barcodes to attach compound layout to each image
    columns = list(load_data.data.columns.values)
    # Only the first 13 CSV columns are kept.
    # NOTE(review): presumably the path/file/plate/well/site columns — confirm.
    metadata = pd.merge(
        load_data.data.drop(columns[13:], axis=1),
        barcodes.data,
        left_on=["Metadata_Plate"],
        right_on=["Assay_Plate_Barcode"],
        how="inner",
    )
    metadata = metadata.drop(
        ["Batch_Number", "Batch_Date", "Assay_Plate_Barcode"], axis=1
    )
    del load_data, barcodes

    # Concatenate paths and filenames and make them relative
    for channel in ("RNA", "ER", "AGP", "Mito", "DNA"):
        metadata = relativePaths(
            metadata,
            channel,
            "PathName_Orig" + channel,
            "FileName_Orig" + channel,
            root,
        )
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    metadata.info()

    # Merge two frames: metadata + plateMaps to attach treatment info to each image
    metadata = pd.merge(
        metadata,
        maps,
        left_on=["Plate_Map_Name", "Metadata_Well"],
        right_on=["plate_map_name", "well_position"],
        how="left",
    )
    metadata = metadata.drop(
        [
            "plate_map_name",
            "well_position",
            "broad_sample",
            "mg_per_ml",
            "mmoles_per_liter",
            "solvent",
        ],
        axis=1,
    )
    # Temporary key identifying one physical well. The string cast and the
    # "::" separator keep it valid for non-string plate ids and unambiguous.
    metadata["plate_well"] = (
        metadata["Metadata_Plate"].astype(str) + "::" + metadata["Metadata_Well"]
    )

    # Find replicate labels
    metadata["Treatment_Replicate"] = 0
    replicateDistribution = {}
    for i in range(len(treatments)):
        mask1 = metadata["Treatment"] == i
        wells = metadata[mask1]["plate_well"].unique()
        utils.printProgress(i + 1, len(treatments), "Replicates")
        # Wells of the same treatment are numbered 1..k in order of appearance.
        for replicate, well in enumerate(wells, start=1):
            mask2 = metadata["plate_well"] == well
            metadata.loc[mask1 & mask2, "Treatment_Replicate"] = replicate
        # Histogram: replicate count -> number of treatments with that count.
        # NOTE(review): applied once per treatment; confirm this matches the
        # intended nesting of the original counter update.
        replicateCount = len(wells)
        replicateDistribution[replicateCount] = (
            replicateDistribution.get(replicateCount, 0) + 1
        )
    metadata = metadata.drop(["plate_well"], axis=1)
    print(replicateDistribution)
    metadata.info()

    # Save resulting metadata
    metadata.to_csv("metadata.csv", index=False)
    dframe = pd.DataFrame({
        "ID": pd.Series(range(len(treatments))),
        "Treatment": pd.Series(treatments),
    })
    dframe.to_csv("treatments.csv", index=False)
    dframe = pd.DataFrame({
        "ID": pd.Series(range(len(compounds))),
        "Compound": pd.Series(compounds),
    })
    dframe.to_csv("compounds.csv", index=False)
def processMetadata(plate_maps, barcode_file, csv_list, root):
    """Build the per-image metadata table for an allele/gene experiment.

    Steps performed:
      1. Load the plate maps and replace each distinct allele
         (``NCBIGeneID`` + "@" + ``x_mutation_status``) and each distinct gene
         (``NCBIGeneID``) with an integer label.
      2. Inner-join the image CSV records with the barcode table on the plate
         barcode and convert the per-channel path/file columns with
         ``relativePaths``.
      3. Left-join the plate-map labels on (plate map name, well).
      4. Number the wells of every allele 1..k in ``Allele_Replicate``.
      5. Write ``metadata.csv``, ``alleles.csv`` and ``genes.csv`` to the
         current working directory.

    Args:
        plate_maps: Plate-map source for ``meta.Metadata`` ("multi"/"tabs").
        barcode_file: Barcode table source for ``meta.Metadata`` ("single").
        csv_list: Image CSV source for ``meta.Metadata`` ("multi").
        root: Root argument forwarded to ``relativePaths``.

    Returns:
        None. Results are written to CSV files; progress goes to stdout.
    """
    # Load plate maps data and create labels
    plateMaps = meta.Metadata(plate_maps, "multi", "tabs")
    maps = plateMaps.data
    maps["Allele"] = (
        maps["NCBIGeneID"].astype(str) + "@" + maps["x_mutation_status"]
    )
    maps["Gene"] = 0
    alleles = maps["Allele"].unique()
    genes = maps["NCBIGeneID"].unique()

    # Replace every allele string by its index in `alleles`.
    print("Unique alleles:", len(alleles))
    for i, allele in enumerate(alleles):
        maps.loc[maps["Allele"] == allele, "Allele"] = i
        utils.printProgress(i + 1, len(alleles), prefix="Alleles")

    # Label every row with the index of its gene in `genes`. The gene ids come
    # from NCBIGeneID, so rows must be matched on NCBIGeneID as well (matching
    # on broad_sample would leave the Gene column at its default of 0).
    print("Unique genes:", len(genes))
    for i, gene in enumerate(genes):
        maps.loc[maps["NCBIGeneID"] == gene, "Gene"] = i
        utils.printProgress(i + 1, len(genes), prefix="Genes")

    # Load barcodes and csv files
    barcodes = meta.Metadata(barcode_file, "single")
    load_data = meta.Metadata(csv_list, "multi")

    # Merge two frames: csvs + barcodes to attach gene layout to each image
    columns = list(load_data.data.columns.values)
    # Only the first 13 CSV columns are kept.
    # NOTE(review): presumably the path/file/plate/well/site columns — confirm.
    metadata = pd.merge(
        load_data.data.drop(columns[13:], axis=1),
        barcodes.data,
        left_on=["Metadata_Plate"],
        right_on=["Assay_Plate_Barcode"],
        how="inner",
    )
    del load_data, barcodes

    # Concatenate paths and filenames and make them relative
    for channel in ("RNA", "ER", "AGP", "Mito", "DNA"):
        metadata = relativePaths(
            metadata,
            channel,
            "PathName_Orig" + channel,
            "FileName_Orig" + channel,
            root,
        )
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    metadata.info()

    # Merge two frames: metadata + plateMaps to attach treatment info to each image
    metadata = pd.merge(
        metadata,
        maps,
        left_on=["Plate_Map_Name", "Metadata_Well"],
        right_on=["plate_map_name", "well_position"],
        how="left",
    )
    # Plate-map columns no longer needed once the labels are attached
    # (each label listed once).
    metadata = metadata.drop(
        [
            "plate_map_name",
            "well_position",
            "broad_sample",
            "NCBIGeneID",
            "pert_type",
            "PublicID",
            "Transcript",
            "VirusPlateName",
            "x_mutation_status",
            "pert_name",
        ],
        axis=1,
    )
    # Temporary key identifying one physical well.
    metadata["plate_well"] = (
        metadata["Metadata_Plate"].astype(str) + "::" + metadata["Metadata_Well"]
    )

    # Find replicate labels
    metadata["Allele_Replicate"] = 0
    replicateDistribution = {}
    for i in range(len(alleles)):
        mask1 = metadata["Allele"] == i
        wells = metadata[mask1]["plate_well"].unique()
        utils.printProgress(i + 1, len(alleles), "Replicates")
        # Wells of the same allele are numbered 1..k in order of appearance.
        for replicate, well in enumerate(wells, start=1):
            mask2 = metadata["plate_well"] == well
            metadata.loc[mask1 & mask2, "Allele_Replicate"] = replicate
        # Histogram: replicate count -> number of alleles with that count.
        # NOTE(review): applied once per allele; confirm this matches the
        # intended nesting of the original counter update.
        replicateCount = len(wells)
        replicateDistribution[replicateCount] = (
            replicateDistribution.get(replicateCount, 0) + 1
        )
    metadata = metadata.drop(["plate_well"], axis=1)
    print(replicateDistribution)
    metadata.info()

    # Save resulting metadata
    metadata.to_csv("metadata.csv", index=False)
    dframe = pd.DataFrame({
        "ID": pd.Series(range(len(alleles))),
        "Allele": pd.Series(alleles),
    })
    dframe.to_csv("alleles.csv", index=False)
    dframe = pd.DataFrame({
        "ID": pd.Series(range(len(genes))),
        "Gene": pd.Series(genes),
    })
    dframe.to_csv("genes.csv", index=False)