def main():
    """Merge several SS2 pipeline Loom outputs into one Loom file.

    Command-line options (all required):
        --input-loom-files  one or more input Loom paths
        --output-loom-file  path of the merged Loom file to write
        --plate-sample-id   stored as the ``sample_id`` attribute of the
                            merged file
    """
    cli = argparse.ArgumentParser(
        description="Merge the outputs of multiple SS2 pipeline runs into a single Loom file"
    )

    # (flag, add_argument keyword arguments) for every supported option.
    option_table = (
        ('--input-loom-files', {
            'dest': 'input_loom_files',
            'nargs': "+",
            'required': True,
            'help': "Path to input loom directory in DirectoryStore format",
        }),
        ('--output-loom-file', {
            'dest': 'output_loom_file',
            'required': True,
            'help': "Path to output loom file",
        }),
        ('--plate-sample-id', {
            'dest': 'plate_sample_id',
            'required': True,
            'help': "Plate sample id for output loom",
        }),
    )
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    options = cli.parse_args()

    # File-level attributes attached to the merged Loom file.
    merged_attrs = {'sample_id': options.plate_sample_id}

    loompy.combine(
        options.input_loom_files,
        output_file=options.output_loom_file,
        file_attrs=merged_attrs,
    )
def main():
    """Combine the per-sample velocyto Loom files of a run into one file.

    Command-line options:
        --bamdir  directory holding one sub-directory per sample (required)
        --sample  sample file handed to ``read_sample_file`` (required)
        --output  output name without the ``.loom`` extension
                  (defaults to ``VG_CAMA1_D11_ALL``)
        -v        verbose flag (parsed but not used here)

    Exits with status 2 after printing the usage text when ``--bamdir`` or
    ``--sample`` is missing or empty.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--bamdir')
    parser.add_argument('--sample')
    parser.add_argument('--output')
    parser.add_argument('-v', dest='verbose', action='store_true')
    args = parser.parse_args()

    # Explicit check: the previous bare try/except only fired when an option
    # was absent (len(None)) and silently accepted empty strings.
    if not args.bamdir or not args.sample:
        usage()
        sys.exit(2)

    if not args.output:
        args.output = 'VG_CAMA1_D11_ALL'
    merged_path = '%s.loom' % (args.output)

    samples = read_sample_file(args.sample)

    loomfiles = []
    for sample in sorted(samples.keys()):
        loomfile = '%s/%s/velocyto/%s.loom' % (args.bamdir, sample, sample)
        print(loomfile)
        # Open each input only long enough to report its shape, then close
        # the handle before the file is merged.
        with loompy.connect(loomfile) as ds:
            print(ds.shape)
        loomfiles.append(loomfile)

    loompy.combine(loomfiles, merged_path, key="Accession")

    print("Merge loom files")
    with loompy.connect(merged_path) as ds:
        print(ds.shape)
def main():
    """Merge several SS2 pipeline Loom outputs into one Loom file.

    The merged file receives the file attributes ``batch_id`` and
    ``pipeline_version`` and, only when ``--batch_name`` was supplied,
    ``batch_name``.
    """
    cli = argparse.ArgumentParser(
        description="Merge the outputs of multiple SS2 pipeline runs into a single Loom file"
    )
    cli.add_argument(
        '--input-loom-files',
        dest='input_loom_files',
        nargs="+",
        required=True,
        help="Path to input loom directory in DirectoryStore format",
    )
    cli.add_argument(
        '--output-loom-file',
        dest='output_loom_file',
        required=True,
        help="Path to output loom file",
    )
    cli.add_argument(
        '--batch_id',
        dest='batch_id',
        required=True,
        help="Batch id for output loom",
    )
    cli.add_argument(
        '--batch_name',
        dest='batch_name',
        help='User provided plate id for output loom',
    )
    cli.add_argument(
        '--pipeline_version',
        dest='pipeline_version',
        required=True,
        help='Multisample SS2 version',
    )
    options = cli.parse_args()

    # Mandatory file attributes first; the optional batch name is appended
    # only when the caller provided one.
    merged_attrs = {
        'batch_id': options.batch_id,
        'pipeline_version': options.pipeline_version,
    }
    if options.batch_name is not None:
        merged_attrs['batch_name'] = options.batch_name

    loompy.combine(
        options.input_loom_files,
        output_file=options.output_loom_file,
        file_attrs=merged_attrs,
    )
import os
from argparse import ArgumentParser

import loompy

# Merge one loom file from each input folder into a single output file.
parser = ArgumentParser()
# The output path is mandatory: without it loompy.combine would be handed
# None as its output file.
parser.add_argument("-outf", "--output", dest="outf", required=True,
                    help="output file", metavar="FILE")
parser.add_argument('input', nargs='+', help="input folders")
args = parser.parse_args()

input_dirs = args.input  # not named `input`, which would shadow the builtin
outf = args.outf

filelist = []
for folder in input_dirs:
    # Sort so that the file picked from a folder does not depend on the
    # arbitrary order returned by os.listdir.
    candidates = sorted(name for name in os.listdir(folder) if '.loom' in name)
    if not candidates:
        parser.error("no .loom file found in input folder %r" % folder)
    filelist.append(os.path.join(folder, candidates[0]))

print(filelist)
print(outf)
loompy.combine(files=filelist, output_file=outf, key="Accession")
import argparse
import os

import loompy

parser = argparse.ArgumentParser(
    description='select a group name, for example:"UNC-44-Proximal"')
parser.add_argument(
    "ID", help="merge a group of loom files according to the group name")
args = parser.parse_args()
print(args.ID)

path = "/athena/elementolab/scratch/yah2014/Projects/scRNAseq-Lung/data/velocyto"
# The names returned by os.listdir are relative, so work from inside `path`.
os.chdir(path)
print(os.getcwd())

output_filename = args.ID + "_merged.loom"

# Select every directory entry whose name contains the group ID. The output
# of an earlier run also contains the ID and lives in the same directory, so
# it is excluded to avoid merging the merged file into itself. Sorting makes
# the merge order reproducible.
file_folders = os.listdir(os.getcwd())
files = sorted(
    s for s in file_folders if args.ID in s and s != output_filename
)
print(files)

loompy.combine(files, output_filename, key="Accession")
import os

import loompy
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows

# Read the "fastq" sheet; its first row holds the column names.
wb = load_workbook("doc/20210715_scRNAseq_info.xlsx")
ws = wb["fastq"]
df = pd.DataFrame(ws.values)
df.columns = df.loc[0, :]
df = df.drop([0], axis=0)

# Keep only the GEX libraries of phase PALIBR_I.
df = df[df.Sequence.eq("GEX")]
df = df[df.Phase.eq("PALIBR_I")]
df = df.sort_values(by=['id'])

path = "/athena/elementolab/scratch/yah2014/Projects/scRNAseq-AIM/data/velocyto"
file_folders = os.listdir(path)

# Build the expected file names once, instead of re-deriving the whole array
# (through the deprecated Series.ravel) for every directory entry.
expected_names = {
    "%s.loom" % sample_id for sample_id in df["Sample.id"].dropna()
}
files = [s for s in file_folders if s in expected_names]
print(files)

output_filename = "MCL46_merged.loom"
# `files` holds bare names, so combine must run from inside `path`.
os.chdir(path)
print(os.getcwd())
loompy.combine(files, os.path.join(path, output_filename), key="Accession")
def _to_loom(self):
    """Write a zipped loom file from Redshift query manifests.

    The expression data is first written as one loom file per chunk under
    ``<working_dir>/.loom_parts``. The parts are then combined on the
    ``Accession`` row attribute into a single loom file, which is zipped
    together with ``loom_readme.md`` (read from the current directory).

    Side effects:
        Appends ``.zip`` to ``self.local_output_filename`` when missing and
        recreates/removes the ``.loom_parts`` directory.

    Returns:
        output_path: Path to the zip archive containing the new loom file.
    """
    # Put .zip on the output filename if it's not already there.
    zip_suffix = ".zip"
    if not self.local_output_filename.endswith(zip_suffix):
        self.local_output_filename += zip_suffix
    # Remove exactly the suffix. str.rstrip(".zip") strips any trailing run
    # of the characters '.', 'z', 'i', 'p', which mangles names such as
    # "data_zip.zip".
    loom_filename = self.local_output_filename[:-len(zip_suffix)]

    # Read the row (gene) attributes and then set some conventional names
    gene_df = self._load_gene_table()
    gene_df["featurekey"] = gene_df.index
    row_attrs = gene_df.to_dict("series")
    # Not expected to be unique
    row_attrs["Gene"] = row_attrs.pop("featurename")
    row_attrs["Accession"] = row_attrs.pop("featurekey")
    # loompy is given plain arrays rather than pandas Series.
    for key, val in row_attrs.items():
        row_attrs[key] = val.values

    loom_parts = []
    loom_part_dir = os.path.join(self.working_dir, ".loom_parts")
    # Start from an empty parts directory so stale parts are never merged.
    if os.path.exists(loom_part_dir):
        shutil.rmtree(loom_part_dir)
    os.makedirs(loom_part_dir)

    # Iterate over the "slices" produced by the redshift query
    for slice_idx in range(self._n_slices()):

        # Get the cell metadata for all the cells in this slice
        cell_df = self._load_cell_table_slice(slice_idx)

        # Iterate over fixed-size chunks of expression data from this
        # slice. The counter only advances for chunks that produce a part.
        chunk_idx = 0
        for chunk in self._load_expression_table_slice(slice_idx):
            print(f"Loading chunk {chunk_idx} from slice {slice_idx}")
            sparse_cell_dfs = []

            # Group the data by cellkey and iterate over each cell
            for _, single_cell_df in chunk.groupby("cellkey"):
                # Reshape the dataframe so cellkey is a column and features
                # are rows. Reindex so all dataframes have the same row
                # order, and then sparsify because this is a very empty
                # dataset usually.
                sparse_cell_dfs.append(
                    single_cell_df.pivot(
                        index="featurekey",
                        columns="cellkey",
                        values="exprvalue",
                    ).reindex(index=row_attrs["Accession"]).to_sparse())

            # Concatenate the cell dataframes together. This is what we'll
            # write to disk.
            if not sparse_cell_dfs:
                continue
            sparse_expression_matrix = pandas.concat(
                sparse_cell_dfs, axis=1, copy=True)

            # Get the cell metadata dataframe for just the cell in this
            # chunk
            chunk_cell_df = cell_df.reindex(
                index=sparse_expression_matrix.columns)
            chunk_cell_df["cellkey"] = chunk_cell_df.index
            # Categorical columns are converted to plain objects before
            # being handed to loompy.
            for col in chunk_cell_df.columns:
                if chunk_cell_df[col].dtype.name == "category":
                    chunk_cell_df[col] = chunk_cell_df[col].astype("object")
            col_attrs = chunk_cell_df.to_dict("series")
            col_attrs["CellID"] = col_attrs.pop("cellkey")

            # Same Series -> array conversion as for the row attributes.
            for key, val in col_attrs.items():
                col_attrs[key] = val.values

            # Write the data from this chunk to its own file.
            loom_part_path = os.path.join(
                loom_part_dir, f"matrix.{slice_idx}.{chunk_idx}.loom")
            print(f"Writing to {loom_part_path}")
            loompy.create(loom_part_path,
                          sparse_expression_matrix.to_coo(),
                          row_attrs, col_attrs)
            loom_parts.append(loom_part_path)
            chunk_idx += 1

    # Using the loompy method, combine all the chunks together into a
    # single file.
    loom_path = os.path.join(self.working_dir, loom_filename)
    print(f"Parts complete. Writing to {loom_filename}")
    loompy.combine(loom_parts, key="Accession", output_file=loom_path)
    shutil.rmtree(loom_part_dir)

    zip_path = os.path.join(self.working_dir, self.local_output_filename)
    # The context manager closes the archive even if a write fails.
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        zipf.write(loom_path, arcname=loom_filename)
        zipf.write("loom_readme.md")

    return zip_path
import os
import sys

import loompy

if __name__ == '__main__':
    # Expect at least one input plus the output path. With fewer arguments
    # the slicing below would treat the script name (or the only input) as
    # the output file.
    if len(sys.argv) < 3:
        sys.exit("usage: %s INPUT.loom [INPUT.loom ...] OUTPUT.loom"
                 % sys.argv[0])

    loomfiles = sys.argv[1:-1]
    output_filepath = sys.argv[-1]
    loompy.combine(loomfiles, output_filepath, key='Genes')
import loompy

# Input loom files, merged on the "Accession" row attribute.
# (Named loom_files rather than `list` so the builtin is not shadowed.)
loom_files = ['loom1.loom', 'loom2.loom', 'loom3.loom']
loompy.combine(loom_files, 'Combined.loom', key="Accession")
from sklearn.neighbors import NearestNeighbors
import igraph #couldn't load
from numpy_groupies import aggregate, aggregate_np #couldn't load
import loompy

################################################################################
# Step 0: Combine several loom objects                                         #
################################################################################

# Per-sample loom files to merge into a single file.
files = ["/pub/smorabit/velo/possorted_genome_bam_82GD0.loom", \
         "/pub/smorabit/velo/possorted_genome_bam_QKKJQ.loom", \
         "/pub/smorabit/velo/possorted_genome_bam_ZRA06.loom", \
         "/pub/smorabit/velo/possorted_genome_bam_5AUZJ.loom", \
         "/pub/smorabit/velo/possorted_genome_bam_GZ7KV.loom", \
         "/pub/smorabit/velo/possorted_genome_bam_DZN3A.loom"]

# Merge on the "Accession" row attribute.
loompy.combine(files, "/pub/smorabit/velo/merged.loom", key="Accession")

################################################################################
# Step 1: Process raw data                                                     #
################################################################################

# load merged loom file:
# NOTE(review): `vcy` is not imported in the visible part of this script;
# presumably `import velocyto as vcy` is expected - confirm.
ind = vcy.VelocytoLoom("/pub/smorabit/velo/merged.loom")
ind_name = "merged"
print_dir = "/pub/smorabit/velo/figures/"

# %%read in metadata file from Seurat
# NOTE(review): `pd` is not imported in the visible part of this script;
# presumably `import pandas as pd` is expected - confirm.
metadata = pd.read_table("/pub/smorabit/velo/Norm.BRCA.Combined.Seurat.Meta.Data.Object.txt")
# Index the metadata by its "barcode" column.
metadata.set_index("barcode", inplace=True)

# rename barcodes to match seurat metadata:
import loompy

# `snakemake` is not defined in this script; presumably it is injected by
# Snakemake when the file is run through a rule's `script:` directive.
loom_inputs = snakemake.params.loomfilenames
merged_output = snakemake.output[0]

print(loom_inputs)
# Merge on the "Accession" row attribute.
loompy.combine(loom_inputs, merged_output, key="Accession")
import os
import loompy
import glob

# `snakemake` is not defined in this script; presumably it is injected by
# Snakemake when the file is run through a rule's `script:` directive.

# Inputs of the rule, e.g. ["file1.loom", "file2.loom", ...]
loom_inputs = snakemake.input
# First declared output of the rule, passed to loompy as a plain string.
merged_path = str(snakemake.output[0])

# Combine all input loom files into the single output file.
loompy.combine(loom_inputs, merged_path)
import sys

import loompy

# usage: script.py file1.loom file2.loom file3.loom merged.loom
#
# All arguments but the last are inputs; the last one is the output path.
# Require at least one input and the output: with fewer arguments the
# slicing below would use the script name (or the only input) as the output.
if len(sys.argv) < 3:
    sys.exit("usage: %s file1.loom [file2.loom ...] merged.loom" % sys.argv[0])

loompy.combine(sys.argv[1:-1], sys.argv[-1])
# ------------------------------------------------------------------ #
def get_filepaths(file):
    """Return every whitespace-separated token found in a text file.

    Args:
        file: Path of a text file containing file paths separated by spaces
            and/or newlines.

    Returns:
        List of the paths in the order they appear in the file.
    """
    with open(file, "r") as handle:
        # Splitting the whole content on whitespace yields the same tokens,
        # in the same order, as splitting line by line.
        return handle.read().split()
# ------------------------------------------------------------------ #


# ------------------------------------------------------------------ #
if __name__ == '__main__':
    cli = argparse.ArgumentParser(description='Convert STAR mtx to loom')
    cli.add_argument('--input_files', action="store", dest="input_files",
                     nargs='+')
    cli.add_argument('--output_file', action="store", dest="output_file")
    options = cli.parse_args()
    # -------------------------------------------------------------- #

    # Merge all inputs on the 'gene_id' row attribute.
    loompy.combine(options.input_files, options.output_file, key='gene_id')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 30 13:45:39 2020

The version of loompy should be consistent with the one used by velocyto;
otherwise an error is returned.

@author: jingkui.wang
"""
import glob
import os

import loompy

RAW_NGS_DIR = "/Volumes/groups/cochella/git_aleks_jingkui/scRNAseq_MS_lineage/data/raw_ngs_data"


def _run_looms(run_id):
    """Return (input loom files, merged output path) for one sequencing run.

    The merged file is written into the same LOOMS directory that is globbed
    for inputs, so it is excluded from the inputs; otherwise a re-run would
    merge the previous result into itself. Inputs are sorted so the merge
    order is reproducible.
    """
    loom_dir = os.path.join(RAW_NGS_DIR, run_id, "LOOMS")
    merged_path = os.path.join(loom_dir, run_id + "_merged.loom")
    inputs = sorted(
        p for p in glob.glob(os.path.join(loom_dir, "*.loom"))
        if p != merged_path
    )
    return inputs, merged_path


for run_id in ("S117008_R9533", "S117007_R9533", "S117009_R9533",
               "S124890_R9968"):
    files, merged_path = _run_looms(run_id)
    loompy.combine(files, merged_path)

# folder S124889_R9968
files, _ = _run_looms("S124889_R9968")
def main(argv):
    """Run the Seurocity workflow in the current (or given) working directory.

    Steps: run ``extractSeurat.R`` on the files in ``./inputs``, load the
    per-sample loom files, attach the Seurat data, save one processed loom
    file per sample to ``./outputs/proc_loom``, combine those into
    ``./outputs/comb_loom/combined.loom`` and add the PCA variance and
    loadings to the combined file.

    Args:
        argv: Command-line arguments without the program name. Options:
            ``-h`` show help and exit; ``-l`` show license and exit;
            ``-r DIR`` passed to ``run_rscript`` as ``rscript_path``;
            ``-w DIR`` change into this working directory;
            ``-i FILE`` name of the input file inside ``./inputs``
            (default ``input.rds``).

    Exits with status 1 on invalid options or missing required files.
    """
    print("SEUROCITY v1.0.0, (c) 2020 Richard A. Guyer, MD, PhD\n")

    # default for rscript_dir to pass to run_rscript function
    rscript_dir = None
    input_file = "input.rds"

    # handle arguments to adjust default settings
    try:
        # "i:" must be part of the spec, otherwise -i is rejected by getopt
        # and the handler below can never run.
        opts, _ = getopt.getopt(argv, "hli:r:w:")
    except getopt.GetoptError:
        arg_error()
        sys.exit(1)
    for opt, arg in opts:
        # Compare with ==: `opt in ("-l")` is a substring test on a string,
        # not a tuple membership test.
        if opt == '-h':
            display_help()
            sys.exit(0)
        elif opt == '-l':
            display_license()
            sys.exit(0)
        elif opt == '-r':
            rscript_dir = arg
        elif opt == '-w':
            os.chdir(arg)
        elif opt == '-i':
            input_file = arg

    working_dir = os.getcwd() + "/"
    input_dir = working_dir + "inputs/"
    output_dir = working_dir + "outputs/"

    # check for files required by extractSeurat.R, run if all are present
    required_files_for_R = [input_dir + input_file,
                            input_dir + "idents.txt",
                            input_dir + "reductions.txt",
                            input_dir + "append.txt",
                            working_dir + "extractSeurat.R"]
    if not files_exist(required_files_for_R):
        print("ERROR: Critical files not found in expected locations")
        print("Please ensure proper input file structure")
        print("For help: python Seurocity.py -h")
        print("")
        sys.exit(1)
    run_rscript(working_dir + "extractSeurat.R",
                [input_file, "idents.txt", "reductions.txt", "append.txt"],
                rscript_path=rscript_dir)

    # get sample IDs and reductions output by Rscript
    sample_ids = get_ids()
    reductions = get_reductions()

    # check whether expected loom files exist
    expected_looms = [input_dir + ident + ".loom" for ident in sample_ids]
    if not files_exist(expected_looms):
        print("ERROR: Expected loom files not found in ./inputs")
        print("Please ensure proper input file structure")
        print("For help: python Seurocity.py -h")
        print("")
        sys.exit(1)
    print("\nLoading files and processing AnnData objects, this may take a few minutes")
    samples = load_samples(sample_ids)
    samples = load_pca_data(samples)
    samples = import_seur_data(samples, sample_ids, reductions)

    # ensure every sample has the same list of genes
    if len(sample_ids) > 1:
        samples = same_genes(samples, sample_ids)

    # save main AnnData object for each sample as a loom file
    print("\nSaving main AnnData for each sample as loom files")
    if os.path.exists(output_dir + "proc_loom"):
        print("- WARNING: ./outputs/proc_loom/ exists, files may be overwritten")
    else:
        os.makedirs(output_dir + "proc_loom")
    processed_files = []
    for s in sample_ids:
        savename = output_dir + "proc_loom/" + s + "_proc.loom"
        print("- Saving sample " + s + " to: " + savename)
        # currently an ArrayView, need to make into numpy array
        samples[s]['main'].varm['PCs'] = np.asarray(samples[s]['main'].varm['PCs'])
        samples[s]['main'].write_loom(savename, write_obsm_varm=True)
        processed_files.append(savename)

    # remove samples to clean up memory
    del samples

    # generate combined loom file and import pca data
    # (this message was previously assigned but never printed)
    print("\nGenerating combined loom file with PCA data loaded")
    if os.path.exists(output_dir + "comb_loom"):
        print("- WARNING: ./outputs/comb_loom/ exists, combined.loom will be overwritten if it already exists")
    else:
        os.makedirs(output_dir + "comb_loom")
    # Combine exactly the files written above. Listing the proc_loom
    # directory would also pick up stale files left by an earlier run.
    lp.combine(processed_files, output_dir + "comb_loom/combined.loom")

    # load combined loom and pca data files
    combined = scv.read(output_dir + "comb_loom/combined.loom", cache=False)
    pca_var = scv.read(output_dir + "seurat_dat/seur_pca_var.csv")
    pca_load = scv.read(output_dir + "seurat_dat/seur_pca_loadings.csv")

    # variance data: the first two rows of the csv, keyed by their row labels
    combined.uns['pca'] = {}
    combined.uns['pca'][pca_var.obs.index[0]] = pca_var.X[0]
    combined.uns['pca'][pca_var.obs.index[1]] = pca_var.X[1]

    # pca loadings, reordered to the gene order of the combined object
    genes = combined.var.index.tolist()
    combined.varm['PCs'] = np.asarray(pca_load[genes, :].X)

    # save combined, now containing pca loadings and variance data
    combined.write_loom(output_dir + "comb_loom/combined.loom", write_obsm_varm=True)
import os

import loompy

root = "/projects/nehard/SingleCell/Jupyter/Velocity/"

# Input files, given relative to `root`. Entries must not start with "/":
# os.path.join discards `root` when a later component is absolute, which is
# what happened to the third entry before ("/your_path/object3.loom").
files = [
    os.path.join(root, x)
    for x in [
        "./your_path/object1.loom",
        "./your_path/object2.loom",
        "./your_path/object3.loom",
    ]
]

loompy.combine(files, "Comb.loom")
def run(self) -> None:
    """Validate the input sample files, combine them and annotate the result.

    For every input loom file the per-column totals and non-zero counts are
    stored as ``Total`` / ``NGenes`` and, when both mito and ribo genes are
    found, a ``MitoRiboRatio`` is added. The samples are then combined on
    ``Accession`` into the task's temporary output file, where gene and cell
    validity flags are set and, if a pickled classifier exists, the
    ``Class``, ``Subclass`` and ``ClassProbability_*`` column attributes are
    written (otherwise ``Class``/``Subclass`` are set to "Unknown").
    """
    # Load metadata
    # NOTE(review): metadata / meta_attrs are loaded here but not used
    # anywhere in the visible body of this method.
    metadata: np.ndarray = None
    meta_attrs: np.ndarray = None
    metadata_file = os.path.join(am.paths().samples, "metadata", "metadata.xlsx")
    if os.path.exists(metadata_file):
        temp = pd.read_excel(metadata_file)
        meta_attrs = temp.columns.values
        metadata = temp.values

    with self.output().temporary_path() as out_file:
        attrs = {"title": self.tissue}
        # One 0/1 array per sample, concatenated after the merge.
        valid_cells = []
        sample_files = [s.fn for s in self.input()]
        # NOTE(review): validity flags are collected in sorted() order, but
        # loompy.combine below receives the unsorted sample_files. If the two
        # orders differ, the concatenated flags would not line up with the
        # merged columns - confirm the inputs are already sorted.
        for sample in sorted(sample_files):
            # Connect and perform file-specific cell validation
            with loompy.connect(sample) as ds:
                logging.info("Marking invalid cells")
                # Per-column sum and count of non-zero entries.
                (mols, genes) = ds.map([np.sum, np.count_nonzero], axis=1)
                # Valid: at least 600 in total and a total / non-zero-count
                # ratio of at least 1.2.
                valid_cells.append(
                    np.logical_and(mols >= 600,
                                   (mols / genes) >= 1.2).astype('int'))
                ds.ca.Total = mols
                ds.ca.NGenes = genes

                logging.info("Computing mito/ribo ratio for " + sample)
                # Rows selected by gene-name prefix: "mt-" for mito,
                # "Rpl" / "Rps" for ribo.
                mito = np.where(npstr.startswith(ds.ra.Gene, "mt-"))[0]
                ribo = np.where(npstr.startswith(ds.ra.Gene, "Rpl"))[0]
                ribo = np.union1d(
                    ribo,
                    np.where(npstr.startswith(ds.ra.Gene, "Rps"))[0])
                if len(ribo) > 0 and len(mito) > 0:
                    mitox = ds[mito, :]
                    ribox = ds[ribo, :]
                    # +1 on both sides keeps the ratio finite when the ribo
                    # sum is zero.
                    ratio = (mitox.sum(axis=0) + 1) / (ribox.sum(axis=0) + 1)
                    ds.ca.MitoRiboRatio = ratio

        logging.info("Creating combined loom file")
        loompy.combine(sample_files,
                       out_file,
                       key="Accession",
                       file_attrs=attrs)

        # Validating genes
        logging.info("Marking invalid genes")
        with loompy.connect(out_file) as ds:
            vgpath = os.path.join(am.paths().build, "genes.txt")
            if os.path.exists(vgpath):
                valids = np.zeros(ds.shape[0])
                with open(vgpath, "r") as f:
                    # NOTE(review): only the first line of genes.txt is read,
                    # so at most one accession gets a non-zero flag - confirm
                    # whether every line was meant to be processed.
                    line = f.readline()
                    items = line[:-1].split("\t")
                    valids[np.where(ds.Accession == items[0])] = int(
                        items[1])
                # NOTE(review): this branch writes "_Valids" while the else
                # branch writes "_Valid" - possibly a typo, confirm.
                ds.set_attr("_Valids", valids, axis=0)
            else:
                # Count of non-zero entries per row; a gene is valid when
                # that count is above 10 and below 60% of the columns.
                nnz = ds.map([np.count_nonzero], axis=0)[0]
                valid_genes = np.logical_and(nnz > 10,
                                             nnz < ds.shape[1] * 0.6)
                ds.set_attr("_Valid", valid_genes, axis=0)

            logging.info("Marking invalid cells")
            ds.set_attr("_Valid", np.concatenate(valid_cells), axis=1)
            n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
            n_total = ds.shape[1]
            logging.info("%d of %d cells were valid", n_valid, n_total)

            classifier_path = os.path.join(am.paths().samples, "classified",
                                           "classifier.pickle")
            if os.path.exists(classifier_path):
                logging.info("Classifying cells by major class")
                # NOTE(review): pickle.load executes arbitrary code from the
                # file; only safe if classifier.pickle is trusted.
                with open(classifier_path, "rb") as f:
                    clf = pickle.load(f)  # type: cg.Classifier
                # Fixed seed before predicting.
                np.random.seed(13)
                (classes, probs, class_labels) = clf.predict(ds,
                                                             probability=True)

                # Classifier label -> pooled major class. None marks labels
                # that end up as "Excluded" below.
                mapping = {
                    "Astrocyte": "Astrocytes",
                    "Astrocyte,Cycling": "Astrocytes",
                    "Astrocyte,Immune": None,
                    "Astrocyte,Neurons": None,
                    "Astrocyte,Oligos": None,
                    "Astrocyte,Vascular": None,
                    "Bergmann-glia": "Astrocytes",
                    "Blood": "Blood",
                    "Blood,Cycling": "Blood",
                    "Blood,Vascular": None,
                    "Enteric-glia": "PeripheralGlia",
                    "Enteric-glia,Cycling": "PeripheralGlia",
                    "Ependymal": "Ependymal",
                    "Ex-Astrocyte": None,
                    "Ex-Blood": None,
                    "Ex-Immune": None,
                    "Ex-Neurons": None,
                    "Ex-Oligos": None,
                    "Ex-Vascular": None,
                    "Immune": "Immune",
                    "Immune,Neurons": None,
                    "Immune,Oligos": None,
                    "Neurons": "Neurons",
                    "Neurons,Cycling": "Neurons",
                    "Neurons,Immune": None,
                    "Neurons,Oligos": None,
                    "Neurons,Satellite-glia": None,
                    "OEC": "Astrocytes",
                    "Oligos": "Oligos",
                    "Oligos,Cycling": "Oligos",
                    "Oligos,Immune": None,
                    "Oligos,Vascular": None,
                    "Satellite-glia": "PeripheralGlia",
                    "Satellite-glia,Cycling": "PeripheralGlia",
                    "Schwann": "PeripheralGlia",
                    "Schwann,Cycling": "PeripheralGlia",
                    "Satellite-glia,Schwann": None,
                    "Ttr": "Ependymal",
                    "Vascular": "Vascular",
                    "Vascular,Cycling": "Vascular",
                    "Neurons,Vascular": None,
                    "Vascular,Oligos": None,
                    "Satellite-glia,Vascular": None,
                    "Unknown": None,
                    "Outliers": None
                }

                # str(None) is "None"; those entries are relabelled as
                # "Excluded" a few lines below.
                classes_pooled = np.array(
                    [str(mapping[c]) for c in classes], dtype=np.object_)
                # mask invalid cells
                classes[ds.col_attrs["_Valid"] == 0] = "Excluded"
                classes_pooled[ds.col_attrs["_Valid"] == 0] = "Excluded"
                classes_pooled[classes_pooled == "None"] = "Excluded"
                ds.set_attr("Class", classes_pooled.astype('str'), axis=1)
                ds.set_attr("Subclass", classes.astype('str'), axis=1)
                # One probability column attribute per classifier label.
                for ix, cls in enumerate(class_labels):
                    ds.set_attr("ClassProbability_" + str(cls),
                                probs[:, ix],
                                axis=1)
            else:
                logging.info(
                    "No classifier found in this build directory - skipping."
                )
                ds.set_attr("Class",
                            np.array(["Unknown"] * ds.shape[1]),
                            axis=1)
                ds.set_attr("Subclass",
                            np.array(["Unknown"] * ds.shape[1]),
                            axis=1)
def merge_looms(loom_paths, output_path):
    """Combine the loom files in ``loom_paths`` into ``output_path``.

    Thin wrapper around ``loompy.combine`` using its default settings
    (no merge key, no extra file attributes).
    """
    loompy.combine(files=loom_paths, output_file=output_path)