def write_specimen_file(specimen_field, specimen_value, project_short_name,
                        output_dir, entry_id, full_file=None):
    dataset_name = 'FIELD_{}_VALUE_{}'.format(
        specimen_field, specimen_value).replace(' ', '_').replace('/', '_')
    r = send_get('dataset')
    datasets = json.loads(r.text)['_items']
    for dataset in datasets:
        # Use '==' for string comparison; 'is' tests identity, not equality.
        if dataset['name'] == dataset_name:
            bioblocks_log(
                'Dataset with name \'{}\' already exists, skipping!'.format(
                    dataset_name))
            return

    es_query = generate_es_query(project_short_name, specimen_field,
                                 specimen_value)
    r = session.post(
        url=HCA_DSS_URL,
        data=json.dumps(es_query),
        headers={'Content-type': 'application/json'},
        timeout=None,
    )
    time.sleep(5)
    results = json.loads(r.text)['results']
    if len(results) == 0:
        bioblocks_log(
            'Unable to find HCA data for dataset with short name \'{}\' and field:value pair \'{}:{}\''
            .format(project_short_name, specimen_field, specimen_value))
        return

    bioblocks_log('Creating specimen dataset \'{}\''.format(dataset_name))
    specimen_uuid = str(uuid.uuid4())
    create_dataset(specimen_uuid, dataset_name, [entry_id])

    output_dir = '{}/{}'.format(output_dir, specimen_uuid)
    create_directory(output_dir)
    results_file = '{}/{}_fqids.tsv'.format(output_dir, dataset_name)
    with open(results_file, 'w') as out_file:
        while True:
            write_bundle_results(results, out_file, full_file)
            # Follow pagination via the 'Link' response header until no
            # further pages are returned.
            if 'Link' in r.headers:
                next_header = r.headers['Link'][1:r.headers['Link'].index('&output')]
                r = session.post(
                    url=next_header,
                    data=json.dumps(es_query),
                    headers={'Content-type': 'application/json'},
                    timeout=None)
                results = json.loads(r.text)['results']
                bioblocks_log('Found {} results'.format(len(results)))
            else:
                break
    bioblocks_log('Finished writing results file \'{}\''.format(results_file))
def analyze_dataset(dataset):
    dataset_analyses = dataset['analyses']
    dataset_id = dataset['_id']
    pca_location = ''
    for analysis in dataset_analyses:
        if analysis['processType'] == 'TSNE':
            bioblocks_log(
                'T-SNE process already found for dataset \'{}\', skipping'.
                format(dataset_id))
            return
        elif analysis['processType'] == 'SPRING':
            # Reuse the normalized counts written by a previous SPRING analysis.
            pca_location = 'files/datasets/{}/analyses/{}/counts_norm.npz'.format(
                dataset_id, analysis['_id'])

    if pca_location == '':
        bioblocks_log(
            'Unable to find counts_norm.npz for dataset \'{}\', skipping T-SNE'
            .format(dataset_id))
        return

    analysis_id = str(uuid.uuid4())
    start_time = datetime.utcnow()
    bioblocks_log('Starting T-SNE analysis \'{}\' for dataset \'{}\''.format(
        analysis_id, dataset_id))
    try:
        npz_file = portal_spring_upload_functions.load_npz(pca_location)
    except Exception as e:
        bioblocks_log(e)
        return

    data = portal_spring_helper.get_pca(npz_file, keep_sparse=True, normalize=False)
    tsne = TSNE(n_jobs=4)
    Y = tsne.fit_transform(data)
    end_time = datetime.utcnow()
    bioblocks_log(
        'Finished T-SNE analysis \'{}\' for dataset \'{}\'. Duration: {}'
        .format(analysis_id, dataset_id, end_time - start_time))

    output_dir = create_analysis_directory(
        'files/datasets/{}'.format(dataset_id), analysis_id)
    np.savetxt('{}/tsne_matrix.csv'.format(output_dir), Y, delimiter=',')
    analysis = {
        '_id': analysis_id,
        'process_type': 'TSNE',
        'name': '{} - {}'.format(dataset['name'], 'TSNE')
    }
    post_bioblocks_analysis(analysis)
    patch_analysis_for_dataset(dataset, analysis_id)
def send_get(endpoint, timeout=None):
    url = bioblocks_api_url.format(endpoint)
    bioblocks_log('Sending GET to \'{}\''.format(url))
    time.sleep(4)
    r = session.get(
        url=url,
        timeout=timeout,
        verify=verify,
    )
    bioblocks_log('Received status code \'{}\' from GET'.format(r.status_code))
    return r
def start_getting_bundles(args):
    response = session.get(url=HCA_PROJECT_URL, timeout=None)
    hits = json.loads(response.text)['hits']
    output_dir = '{}/files/datasets'.format(path)
    bioblocks_log('Using output_dir: \'{}\''.format(output_dir))
    for hit in hits:
        entry_id = hit['entryId']
        projects = hit['projects']
        specimens = hit['specimens']
        for project in projects:
            process_project(project, specimens, entry_id, args, output_dir)
def start_analysis():
    dataset_url = 'dataset?embedded={"analyses":1}'
    r = send_get(dataset_url)
    if not r.ok:
        bioblocks_log('Error getting dataset analyses: {}'.format(r.text))
    else:
        datasets = json.loads(r.text)['_items']
        for dataset in datasets:
            if 'matrixLocation' in dataset:
                analyze_dataset(dataset)
            else:
                bioblocks_log('Dataset {} has no matrix'.format(dataset['_id']))
def create_dataset(_id, name, derived_from):
    r = send_post(
        'dataset',
        json.dumps({
            '_id': _id,
            'derivedFrom': derived_from,
            'name': name,
        }))
    if r.ok:
        bioblocks_log('Successfully created dataset \'{}\''.format(_id))
    else:
        bioblocks_log('Error creating dataset \'{}\': \'{}\''.format(_id, r.text))
def send_delete(endpoint, headers, timeout=None):
    url = bioblocks_api_url.format(endpoint)
    time.sleep(4)
    bioblocks_log('Sending DELETE to: \'{}\''.format(url))
    r = session.delete(
        url=url,
        headers=headers,
        timeout=timeout,
        verify=verify,
    )
    bioblocks_log('Received status code \'{}\' from DELETE'.format(r.status_code))
    return r
def send_patch(endpoint, data, headers, timeout=None):
    url = bioblocks_api_url.format(endpoint)
    bioblocks_log('Sending PATCH to \'{}\''.format(url))
    time.sleep(4)
    r = session.patch(
        url=url,
        data=data,
        headers=headers,
        timeout=timeout,
        verify=verify,
    )
    bioblocks_log('Received status code \'{}\' from PATCH'.format(r.status_code))
    return r
def patch_dataset_for_job(request_id, result_location, associated_dataset):
    r = send_patch('{}/{}'.format('dataset', associated_dataset['dataset']),
                   json.dumps({
                       'matrixLocation': result_location,
                   }), {
                       'Content-type': 'application/json',
                       'If-Match': associated_dataset['etag']
                   })
    if r.ok:
        bioblocks_log('Successfully patched job \'{}\''.format(request_id))
        return json.loads(r.text)['_etag']
    else:
        bioblocks_log('Error patching job \'{}\': \'{}\''.format(request_id, r.text))
def send_post(endpoint, data, headers=None, timeout=None):
    # Avoid a mutable default argument; fall back to the JSON content type.
    if headers is None:
        headers = {'Content-type': 'application/json'}
    url = bioblocks_api_url.format(endpoint)
    bioblocks_log('Sending POST to \'{}\''.format(url))
    time.sleep(4)
    r = session.post(
        url=url,
        data=data,
        headers=headers,
        timeout=timeout,
        verify=verify,
    )
    bioblocks_log('Received status code \'{}\' from POST'.format(r.status_code))
    return r
def create_bioblocks_job(request_id, dataset):
    r = send_post(
        'job',
        json.dumps({
            '_id': request_id,
            'associatedDataset': {
                'dataset': dataset['_id'],
                'etag': dataset['_etag']
            },
            'link': '{}/{}'.format(MATRIX_SERVICE_V0_URL, request_id),
            'status': 'IN_PROGRESS'
        }))
    if r.ok:
        bioblocks_log('Successfully created job \'{}\''.format(request_id))
    else:
        bioblocks_log('Error creating job \'{}\': \'{}\''.format(request_id, r.text))
def derive_es_query_match_field(match_key):
    match_dict = {
        'id': 'files.specimen_from_organism_json.biomaterial_core.biomaterial_id',
        'genusSpecies': 'files.cell_suspension_json.genus_species.text',
        'organ': 'files.specimen_from_organism_json.organ.text',
        'organPart': 'files.specimen_from_organism_json.organ_part.text',
        'organismAge': 'files.donor_organism_json.organism_age',
        'biologicalSex': 'files.donor_organism_json.sex',
        'disease': 'files.donor_organism_json.diseases.text',
        'preservationMethod': 'files.specimen_from_organism_json.preservation_storage.preservation_method',
        'source': 'files.library_preparation_protocol_json.nucleic_acid_source'
    }
    if match_key in match_dict:
        return match_dict[match_key]
    bioblocks_log('Unhandled specimen field \'{}\'!'.format(match_key))
    return match_key
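# The 'generate_es_query' helper called from write_specimen_file is not shown
# in this section. Below is a minimal sketch of what it might look like,
# assuming the HCA DSS search API's {'es_query': ...} request shape and the
# field mapping above; the exact query structure is an assumption, not the
# confirmed implementation.
def generate_es_query_sketch(project_short_name, specimen_field, specimen_value):
    # Match bundles belonging to the project AND the specimen field/value pair.
    return {
        'es_query': {
            'query': {
                'bool': {
                    'must': [
                        {'match': {
                            'files.project_json.project_core.project_short_name':
                            project_short_name
                        }},
                        {'match': {
                            derive_es_query_match_field(specimen_field):
                            specimen_value
                        }},
                    ]
                }
            }
        }
    }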
def send_hca_matrix_job_request(dataset):
    """ V0
    bundle_fqids_url = '{}/datasets/{}/{}_fqids.tsv'.format(TSV_PUBLIC_URL, dataset['_id'], dataset['name'])
    bioblocks_log('POST-ing matrix job with bundle_fqid_url=\'{}\''.format(bundle_fqids_url))
    r = session.post(
        url=MATRIX_SERVICE_V0_URL,
        data=json.dumps({
            'bundle_fqids_url': bundle_fqids_url,
            'format': 'mtx',
        }),
        headers={'Content-type': 'application/json'},
        timeout=None
    )
    """
    r = session.post(
        url=MATRIX_SERVICE_V1_URL,
        data=json.dumps({
            'fields': ['specimen_from_organism.provenance.document_id'],
            'filter': {
                'op': '=',
                'value': dataset['name'],
                'field': 'project.project_core.project_short_name'
            },
            'format': 'mtx',
        }),
        headers={'Content-type': 'application/json'},
        timeout=None)
    bioblocks_log('Returned status from matrix service: {}'.format(r.status_code))
    if r.ok:
        result = json.loads(r.text)
        request_id = result['request_id']
        associated_dataset = {
            'dataset': dataset['_id'],
            'etag': dataset['_etag']
        }
        if result['status'] == 'In Progress':
            dataset['_etag'] = patch_dataset_for_job(
                request_id, 'IN_PROGRESS - CHECK JOB {}'.format(request_id),
                associated_dataset)
            create_bioblocks_job(request_id, dataset)
        elif result['status'] == 'Failed':
            bioblocks_log('Matrix job failed with message: \'{}\''.format(
                result['message']))
        else:
            bioblocks_log('Unhandled status \'{}\' from matrix service'.format(
                result['status']))
    else:
        bioblocks_log(r.text)
def patch_matrix_info_for_dataset(dataset, mtx_info, matrix_location=None):
    dataset_id = dataset['_id']
    dataset_etag = dataset['_etag']
    if matrix_location is None:
        matrix_location = dataset['matrixLocation']
    r = send_patch('{}/{}'.format('dataset', dataset_id),
                   json.dumps({
                       'matrixInfo': {
                           'colCount': mtx_info[1],
                           'rowCount': mtx_info[0]
                       },
                       'matrixLocation': matrix_location,
                   }), {
                       'Content-type': 'application/json',
                       'If-Match': dataset_etag
                   })
    bioblocks_log('Returned status from dataset PATCH of matrix info: {}'.format(r.status_code))
    if not r.ok:
        bioblocks_log(r.text)
        return dataset_etag
    else:
        return json.loads(r.text)['_etag']
def delete_directory(old_dir):
    bioblocks_log('Deleting directory \'{}\''.format(old_dir))
    try:
        shutil.rmtree(old_dir)
        bioblocks_log('Deleted directory \'{}\''.format(old_dir))
    except Exception:
        bioblocks_log('Directory \'{}\' doesn\'t exist, skipping!'.format(old_dir))
def create_directory(new_dir):
    bioblocks_log('Creating directory \'{}\''.format(new_dir))
    try:
        os.mkdir(new_dir)
        bioblocks_log('Created new directory \'{}\''.format(new_dir))
    except Exception:
        bioblocks_log('Directory \'{}\' already exists, skipping!'.format(new_dir))
def check_bioblocks_jobs():
    """ Determine if each job has either finished or taken too long to finish. """
    r = send_get('job')
    jobs = json.loads(r.text)['_items']
    for job in jobs:
        # Bind job_id before the try block so the except clause can always log it.
        job_id = job['_id']
        try:
            handle_hca_matrix_job_status(job)
            started = job['_created']
            job_age_days = days_between_dates(
                datetime.datetime.utcnow(),
                datetime.datetime.strptime(started, RFC_1123_FORMAT))
            if job_age_days >= MAX_DAYS_JOB_KEEP_ALIVE:
                delete_bioblocks_job(job)
            else:
                bioblocks_log(
                    'Not deleting job \'{}\', less than {} days old'.format(
                        job_id, MAX_DAYS_JOB_KEEP_ALIVE))
        except Exception as e:
            bioblocks_log('Exception checking job id \'{}\': {}'.format(job_id, e))
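# 'days_between_dates' is referenced above but not defined in this section.
# A minimal sketch, assuming it returns the absolute whole-day difference
# between two datetimes (hypothetical helper, for illustration only):
def days_between_dates_sketch(d1, d2):
    return abs((d1 - d2).days)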
def create_analysis_directory(dataset_dir, analysis_id):
    analysis_dir = '{}/analyses'.format(dataset_dir)
    try:
        os.mkdir(analysis_dir)
        bioblocks_log('Created new directory \'{}\''.format(analysis_dir))
    except Exception:
        bioblocks_log(
            'Analyses directory \'{}\' already exists, skipping!'.format(analysis_dir))

    output_dir = '{}/{}'.format(analysis_dir, analysis_id)
    try:
        os.mkdir(output_dir)
        bioblocks_log('Created new directory \'{}\''.format(output_dir))
    except Exception:
        bioblocks_log(
            'Analysis directory \'{}\' already exists, skipping!'.format(output_dir))
    # Return outside the try blocks; a 'return' in a 'finally' clause would
    # silently swallow any exception raised above.
    return output_dir
def process_project(project, specimens, entry_id, args, output_dir):
    short_name = project['projectShortname']
    if (args.is_dry_run and (short_name == 'HumanMousePancreas')) or \
            not args.is_dry_run:
        bioblocks_log(
            'Getting bundles for project with shortname \'{}\''.format(short_name))
        create_directory('{}/{}'.format(output_dir, entry_id))
        create_dataset(entry_id, short_name, [])
        with open('{}/{}/full_fqids.tsv'.format(output_dir, entry_id), 'w') as full_file:
            for specimen in specimens:
                for specimen_field in specimen:
                    for specimen_value in specimen[specimen_field]:
                        if specimen_value is None:
                            bioblocks_log(
                                'Specimen field \'{}\' with value of \'none\''.
                                format(specimen_field))
                        elif specimen_field == 'id':
                            write_specimen_file(specimen_field, specimen_value,
                                                short_name, output_dir,
                                                entry_id, full_file)
                        # elif not args.is_dry_run:
                        #     write_specimen_file(specimen_field, specimen_value,
                        #                         short_name, output_dir, entry_id)
                        else:
                            bioblocks_log(
                                'Skipping field \'{}\' with value \'{}\''.
                                format(specimen_field, specimen_value))
    else:
        bioblocks_log(
            'Skipping getting bundles for project with shortname \'{}\'.'.
            format(short_name))
def patch_analysis_for_dataset(dataset, analysis_id):
    dataset_id = dataset['_id']
    dataset_etag = dataset['_etag']
    dataset_analyses = dataset['analyses']
    dataset_analyses.append(analysis_id)
    bioblocks_log('PATCHing dataset \'{}\' with analysis \'{}\''.format(
        dataset_id, analysis_id))
    r = send_patch('{}/{}'.format('dataset', dataset_id),
                   json.dumps({
                       'analyses': dataset_analyses,
                   }), {
                       'Content-type': 'application/json',
                       'If-Match': dataset_etag
                   })
    bioblocks_log('Returned status from dataset PATCH of analysis: {}'.format(r.status_code))
    if not r.ok:
        bioblocks_log(r.text)
        return dataset_etag
    else:
        return json.loads(r.text)['_etag']
def analyze_dataset(dataset, dataset_dir, MAX_CELLS_COUNT=200000):
    dataset_analyses = dataset['analyses']
    dataset_id = dataset['_id']
    matrix_location = dataset['matrixLocation']
    # dataset_id = 'f83165c5-e2ea-4d15-a5cf-33f3550bffde'
    # matrix_location = 'https://s3.amazonaws.com/dcp-matrix-service-results-prod/c34ccb0e-e4fa-4c08-8d76-84aca71dfc99.mtx.zip'
    bioblocks_log('analyzing dataset directory \'{}\' with matrix_location \'{}\''.format(
        dataset_dir, matrix_location))

    ends_with_zip = matrix_location.endswith('.zip')
    ends_with_mtx = matrix_location.endswith('mtx')
    ends_with_mtx_gz = matrix_location.endswith('mtx.gz')
    if not ends_with_zip and not ends_with_mtx and not ends_with_mtx_gz:
        bioblocks_log('Dataset \'{}\' has an invalid matrix location: {}, not running SPRING.'.format(
            dataset_id, matrix_location))
        return

    for analysis in dataset_analyses:
        if analysis['processType'] == 'SPRING':
            bioblocks_log('SPRING process already found for dataset \'{}\', skipping'.format(
                dataset_id))
            return

    if ends_with_zip or ends_with_mtx:
        # Download the matrix archive returned by the matrix service.
        zip_request = session.get(matrix_location)
        zip_bytes = zip_request.content
    else:
        # A bare .mtx.gz location is passed through unchanged.
        zip_bytes = matrix_location

    # Unzip the matrix and write it to the local file system. The archive is
    # expected to contain gzipped members (e.g. matrix.mtx.gz, genes.tsv.gz).
    tmp_dir = ''
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
        z.extractall(dataset_dir)
        for name in z.namelist():
            tmp_dir = name.split('/')[0]
            final_file_name = name.split('/', 1)[-1][:-3]  # strip '.gz'
            full_path = '{}/{}'.format(dataset_dir, name)
            create_directory('{}/matrix'.format(dataset_dir))
            with gzip.open(full_path, 'rb') as f_in:
                with open('{}/matrix/{}'.format(dataset_dir, final_file_name), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

    run_spring_analysis(dataset_dir=dataset_dir,
                        dataset_id=dataset_id,
                        dataset=dataset,
                        MAX_CELLS_COUNT=MAX_CELLS_COUNT,
                        tmp_dir=tmp_dir)
def create_hca_matrix_jobs():
    r = send_get('dataset')
    if r.ok:
        datasets = json.loads(r.text)['_items']
        for dataset in datasets:
            if 'matrixLocation' in dataset:
                bioblocks_log(
                    'Not creating job for dataset \'{}\', it already has a matrix field'
                    .format(dataset['_id']))
            else:
                bioblocks_log('Creating job for dataset \'{}\'.'.format(dataset['_id']))
                send_hca_matrix_job_request(dataset)
    else:
        bioblocks_log('Unable to get datasets from bioblocks: {}'.format(r.text))
def post_bioblocks_analysis(analysis):
    bioblocks_log('Creating analysis with analysis_id=\'{}\''.format(analysis['_id']))
    r = send_post(
        'analysis',
        json.dumps({
            '_id': analysis['_id'],
            'name': analysis['name'],
            'processType': analysis['process_type'],
        }), {'Content-type': 'application/json'})
    bioblocks_log(
        'Returned status from bioblocks analysis request: \'{}\''.format(r.status_code))
    if not r.ok:
        bioblocks_log(r.text)
    else:
        return json.loads(r.text)['_etag']
def start_analysis():
    dataset_url = 'dataset?embedded={"analyses":1}'
    r = send_get(dataset_url)
    if not r.ok:
        bioblocks_log('Error getting dataset analyses: {}'.format(r.text))
    else:
        datasets = json.loads(r.text)['_items']
        for dataset in datasets:
            dataset_id = dataset['_id']
            dataset_dir = 'files/datasets/{}'.format(dataset_id)
            if 'matrixLocation' in dataset:
                try:
                    analyze_dataset(dataset, dataset_dir)
                except Exception as e:
                    bioblocks_log('Exception found running SPRING: {}'.format(e))
                    traceback.print_exc()
            else:
                bioblocks_log('Dataset {} has no matrix, not running SPRING.'.format(dataset_id))
def create_subsampled_matrix(large_matrix_location, gene_file=None,
                             MAX_CELLS_COUNT=MAX_CELLS_COUNT):
    mtx_info = mminfo(large_matrix_location)
    num_rows = mtx_info[0]
    num_cols = mtx_info[1]
    num_cells = num_cols
    sample_rows = False
    if gene_file is not None:
        gene_list = load_genes(
            gene_file,
            delimiter='\t' if gene_file.endswith('tsv') else None,
            skip_rows=1 if gene_file.endswith('tsv') else 0)
        if num_rows == len(gene_list):
            bioblocks_log('Genes are rows, Cells are cols')
            num_cells = num_cols
            sample_rows = False
        else:
            bioblocks_log('Cells are rows, Genes are cols')
            num_cells = num_rows
            sample_rows = True
    else:
        gene_list = ['']

    if num_cells > MAX_CELLS_COUNT:
        # Only draw the sample when there are more cells than the cap;
        # random.sample raises ValueError if the sample exceeds the population.
        subsampled_indices = random.sample(range(1, num_cells + 1), MAX_CELLS_COUNT)
        subsampled_indices.sort()
        subsampled_matrix_location = '{}_sub.mtx'.format(large_matrix_location[:-4])
        with open(subsampled_matrix_location, 'w') as subsampled_matrix:
            with open(large_matrix_location) as large_matrix:
                matrix_header = large_matrix.readline()
                output_lines = []
                dims_line_seen = False
                for line in large_matrix:
                    # Skip MatrixMarket comment lines and the original
                    # dimensions line; everything after that is entry data.
                    if line.startswith('%'):
                        continue
                    if not dims_line_seen:
                        dims_line_seen = True
                        continue
                    fields = line.split(' ')
                    matrix_index = -1
                    if sample_rows and fields[0].isnumeric():
                        matrix_index = int(fields[0])
                    elif len(fields) >= 2 and fields[1].isnumeric():
                        matrix_index = int(fields[1])
                    # Binary search to test membership in the sorted sample.
                    bisect_index = bisect_left(subsampled_indices, matrix_index)
                    if bisect_index != len(subsampled_indices) and \
                            subsampled_indices[bisect_index] == matrix_index:
                        output_lines.append(line)

                # Rewrite the header with the subsampled dimensions; the entry
                # count must match the number of lines actually kept.
                num_entries = len(output_lines)
                if sample_rows:
                    dims_line = '{} {} {}\n'.format(MAX_CELLS_COUNT,
                                                    len(gene_list), num_entries)
                else:
                    dims_line = '{} {} {}\n'.format(len(gene_list),
                                                    MAX_CELLS_COUNT, num_entries)
                subsampled_matrix.write(matrix_header)
                subsampled_matrix.write(dims_line)
                for line in output_lines:
                    subsampled_matrix.write(line)
        os.remove(large_matrix_location)
        os.rename(subsampled_matrix_location, large_matrix_location)
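# A small, self-contained illustration of the sorted-sample membership test
# used by create_subsampled_matrix above (toy values, not part of the pipeline):
def _is_sampled_example():
    from bisect import bisect_left
    sample = [2, 3, 6, 8, 10]  # a sorted random sample of 1-based cell indices
    matrix_index = 6
    i = bisect_left(sample, matrix_index)
    # True exactly when matrix_index is one of the sampled cells
    return i != len(sample) and sample[i] == matrix_index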
def handle_hca_matrix_job_status(job):
    """ Process a matrix job which can either succeed, fail, or still be in progress. """
    r = session.get(url=job['link'], timeout=None)
    job_id = job['_id']
    if not r.ok:
        bioblocks_log('Job \'{}\' link returned error: {}'.format(job_id, r.text))
    else:
        result = json.loads(r.text)
        if result['status'] == 'In Progress':
            bioblocks_log('IN PROGRESS')
            bioblocks_log('Job \'{}\' is still in progress'.format(job_id))
        elif result['status'] == 'Failed':
            bioblocks_log('FAILED')
            bioblocks_log('Job \'{}\' failed with message \'{}\''.format(
                job_id, result['message']))
            delete_bioblocks_job(job)
        elif result['status'] == 'Complete':
            bioblocks_log('COMPLETE')
            matrix_location = result['matrix_location']
            bioblocks_log(
                'Job \'{}\' is complete! Storing matrix location of \'{}\''.
                format(job_id, matrix_location))
            patch_dataset_for_job(job_id, result['matrix_location'],
                                  job['associatedDataset'])
            delete_bioblocks_job(job)
        else:
            bioblocks_log('Unhandled status \'{}\' with message \'{}\''.format(
                result['status'], result['message']))
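# 'delete_bioblocks_job' is referenced above but not defined in this section.
# A minimal sketch, assuming the Eve-style bioblocks API requires an If-Match
# etag header on DELETE (hypothetical, for illustration only):
def delete_bioblocks_job_sketch(job):
    r = send_delete('{}/{}'.format('job', job['_id']),
                    {'If-Match': job['_etag']})
    if r.ok:
        bioblocks_log('Successfully deleted job \'{}\''.format(job['_id']))
    else:
        bioblocks_log('Error deleting job \'{}\': \'{}\''.format(job['_id'], r.text))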
def make_spring_subplot(E, gene_list, save_path, base_ix=None, normalize=True,
                        exclude_dominant_frac=1.0, min_counts=3, min_cells=5,
                        min_vscore_pctl=75, show_vscore_plot=False,
                        exclude_gene_names=None, num_pc=30, sparse_pca=False,
                        pca_norm=True, k_neigh=4, cell_groupings=None,
                        num_force_iter=100, output_spring=True,
                        precomputed_pca=None, gene_filter=None,
                        custom_colors=None, exclude_corr_genes_list=None,
                        exclude_corr_genes_minCorr=0.2, dist_metric='euclidean',
                        use_approxnn=False, run_doub_detector=False, dd_k=50,
                        dd_frac=5, dd_approx=True, tot_counts_final=None):
    # Avoid mutable default arguments; both dicts are mutated below.
    if cell_groupings is None:
        cell_groupings = {}
    if custom_colors is None:
        custom_colors = {}

    out = {}
    E = E.tocsc()
    if base_ix is None:
        base_ix = np.arange(E.shape[0])

    # Total counts normalization
    if tot_counts_final is None:
        tot_counts_final = E.sum(1).A.squeeze()
    out['tot_counts_final'] = tot_counts_final
    if normalize:
        E = tot_counts_norm(E, exclude_dominant_frac=exclude_dominant_frac)[0]

    if precomputed_pca is None:
        if gene_filter is None:
            # Get gene stats (above Poisson noise, i.e. V-scores)
            if (min_counts > 0) or (min_cells > 0) or (min_vscore_pctl > 0):
                gene_filter = filter_genes(E, base_ix,
                                           min_vscore_pctl=min_vscore_pctl,
                                           min_counts=min_counts,
                                           min_cells=min_cells,
                                           show_vscore_plot=show_vscore_plot)
            else:
                gene_filter = np.arange(E.shape[1])
            if len(gene_filter) == 0:
                return 'Error: No genes passed filter'
            if exclude_corr_genes_list is not None:
                gene_filter = remove_corr_genes(
                    E, gene_list, exclude_corr_genes_list, gene_filter,
                    min_corr=exclude_corr_genes_minCorr)
                if len(gene_filter) == 0:
                    return 'Error: No genes passed filter'
            # Remove user-excluded genes from consideration
            if exclude_gene_names is not None:
                keep_ix = np.array([
                    ii for ii, gix in enumerate(gene_filter)
                    if gene_list[gix] not in exclude_gene_names
                ])
                gene_filter = gene_filter[keep_ix]
                if len(gene_filter) == 0:
                    return 'Error: No genes passed filter'
        out['gene_filter'] = gene_filter

        # Run PCA. If sparse, normalize by stdev; otherwise z-score normalize.
        num_pc = min(len(gene_filter), num_pc)
        bioblocks_log('num_pc: {}'.format(num_pc))
        out['num_pc'] = num_pc
        Epca = get_pca(E[:, gene_filter], base_ix=base_ix, numpc=num_pc,
                       keep_sparse=sparse_pca, normalize=pca_norm)
        out['Epca'] = Epca
    else:
        # Use user-supplied PCA coordinates (the original left this branch
        # commented out, which made Epca undefined below).
        Epca = precomputed_pca

    # Build the kNN graph
    links, knn_graph = get_knn_graph(Epca, k=k_neigh, dist_metric=dist_metric,
                                     approx=use_approxnn)
    out['knn_graph'] = knn_graph

    if run_doub_detector:
        import doublet_detector as woublet
        doub_score, doub_score_full, doub_labels = woublet.detect_doublets(
            [], counts=tot_counts_final, doub_frac=dd_frac, k=dd_k,
            use_approxnn=dd_approx, precomputed_pca=Epca)
        out['doub_score'] = doub_score
        # Store the full score actually returned above (the original
        # referenced an undefined 'doub_score_sim' variable).
        out['doub_score_sim'] = doub_score_full

    if output_spring:
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        custom_colors['Total Counts'] = tot_counts_final
        np.savez_compressed(save_path + '/intermediates.npz', Epca=Epca,
                            gene_filter=gene_filter,
                            total_counts=tot_counts_final)
        bioblocks_log('--- SAVING PCA ---')
        np.savetxt('{}/pca.csv'.format(save_path), Epca, delimiter=',', fmt='%.3f')
        bioblocks_log('--- FINISHED SAVING PCA ---')
        if run_doub_detector:
            custom_colors['Doublet Score'] = doub_score
        if len(cell_groupings) > 0:
            save_spring_dir_sparse_hdf5(E, gene_list, save_path, list(links),
                                        custom_colors=custom_colors,
                                        cell_groupings=cell_groupings)
        else:
            save_spring_dir_sparse_hdf5(E, gene_list, save_path, list(links),
                                        custom_colors=custom_colors)

    if num_force_iter > 0:
        positions = get_force_layout(links, Epca.shape[0],
                                     n_iter=num_force_iter,
                                     edgeWeightInfluence=1, barnesHutTheta=2,
                                     scalingRatio=1, gravity=0.05,
                                     jitterTolerance=1, verbose=False)
        positions = positions / 5.0
        positions = positions - \
            np.min(positions, axis=0) - np.ptp(positions, axis=0) / 2.0
        positions[:, 0] = positions[:, 0] + 750
        positions[:, 1] = positions[:, 1] + 250
        out['coordinates'] = positions

    if output_spring:
        if num_force_iter > 0:
            np.savetxt(save_path + '/coordinates.txt',
                       np.hstack((np.arange(positions.shape[0])[:, None], positions)),
                       fmt='%i,%.5f,%.5f')
        info_dict = {}
        info_dict['Date'] = '%s' % datetime.now()
        info_dict['Nodes'] = Epca.shape[0]
        info_dict['Filtered_Genes'] = len(gene_filter)
        info_dict['Gene_Var_Pctl'] = min_vscore_pctl
        info_dict['Min_Cells'] = min_cells
        info_dict['Min_Counts'] = min_counts
        info_dict['Num_Neighbors'] = k_neigh
        info_dict['Num_PCs'] = num_pc
        info_dict['Num_Force_Iter'] = num_force_iter
        with open(save_path + '/run_info.json', 'w') as f:
            f.write(json.dumps(info_dict, indent=4, sort_keys=True))
    return out
def run_spring_preprocessing(
        # Input files
        mtx_file='matrix.mtx',
        gene_file='gene_id.csv',
        cell_labels_file=None,
        custom_colors_file=None,
        # Main dataset directory (for storing counts matrices, gene names, subplots)
        main_dir='example_dataset',
        # Subplot directory (all cells used for first subplot; subsets of cells
        # can be used to make additional subplots from within SPRING viewer)
        subplot_name='all_cells',
        # Pre-processing parameters. On our server these are specified by the
        # user; for now some (probably) reasonable defaults are hard-coded.
        cell_min_counts=0,
        gene_min_cells=3,
        gene_min_counts=3,
        gene_var_pctl=85,
        n_neighbors=5,
        n_prin_comps=30,
        n_force_iter=500,
        subsample_range=[],
        sample_rows=False,
        num_cells=0):
    # ====================================================================== #
    # Initialize some stuff
    cell_labels = {}
    custom_color_tracks = {}
    gene_sets = {}
    subplot_dir = '{}/{}'.format(main_dir, subplot_name)
    bioblocks_log('SPRING parameters: \n'
                  '\'mtx_file\': {}\n'
                  '\'gene_file\': {}\n'
                  '\'cell_labels_file\': {}\n'
                  '\'main_dir\': {}\n'
                  '\'subplot_name\': {}\n'
                  '\'subplot_dir\': {}'.format(mtx_file, gene_file,
                                               cell_labels_file, main_dir,
                                               subplot_name, subplot_dir))

    # ====================================================================== #
    # LOAD DATA
    # Load expression matrix - mtx files are supported here; code for many
    # other formats exists as well.
    mtx_info = mminfo(mtx_file)
    num_rows = mtx_info[0]
    gene_list = load_genes(gene_file,
                           delimiter='\t' if gene_file.endswith('tsv') else None,
                           skip_rows=1 if gene_file.endswith('tsv') else 0)
    E = run_matrix_sampling(num_cells=num_cells, mtx_file=mtx_file,
                            sample_size=subsample_range,
                            sample_rows=sample_rows)
    bioblocks_log('E shape: {}'.format(E.shape))

    # Find the dimension of the counts matrix that matches the number of genes.
    # If necessary, transpose the counts matrix so rows=cells and columns=genes.
    if num_rows == len(gene_list):
        E = E.T.tocsc()
    valid_gene_mask = get_valid_gene_mask(gene_list)
    gene_list = [g for iG, g in enumerate(gene_list) if valid_gene_mask[iG]]
    E = E[:, valid_gene_mask]

    # Load cell_labels (categorical variables) if file has been specified
    # if cell_labels_file is not None:
    #     cell_labels = load_custom_data(cell_labels_file)
    # Load custom_colors (continuous variables) if file has been specified
    # if custom_colors_file is not None:
    #     custom_color_tracks = load_custom_data(custom_colors_file)

    # ====================================================================== #
    # PROCESS DATA
    # Make main directory
    if not os.path.exists(main_dir):
        os.makedirs(main_dir)

    # Perform cell filtering, removing cells with less than the minimum
    # number of total counts
    total_counts = E.sum(1).A.squeeze()
    if cell_min_counts > 0:
        cell_filter = (total_counts >= cell_min_counts)
    else:
        cell_filter = (total_counts > 0)
    total_counts = total_counts[cell_filter]

    # Save cell filter
    np.save('{}/cell_filter_mask.npy'.format(main_dir), cell_filter)
    # Save gene list
    np.savetxt('{}/genes.txt'.format(main_dir), gene_list, fmt='%s')

    # Calculate stress signature: fraction of counts from mitochondrially
    # encoded genes (genes starting with "mt-" or "MT-")
    mito_ix = np.array([iG for iG, g in enumerate(gene_list)
                        if g.startswith('mt-') or g.startswith('MT-')],
                       dtype=int)
    if len(mito_ix) > 0:
        mito_frac = E[:, mito_ix][cell_filter, :].sum(
            1).A.squeeze() / total_counts.astype(float)
        custom_color_tracks['Mito. frac.'] = mito_frac

    # Normalize counts matrix
    E = tot_counts_norm(E[cell_filter, :])[0]
    # E = tot_counts_norm(E[0])[0]

    # Save counts matrix as hdf5 files for fast loading in SPRING
    save_hdf5_genes(E, gene_list, '{}/counts_norm_sparse_genes.hdf5'.format(main_dir))
    save_hdf5_cells(E, '{}/counts_norm_sparse_cells.hdf5'.format(main_dir))
    save_sparse_npz(E, '{}/counts_norm.npz'.format(main_dir))

    # Save total counts per cell
    np.savetxt('{}/total_counts.txt'.format(main_dir), total_counts)

    # Set default cell label - same for all cells
    cell_labels['Default'] = ['All cells' for i in range(E.shape[0])]

    # Calculate gene set signatures if gene sets are provided
    if len(gene_sets) > 0:
        for kk, vv in gene_sets.items():
            custom_color_tracks[kk] = average_profile(E, gene_list, vv)

    # Use truncated SVD and approximate nearest neighbors if >100,000 cells
    if E.shape[0] < 100000:
        sparse_pca = False
        use_approxnn = False
    else:
        sparse_pca = True
        use_approxnn = True

    # Run SPRING pre-processing
    out = make_spring_subplot(
        E, gene_list, subplot_dir,
        normalize=False,
        min_counts=gene_min_counts,
        min_cells=gene_min_cells,
        min_vscore_pctl=gene_var_pctl,
        num_pc=n_prin_comps,
        sparse_pca=sparse_pca,
        k_neigh=n_neighbors,
        cell_groupings=cell_labels,
        num_force_iter=n_force_iter,
        custom_colors=custom_color_tracks,
        use_approxnn=use_approxnn,
        tot_counts_final=total_counts)

    # Save pre-processing parameters
    new_params = {
        'min_reads': cell_min_counts,
        'min_cells': gene_min_cells,
        'min_counts': gene_min_counts,
        'vscore_pctl': gene_var_pctl,
        'k': n_neighbors,
        'p': n_prin_comps
    }
    print('{}/params.p'.format(subplot_dir))
    pickle.dump(new_params, open('{}/params.p'.format(subplot_dir), 'wb'))

    # Save cell filter files
    np.savetxt('{}/cell_filter.txt'.format(subplot_dir), np.arange(E.shape[0]), fmt='%i')
    np.save('{}/cell_filter.npy'.format(subplot_dir), np.arange(E.shape[0]))
def run_spring_analysis(dataset_dir, dataset_id, dataset, MAX_CELLS_COUNT, tmp_dir):
    mtx_file = '{}/matrix/matrix.mtx'.format(dataset_dir)
    gene_file = '{}/matrix/genes.tsv'.format(dataset_dir)
    bioblocks_log('mtx_file = {}'.format(mtx_file))
    mtx_info = mminfo(mtx_file)
    num_rows = mtx_info[0]
    num_cols = mtx_info[1]
    bioblocks_log(mtx_info)
    gene_list = load_genes(gene_file,
                           delimiter='\t' if gene_file.endswith('tsv') else None,
                           skip_rows=1 if gene_file.endswith('tsv') else 0)
    if num_rows == len(gene_list):
        bioblocks_log('Genes are rows, Cells are cols')
        num_cells = num_cols
        sample_rows = False
    else:
        bioblocks_log('Cells are rows, Genes are cols')
        num_cells = num_rows
        sample_rows = True

    if num_cells > MAX_CELLS_COUNT:
        bioblocks_log('mtx_file: {}'.format(mtx_file))
        create_subsampled_matrix(mtx_file, gene_file, MAX_CELLS_COUNT)
        num_cells = MAX_CELLS_COUNT

    subsample_ranges = get_cell_subsample_ranges(num_cells)
    bioblocks_log('Attempting to run SPRING with subsample ranges {}'.format(subsample_ranges))
    for subsample_range in subsample_ranges:
        analysis_id = str(uuid.uuid4())
        start_time = datetime.utcnow()
        bioblocks_log('Starting SPRING analysis \'{}\' for dataset \'{}\''.format(
            analysis_id, dataset_id))
        main_dir = '{}/analyses/{}'.format(dataset_dir, analysis_id)
        spring_load_preprocess.run_spring_preprocessing(
            mtx_file=mtx_file,
            gene_file=gene_file,
            cell_labels_file='{}/matrix/cells.tsv'.format(dataset_dir),
            main_dir=main_dir,
            subplot_name=dataset['name'],
            sample_rows=sample_rows,
            subsample_range=subsample_range,
            num_cells=num_cells)
        try:
            dataset['_etag'] = patch_matrix_info_for_dataset(dataset, mtx_info, mtx_file)
            analysis = {
                '_id': analysis_id,
                'process_type': 'SPRING',
                'name': '{} - {}'.format(dataset['name'],
                                         get_numeric_shorthand_suffix(subsample_range))
            }
            post_bioblocks_analysis(analysis)
            dataset['_etag'] = patch_analysis_for_dataset(dataset, analysis_id)
        except Exception as e:
            bioblocks_log('Error recording analysis \'{}\' for dataset \'{}\': {}'.format(
                analysis_id, dataset_id, e))
            return
        end_time = datetime.utcnow()
        bioblocks_log('Finished SPRING analysis \'{}\' for dataset \'{}\'. Duration: {}'.format(
            analysis_id, dataset_id, end_time - start_time))

    # Clean up the extracted matrix files once all subsample ranges have run;
    # removing them inside the loop would break subsequent iterations.
    try:
        # bioblocks_log('Compressing file: {}'.format(mtx_file))
        # with open(mtx_file, 'rb') as f_in:
        #     with gzip.open('{}.gz'.format(mtx_file), 'wb') as f_out:
        #         shutil.copyfileobj(f_in, f_out)
        # bioblocks_log('Finished compressing file: {}'.format(mtx_file))
        delete_directory('{}/{}'.format(dataset_dir, tmp_dir))
        os.remove(mtx_file)
    except Exception as e:
        bioblocks_log('Error with cleanup of matrix file: {}'.format(e))
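# 'get_cell_subsample_ranges' is referenced above but not defined in this
# section. A minimal sketch under the assumption that it returns a list of
# cell-count caps ending at the full dataset size (hypothetical values, for
# illustration only):
def get_cell_subsample_ranges_sketch(num_cells):
    ranges = [r for r in (1000, 10000, 100000) if r < num_cells]
    return ranges + [num_cells]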
def delete_file(old_file):
    try:
        # os.remove is the correct call; 'os.rm' does not exist.
        os.remove(old_file)
        bioblocks_log('Deleted file \'{}\''.format(old_file))
    except Exception:
        bioblocks_log('File \'{}\' doesn\'t exist, skipping!'.format(old_file))