import argparse

import htmlmin

import geardb


def main():
    parser = argparse.ArgumentParser(description='Updates the long description (ldesc) of a dataset from an HTML file')
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read')
    parser.add_argument('-id', '--dataset_id', type=str, required=True, help='ID of the dataset to update')
    args = parser.parse_args()

    # Minify the HTML before storing it
    with open(args.input_file) as fh:
        comments = htmlmin.minify(fh.read(), remove_empty_space=True)

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    qry = "UPDATE dataset SET ldesc = %s WHERE id = %s"
    cursor.execute(qry, (comments, args.dataset_id))

    cursor.close()
    cnx.commit()

    print("Finished.\nDataset updated.")
import uuid

import geardb


def main():
    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    query = """
        SELECT id
          FROM layout
         WHERE share_id IS NULL
    """
    cursor.execute(query)

    ids = [row[0] for row in cursor]

    qry = """
        UPDATE layout
           SET share_id = %s
         WHERE id = %s
    """

    for id in ids:
        # Use just the first block of the UUID
        share_id = str(uuid.uuid4()).split('-')[0]
        print(share_id)
        cursor.execute(qry, (share_id, id))

    cnx.commit()
    cursor.close()
    cnx.close()
import geardb


def main():
    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    # perhaps add a basic SELECT query here?

    cursor.close()
    cnx.close()
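# A minimal sketch of what such a SELECT could look like, reusing dataset
# table columns seen in the other scripts in this collection. The function
# name and the specific query are illustrative assumptions; the stub above
# does not say what it should fetch.
def example_select(cursor):
    """Illustrative only: print a few dataset rows."""
    qry = "SELECT id, title FROM dataset LIMIT 5"
    cursor.execute(qry)

    for (dataset_id, title) in cursor:
        print("{0}\t{1}".format(dataset_id, title))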
import os
import shutil
import sys

import numpy as np
import scanpy as sc

import geardb


def main():
    cnx = geardb.Connection()
    cursor = cnx.get_cursor()
    dataset_ids = get_dataset_id_list(cursor)

    for dataset_id in dataset_ids:
        print("Processing dataset ID: {0}".format(dataset_id))
        dataset = geardb.Dataset(id=dataset_id, has_h5ad=1)
        h5ad_path = dataset.get_file_path()

        if os.path.exists(h5ad_path):
            print("\tFile found: {0}".format(h5ad_path))
        else:
            print("\tFile not found, skipping: {0}".format(h5ad_path))
            continue

        violin_image_path = h5ad_path.replace('.h5ad', '.prelim_violin.png')
        scatter_image_path = h5ad_path.replace('.h5ad', '.prelim_n_genes.png')

        # Skip if the images are already there
        if os.path.exists(violin_image_path) and os.path.exists(scatter_image_path):
            print("\tImages already found, skipping")
            continue

        ana = geardb.Analysis(dataset_id=dataset_id, type='primary')
        adata = ana.get_adata()

        print("\tGenerating figures")

        # Add the total counts per cell as an observation annotation. The .A1
        # is only needed when X is sparse -- it converts the summed matrix to
        # a dense array.
        try:
            adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1
        except AttributeError:
            adata.obs['n_counts'] = np.sum(adata.X, axis=1)

        sc.pp.filter_cells(adata, min_genes=3)
        sc.pp.filter_genes(adata, min_cells=300)

        sc.pl.violin(adata, ['n_genes', 'n_counts'], jitter=0.4,
                     multi_panel=True, save="_prelim_violin.png")
        sc.pl.scatter(adata, x='n_counts', y='n_genes', save="_prelim_n_genes.png")

        # Move the figure files (written under /tmp) into place next to the h5ad
        shutil.move("/tmp/violin_prelim_violin.png", violin_image_path)
        shutil.move("/tmp/scatter_prelim_n_genes.png", scatter_image_path)

    cursor.close()
    cnx.close()
    sys.exit()
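# get_dataset_id_list() is called above but defined elsewhere in the original
# script. A minimal sketch consistent with its use; the WHERE clause is an
# assumption based on the dataset columns used in the other scripts here.
def get_dataset_id_list(cursor):
    """Return IDs of datasets expected to have h5ad files."""
    qry = "SELECT id FROM dataset WHERE has_h5ad = 1 AND marked_for_removal = 0"
    cursor.execute(qry)
    return [row[0] for row in cursor]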
import os
import shutil

import geardb


def main():
    schematic_dir = 'img/dataset_schematics'
    update_sql = "UPDATE dataset SET schematic_image = %s WHERE id = %s"
    update_list = list()
    count = 0

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    qry = "SELECT id, schematic_image FROM dataset WHERE schematic_image IS NOT NULL"
    cursor.execute(qry)

    for (id, schematic_image) in cursor:
        # Skip epiviz datasets and any ' ' values (not sure how that one happened)
        if 'epiviz' not in schematic_image and len(schematic_image) > 2:
            print("Updating dataset: ", id)
            filename = schematic_image.rsplit('/', 1)[1]
            new_filepath = schematic_dir + "/" + filename

            # Queue the update (parameterized to avoid quoting issues)
            update_list.append((new_filepath, id))

            old_path = '../www/' + schematic_image
            new_path = '../www/' + new_filepath

            # Move the schematic file if it's still in the old location
            if os.path.isfile(old_path):
                print("\tMoving file to: ", new_path)
                shutil.move(old_path, new_path)

    # Update the dataset table
    for (new_filepath, dataset_id) in update_list:
        cursor.execute(update_sql, (new_filepath, dataset_id))
        count += 1

    cnx.commit()
    cursor.close()
    print("Finished.\n{0} datasets were updated.".format(count))
import argparse

import geardb


def main():
    parser = argparse.ArgumentParser(
        description='Annotation loader for the gEAR')
    parser.add_argument('-i', '--input_tab', type=str, required=True, help='Path to an input file to be read')
    parser.add_argument('-o', '--output_h5ad', type=str, required=True, help='Path to the output file to be created')
    args = parser.parse_args()

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    cursor.close()
    cnx.close()
import argparse

import scanpy as sc

import geardb


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read.')
    parser.add_argument('-org', '--organism_id', type=int, required=True, help='Organism ID')
    args = parser.parse_args()

    adata = sc.read(args.input_file)
    qry_ids = adata.var.index.tolist()
    id_ref = dict()

    # connect to db
    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    # Index the gene table for this organism, keyed on Ensembl release
    qry = "SELECT ensembl_id, ensembl_release FROM gene WHERE organism_id = %s"
    cursor.execute(qry, (args.organism_id,))

    for (id, release_num) in cursor:
        if release_num not in id_ref:
            id_ref[release_num] = dict()

        id_ref[release_num][id] = 1

    best_release = None
    best_count = 0

    for release_num in id_ref:
        cov_count = get_count(qry_ids, id_ref[release_num])
        print("release:{0}\tcount:{1}".format(release_num, cov_count))

        if cov_count > best_count:
            best_count = cov_count
            best_release = release_num

    print("The best release is {0} with {1} of {2} genes unaccounted for.".format(
        best_release, len(qry_ids) - best_count, len(qry_ids)))

    cursor.close()
    cnx.close()
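# get_count() is referenced above but defined elsewhere in the original
# script; a minimal sketch consistent with how it is called:
def get_count(qry_ids, ref_ids):
    """Return how many of the query IDs appear in the reference ID dict."""
    return sum(1 for qid in qry_ids if qid in ref_ids)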
import argparse
import sys

import geardb


def main():
    parser = argparse.ArgumentParser(
        description='Changes ownership of datasets or profiles')
    parser.add_argument('-no', '--new_owner_id', type=int, required=True, help='New owner guser.ID')
    parser.add_argument('-id', '--include_displays', action='store_true', help='Changes configured displays also')
    parser.add_argument('-d', '--dataset_id', type=str, required=False, help='Pass a single dataset.id')
    parser.add_argument('-p', '--profile_id', type=str, required=False, help='Pass a profile/layout id')
    args = parser.parse_args()

    if not args.dataset_id and not args.profile_id:
        print("ERROR: You must specify either --dataset_id or --profile_id (or both)")
        sys.exit(1)

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    if args.dataset_id:
        reown_dataset(cursor, args.dataset_id, args.new_owner_id, args.include_displays)

    if args.profile_id:
        reown_profile(cursor, args.profile_id, args.new_owner_id, args.include_displays)

    cnx.commit()
    cursor.close()
    cnx.close()
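# reown_dataset() and reown_profile() are defined elsewhere in the original
# script. A sketch of the dataset case only, under the assumption that
# configured displays live in a dataset_display table with a user_id column
# (naming inferred, not confirmed by these scripts):
def reown_dataset(cursor, dataset_id, new_owner_id, include_displays):
    """Reassign a dataset (and optionally its displays) to a new owner."""
    cursor.execute("UPDATE dataset SET owner_id = %s WHERE id = %s",
                   (new_owner_id, dataset_id))

    if include_displays:
        cursor.execute("UPDATE dataset_display SET user_id = %s WHERE dataset_id = %s",
                       (new_owner_id, dataset_id))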
import argparse

import geardb


def main():
    parser = argparse.ArgumentParser(description='Lists datasets in the gEAR')
    parser.add_argument('-uid', '--user_id', type=int, required=False, help='Filter by user ID')
    args = parser.parse_args()

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    qry = """
        SELECT d.id, d.title, d.dtype, g.email, d.pubmed_id, d.geo_id,
               d.is_public, d.date_added
          FROM dataset d
               JOIN guser g ON d.owner_id = g.id
         WHERE d.marked_for_removal = 0
    """
    qry_args = []

    if args.user_id:
        qry += " AND d.owner_id = %s"
        qry_args.append(args.user_id)

    qry += " ORDER BY date_added DESC"
    cursor.execute(qry, qry_args)

    for (dataset_id, title, dtype, email, pubmed_id, geo_id, is_public, date_added) in cursor:
        visibility = 'Public' if is_public else 'Private'
        print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
            dataset_id, title, dtype, email, pubmed_id, geo_id, visibility, date_added))

    cursor.close()
    cnx.close()
import argparse

import anndata as ad
import pandas as pd
import scanpy as sc

import geardb


def main():
    parser = argparse.ArgumentParser(
        description='See the script name? It does that.')
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an h5ad file to be read.')
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Output file to be written.')
    parser.add_argument('-org', '--organism', type=int, required=True, help='Organism ID.')
    parser.add_argument('-r', '--read_only', action='store_true', help='If set, only print and do not write to output.')
    args = parser.parse_args()

    adata = sc.read(args.input_file)
    (_, n_genes) = adata.shape

    ensembl_releases = [84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94]

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    query = """
        SELECT ensembl_id, gene_symbol
          FROM gene
         WHERE organism_id = %s
           AND ensembl_release = %s
    """

    best_release = None
    best_count = 0
    best_df = None

    # There are some cases with duplicate gene symbols. We do not want to
    # keep these, so we drop them here in order to preserve the obs shape
    # when joining Ensembl IDs and reassigning var later on.
    duplicated_genes = adata.var.index.duplicated()
    print('############################################################')
    print(f"File: {args.input_file}")
    print(f"Duplicated genes (dropped): {duplicated_genes.sum()}")
    adata = adata[:, ~duplicated_genes]
    var = adata.var

    for release in ensembl_releases:
        print("INFO: comparing with ensembl release: {0} ... ".format(release), end='')
        cursor.execute(query, (args.organism, release))
        df = pd.DataFrame(cursor.fetchall(), columns=cursor.column_names)

        # The query can return different Ensembl IDs for a gene symbol; drop
        # duplicates so we keep only one Ensembl ID per gene symbol.
        df = df.drop_duplicates('gene_symbol')
        df = df.set_index('gene_symbol')
        merged_df = var.join(df, how='inner')
        (row_count, _) = merged_df.shape
        print(" found {0} matches".format(row_count))

        if row_count > best_count:
            best_count = row_count
            best_release = release
            best_df = merged_df

    print(f"Best release: {best_release}")
    print(f"Matches for release: {best_count}")
    print(f"Original # genes: {n_genes}")
    print(f"Genes lost: {n_genes - best_count}")

    # Now that we have our best release and Ensembl IDs for those gene
    # symbols, filter the AnnData object to keep only the gene symbols we
    # have IDs for.
    gene_filter = var.index.isin(best_df.index)
    adata = adata[:, gene_filter]

    # If the data already had a 'gene_symbol' column, rename it
    if 'gene_symbol' in best_df.columns:
        print("WARN: Found gene_symbol column already in dataset, renaming to gene_symbol_original")
        best_df = best_df.rename(columns={"gene_symbol": "gene_symbol_original"})

    ensembl_id_var = (best_df.reset_index()
                             .rename(columns={"index": "gene_symbol"})
                             .set_index('ensembl_id'))

    # Create a new AnnData object because of trouble getting
    # adata.var = merged_var to persist
    adata_with_ensembl_ids = ad.AnnData(adata.X, obs=adata.obs, var=ensembl_id_var)

    print('VAR\n')
    print(adata_with_ensembl_ids.var.head())
    print("OBS\n")
    print(adata_with_ensembl_ids.obs.head())

    if not args.read_only:
        adata_with_ensembl_ids.write(args.output_file)

    print('############################################################')
import argparse

import geardb


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read.')
    parser.add_argument('-org', '--organism_id', type=int, required=True, help='Organism ID')
    args = parser.parse_args()

    qry_ids = list()
    id_ref = dict()
    line_num = 0

    # Collect the IDs from the first column, skipping the header line
    for line in open(args.input_file):
        line = line.rstrip()
        line_num += 1

        if line_num > 1:
            cols = line.split("\t")
            qry_ids.append(cols[0])

    # connect to db
    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    key_type = get_key_type(args.input_file)

    # Index the gene table for this organism, keyed on Ensembl release
    qry = "SELECT {0}, ensembl_release FROM gene WHERE organism_id = %s".format(key_type)
    cursor.execute(qry, (args.organism_id,))

    for (id, release_num) in cursor:
        if release_num not in id_ref:
            id_ref[release_num] = dict()

        id_ref[release_num][id] = 1

    best_release = None
    best_count = 0
    best_missing_count = 0

    for release_num in sorted(id_ref):
        cov_count, empty_count = get_count(qry_ids, id_ref[release_num])
        print("release:{0}\tcount:{1}".format(release_num, cov_count))

        if cov_count > best_count:
            best_count = cov_count
            best_release = release_num
            best_missing_count = empty_count

    print("The best release is {0} with {1} of {2} genes unaccounted for. Of these, {3} were empty.".format(
        best_release, len(qry_ids) - best_count, len(qry_ids), best_missing_count))

    cursor.close()
    cnx.close()
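# get_key_type() and get_count() are defined elsewhere in the original script.
# Minimal sketches consistent with how they are called above; the ENS-prefix
# heuristic in get_key_type is an assumption:
def get_key_type(input_file):
    """Guess whether the file's first column holds Ensembl IDs or gene symbols."""
    with open(input_file) as fh:
        fh.readline()  # skip the header line
        first_id = fh.readline().split("\t")[0]

    return 'ensembl_id' if first_id.startswith('ENS') else 'gene_symbol'


def get_count(qry_ids, ref_ids):
    """Return (count of IDs found in the reference, count of empty IDs)."""
    cov_count = sum(1 for qid in qry_ids if qid in ref_ids)
    empty_count = sum(1 for qid in qry_ids if not qid)
    return cov_count, empty_count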
import argparse
import sys

import geardb


def main():
    parser = argparse.ArgumentParser(
        description='See the script name? It does that.')
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created')
    parser.add_argument('-org', '--organism', type=int, required=True, help='Organism ID to use')
    parser.add_argument('-er', '--ensembl_release', type=int, required=True, help='Ensembl release ID to use')
    args = parser.parse_args()

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    query = """
        SELECT ensembl_id, gene_symbol
          FROM gene
         WHERE organism_id = %s
           AND ensembl_release = %s
    """

    # Build a gene symbol -> Ensembl ID index, keeping the first ID seen
    gs_idx = dict()
    duplicate_gs = 0
    gs_not_found = 0

    cursor.execute(query, (args.organism, args.ensembl_release))

    for (ensembl_id, gs) in cursor:
        if gs not in gs_idx:
            gs_idx[gs] = ensembl_id
        else:
            duplicate_gs += 1

    print("INFO: There were {0} instances where a gene symbol was associated with more than one Ensembl ID".format(duplicate_gs), file=sys.stderr)

    cursor.close()
    cnx.close()

    line_num = 0
    ofh = open(args.output_file, 'w')

    for line in open(args.input_file):
        line_num += 1
        line = line.rstrip()

        # Copy the header line through, with the new leading column
        if line_num == 1:
            print("Ensembl_ID\t" + line, file=ofh)
            continue

        cols = line.split("\t")

        if cols[0] in gs_idx:
            cols.insert(0, gs_idx[cols[0]])
            print("\t".join(cols), file=ofh)
        else:
            print("Gene not found: ({0})".format(cols[0]))
            gs_not_found += 1

    ofh.close()

    print("INFO: There were {0} instances of rows skipped in output because their gene symbol wasn't found in the index".format(gs_not_found), file=sys.stderr)
import argparse
import gzip
import os
import re
import sys

from Bio import SeqIO

import geardb


def main():
    parser = argparse.ArgumentParser(
        description='Annotation loader for the gEAR')
    parser.add_argument('-i', '--input_dir', type=str, required=True, help='Path to an input directory to be read')
    parser.add_argument('-id', '--organism_id', type=int, required=True, help='Database row ID for the organism being updated')
    parser.add_argument('-r', '--release_number', type=int, required=True, help='The number portion of the Ensembl release ID')
    args = parser.parse_args()

    files = os.listdir(args.input_dir)

    # For use when debugging
    DO_DB_LOAD = True

    if len(files) == 0:
        raise Exception("ERROR: No input files matching the expected naming convention found")

    skipped_biotypes = ['misc_RNA', 'pseudogene', 'ribozyme']

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    go_idx = get_go_index(cursor)

    gene_insert_qry = """
        INSERT INTO gene (ensembl_id, ensembl_version, ensembl_release, genbank_acc, organism_id,
                          molecule, start, stop, gene_symbol, product, biotype)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    go_insert_qry = """
        INSERT INTO gene_go_link (gene_id, go_id)
        VALUES (%s, %s)
    """

    dbxref_insert_qry = """
        INSERT INTO gene_dbxref (gene_id, dbxref)
        VALUES (%s, %s)
    """

    for file in sorted(files):
        if file.endswith('.dat.gz'):
            genes_inserted = 0
            fpath = "{0}/{1}".format(args.input_dir, file)
            print("Processing file: {0} ... ".format(fpath), file=sys.stderr, end='')

            # Each of the file names across organisms so far has something like
            # '.chromosome.X.' in the name. Use this to get a molecule name.
            m = re.search(r"\.(chromosome|nonchromosomal)\.(\w+)", file)

            if m is None:
                raise Exception("Error: failed to pull chromosome name from file ({0}). Expected to contain '.chromosome.X'".format(file))

            if m.group(1) == "chromosome":
                chr_name = m.group(2)
            else:
                chr_name = "nonchromosomal"

            with gzip.open(fpath, 'rt') as fh:
                for gb_record in SeqIO.parse(fh, "genbank"):
                    annotations = process_gb_record(gb_record)

                    for ann in annotations:
                        if 'genbank_acc' not in ann.other_attributes:
                            ann.other_attributes['genbank_acc'] = None

                        # make sure the product name isn't too long
                        if ann.product_name is not None and len(ann.product_name) > 255:
                            ann.product_name = ann.product_name[:255]

                        # make sure the gene symbol isn't too long
                        if ann.gene_symbol is not None and len(ann.gene_symbol) > 20:
                            ann.gene_symbol = ann.gene_symbol[:20]

                        if DO_DB_LOAD:
                            cursor.execute(gene_insert_qry, (
                                ann.other_attributes['ensembl_id'],
                                ann.other_attributes['ensembl_version'],
                                args.release_number,
                                ann.other_attributes['genbank_acc'],
                                args.organism_id,
                                chr_name,
                                ann.other_attributes['loc']['fmin'],
                                ann.other_attributes['loc']['fmax'],
                                ann.gene_symbol,
                                ann.product_name,
                                ann.other_attributes['biotype']))
                            gene_id = cursor.lastrowid
                            genes_inserted += 1

                        # The Ensembl records have many, many verified duplications
                        # in the annotation. Keep this list so we only load one of
                        # each.
                        go_ids_inserted = list()

                        for go_annot in ann.go_annotations:
                            if go_annot.go_id not in go_ids_inserted:
                                if DO_DB_LOAD:
                                    cursor.execute(go_insert_qry, (gene_id, "GO:{0}".format(go_annot.go_id)))

                                go_ids_inserted.append(go_annot.go_id)

                        dbxrefs_inserted = list()

                        for dbxref in ann.dbxrefs:
                            label = "{0}:{1}".format(dbxref.db, dbxref.identifier)

                            if label not in dbxrefs_inserted:
                                if DO_DB_LOAD:
                                    cursor.execute(dbxref_insert_qry, (gene_id, label))

                                dbxrefs_inserted.append(label)

            print("{0} genes added".format(genes_inserted), file=sys.stderr)
            cnx.commit()

    cursor.close()
    cnx.close()
def save_to_mysql(self, status=None):
    """
    Saves metadata to the gEAR MySQL table 'dataset'. If present, also saves tags.

    Notes:
        is_public = 0
            All datasets are saved as private. Once the upload is complete, the
            user can change the dataset to public in the dataset manager.
        load_status = 'pending'
            All datasets are saved as 'pending'.
    """
    if self.metadata is None:
        raise Exception("No values to evaluate. Please load a metadata file first.")

    if status is None:
        status = 'pending'

    df = self.metadata

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    dataset_uid = get_value_from_df(df, 'dataset_uid')
    owner_id = str(get_value_from_df(df, 'owner_id'))
    dataset_title = get_value_from_df(df, 'title')

    # Get organism gEAR ID using taxon_id
    organism_id = None
    organism_taxid = get_value_from_df(df, 'sample_taxid')
    organism_qry = "SELECT id FROM organism WHERE taxon_id = %s"
    cursor.execute(organism_qry, (str(organism_taxid),))

    for (id,) in cursor:
        organism_id = id

    geo_id = str(get_value_from_df(df, 'geo_accession')).strip()

    # Make all datasets private if not specified in the metadata
    try:
        is_public = get_value_from_df(df, 'is_public')
    except Exception:
        is_public = 0

    ldesc = get_value_from_df(df, 'summary')
    dtype = get_value_from_df(df, 'dataset_type')
    schematic_image = get_value_from_df(df, 'schematic_image')
    share_uid = get_value_from_df(df, 'share_uid')
    default_data_format = 'raw'
    has_h5ad = 1
    pubmed_id = str(get_value_from_df(df, 'pubmed_id')).strip()

    # Users entering multiple PubMed IDs will cause failure. Take the first
    # one instead and append the rest to the long description.
    pubmed_id = pubmed_id.replace(' ', ',')
    pubmed_ids = [i for i in pubmed_id.split(',') if len(i) > 3]
    pubmed_id = pubmed_ids.pop(0)

    if len(pubmed_ids):
        ldesc += "<br>Additional PubMed IDs: {0}".format(', '.join(pubmed_ids))

    platform_id = get_value_from_df(df, 'platform_id')
    instrument_model = get_value_from_df(df, 'instrument_model')
    library_selection = get_value_from_df(df, 'library_selection')
    library_source = get_value_from_df(df, 'library_source')
    library_strategy = get_value_from_df(df, 'library_strategy')
    contact_email = get_value_from_df(df, 'contact_email')
    contact_institute = get_value_from_df(df, 'contact_institute')
    contact_name = get_value_from_df(df, 'contact_name')
    annotation_source = get_value_from_df(df, 'annotation_source')
    annotation_release = get_value_from_df(df, 'annotation_release_number')
    default_plot_type = get_value_from_df(df, 'default_plot_type')

    add_dataset_sql = """
        INSERT INTO dataset (id, owner_id, title, organism_id, pubmed_id, geo_id, is_public, ldesc,
                             date_added, dtype, schematic_image, share_id, math_default, load_status,
                             has_h5ad, platform_id, instrument_model, library_selection, library_source,
                             library_strategy, contact_email, contact_institute, contact_name,
                             annotation_source, annotation_release, plot_default)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, NOW(), %s, %s, %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    # Insert dataset info to database
    try:
        cursor.execute(add_dataset_sql, (dataset_uid, owner_id, dataset_title, organism_id,
                                         pubmed_id, geo_id, is_public, ldesc, dtype,
                                         schematic_image, share_uid, default_data_format,
                                         status, has_h5ad, platform_id, instrument_model,
                                         library_selection, library_source, library_strategy,
                                         contact_email, contact_institute, contact_name,
                                         annotation_source, annotation_release,
                                         default_plot_type))
        cnx.commit()
    except mysql.connector.Error as err:
        raise Exception("Failed to insert metadata: {0}".format(err))

    # Handle any tags the user might have included
    tags = get_value_from_df(df, 'tags')

    if tags is not None and is_na(tags) is False:
        tag_list = []

        if isinstance(tags, str):
            if ',' in tags:
                raw_tags = tags.split(', ')
            else:
                raw_tags = [tags]
        elif isinstance(tags, list):
            raw_tags = tags

        # Ensure duplicates are removed
        for tag in raw_tags:
            if tag not in tag_list:
                tag_list.append(tag)

        # Get the list of tags already in gEAR
        qry_get_tags = "SELECT label, id FROM tag"
        cached_tags = {}
        cursor.execute(qry_get_tags)

        for row in cursor:
            cached_tags[row[0].lower()] = row[1]

        add_tag_sql = """
            INSERT INTO tag (label)
            VALUES (%s)
        """
        add_datasettag_sql = """
            INSERT INTO dataset_tag (tag_id, dataset_id)
            VALUES (%s, %s)
        """

        for tag in tag_list:
            # Check if the new tag is already in the database
            if tag.lower() not in cached_tags:
                # New tag. Add it to the database and keep its ID
                cursor.execute(add_tag_sql, (tag,))
                cnx.commit()
                tag_id = cursor.lastrowid
            else:
                # Tag exists. Get its ID
                tag_id = cached_tags[tag.lower()]

            cursor.execute(add_datasettag_sql, (tag_id, dataset_uid))
            cnx.commit()

    cursor.close()
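# A sketch of how this method is presumably driven by the surrounding loader
# class. The class and method names below are illustrative assumptions, not
# from the original file:
#
#   metadata = Metadata()                        # hypothetical wrapper class
#   metadata.load_file('dataset_metadata.xlsx')  # hypothetical metadata loader
#   metadata.save_to_mysql(status='completed')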