Example #1
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-id',
                        '--dataset_id',
                        type=str,
                        required=True,
                        help='ID of the dataset to update')
    args = parser.parse_args()

    with open(args.input_file) as fh:
        comments = htmlmin.minify(fh.read(), remove_empty_space=True)

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    qry = "UPDATE dataset SET ldesc = %s WHERE id = %s"
    cursor.execute(qry, (comments, args.dataset_id))

    cnx.commit()
    cursor.close()
    cnx.close()
    print("Finished.\nDataset updated.")
Example #2
def main():
    cnx = geardb.Connection()

    cursor = cnx.get_cursor()
    query = """
        SELECT id
        FROM layout
        WHERE share_id is NULL
    """

    cursor.execute(query)

    ids = list()
    for row in cursor:
        ids.append(row[0])

    for layout_id in ids:
        # Get just the first block of the UUID
        share_id = str(uuid.uuid4()).split('-')[0]
        print(share_id)
        qry = """
            UPDATE layout
            SET share_id = %s
            WHERE id = %s
        """
        cursor.execute(qry, (share_id, layout_id))
        cnx.commit()

    cursor.close()
    cnx.close()
Example #3
def main():
    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    # perhaps add a basic SELECT query here?
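
    # A minimal illustrative query (an assumption -- this example leaves the
    # step blank): count the rows in the dataset table.
    cursor.execute("SELECT COUNT(*) FROM dataset")
    (dataset_count,) = cursor.fetchone()
    print("Datasets in table: {0}".format(dataset_count))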

    cursor.close()
    cnx.close()
Example #4
def main():
    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    dataset_ids = get_dataset_id_list(cursor)

    for dataset_id in dataset_ids:
        print("Processing dataset ID: {0}".format(dataset_id))
        dataset = geardb.Dataset(id=dataset_id, has_h5ad=1)
        h5ad_path = dataset.get_file_path()

        if os.path.exists(h5ad_path):
            print("\tFile found: {0}".format(h5ad_path))
        else:
            print("\tFile not found, skipping: {0}".format(h5ad_path))
            continue

        violin_image_path = h5ad_path.replace('.h5ad', '.prelim_violin.png')
        scatter_image_path = h5ad_path.replace('.h5ad', '.prelim_n_genes.png')

        # Skip if the images are already there
        if os.path.exists(violin_image_path) and os.path.exists(scatter_image_path):
            print("\tImages already found, skipping")
            continue

        ana = geardb.Analysis(dataset_id=dataset_id, type='primary')
        adata = ana.get_adata()

        print("\tGenerating figures")

        # the ".A1" is only necessary when X is sparse -- it flattens the summed matrix into a dense 1-D array
        try:
            # add the total counts per cell as observations-annotation to adata
            adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1

        except AttributeError:
            # add the total counts per cell as observations-annotation to adata
            adata.obs['n_counts'] = np.sum(adata.X, axis=1)

        # Basic QC: drop cells expressing fewer than 3 genes and genes seen
        # in fewer than 300 cells
        sc.pp.filter_cells(adata, min_genes=3)
        sc.pp.filter_genes(adata, min_cells=300)

        axs = sc.pl.violin(adata, ['n_genes', 'n_counts'],
                           jitter=0.4, multi_panel=True, save="_prelim_violin.png")

        ax = sc.pl.scatter(adata, x='n_counts', y='n_genes', save="_prelim_n_genes.png")

        # move files written to tmp
        shutil.move("/tmp/violin_prelim_violin.png", violin_image_path)
        shutil.move("/tmp/scatter_prelim_n_genes.png", scatter_image_path)

    cursor.close()
    cnx.close()

    sys.exit()
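
The get_dataset_id_list() helper isn't shown in this listing; a minimal sketch consistent with the call above (the has_h5ad filter is an assumption) might be:

def get_dataset_id_list(cursor):
    # Hypothetical sketch: collect the IDs of datasets expected to have an
    # H5AD file on disk.
    cursor.execute("SELECT id FROM dataset WHERE has_h5ad = 1")
    return [row[0] for row in cursor]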
Example #5
def main():

    schematic_dir = 'img/dataset_schematics'
    update_sql = "UPDATE dataset SET schematic_image = %s WHERE id = %s"
    update_list = list()
    count = 0

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    qry = "SELECT id, schematic_image FROM dataset WHERE schematic_image IS NOT NULL"
    cursor.execute(qry)
    for (id, schematic_image) in cursor:
        #skip epiviz datasets and any ' ' values (not sure how that one happened)
        if 'epiviz' not in schematic_image and len(schematic_image) > 2:
            print("Updating dataset: ", id)
            filename = schematic_image.rsplit('/', 1)[1]

            new_filepath = schematic_dir + "/" + filename

            # Add update parameters for this dataset
            update_list.append((new_filepath, id))

            # Move schematic to new location
            old_path = '../www/' + schematic_image
            new_path = '../www/' + new_filepath

            #Move the schematic file if it's still in the old location
            if os.path.isfile(old_path):
                print("\tMoving file to: ", new_path)
                shutil.move(old_path, new_path)

    # Update the dataset table
    for params in update_list:
        cursor.execute(update_sql, params)
        cnx.commit()

        count += 1

    cursor.close()
    cnx.close()
    print("Finished.\n{0} datasets were updated.".format(count))
Example #6
def main():
    parser = argparse.ArgumentParser(
        description='Annotation loader for the gEAR')
    parser.add_argument('-i',
                        '--input_tab',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o',
                        '--output_h5ad',
                        type=str,
                        required=True,
                        help='Path to the output file to be created')

    args = parser.parse_args()

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    cursor.close()
    cnx.close()
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read.')
    parser.add_argument('-org', '--organism_id', type=int, required=True, help='Organism ID')
    args = parser.parse_args()

    adata = sc.read(args.input_file)
    qry_ids = adata.var.index.tolist()
    id_ref = dict()

    # connect to db
    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    # get organism to filter out gene table
    qry = "SELECT ensembl_id, ensembl_release FROM gene WHERE organism_id = %s"
    cursor.execute(qry, (args.organism_id,))

    for (id, release_num) in cursor:
        if release_num not in id_ref:
            id_ref[release_num] = dict()

        id_ref[release_num][id] = 1

    best_release = None
    best_count = 0

    for release_num in id_ref:
        cov_count = get_count(qry_ids, id_ref[release_num])
        print("release:{0}\tcount:{1}".format(release_num, cov_count))

        if cov_count > best_count:
            best_count = cov_count
            best_release = release_num

    print("The best release is {0} with {1} of {2} genes unaccounted for.".format(
        best_release, len(qry_ids) - cov_count, len(qry_ids)))

    cursor.close()
    cnx.close()
Example #8
def main():
    parser = argparse.ArgumentParser( description='Changes ownership of datasets or profiles')
    parser.add_argument('-no', '--new_owner_id', type=int, required=True, help='New owner guser.ID' )
    parser.add_argument('-id', '--include_displays', action='store_true', help='Changes configured displays also' )
    parser.add_argument('-d', '--dataset_id', type=str, required=False, help='Pass a single dataset.id' )
    parser.add_argument('-p', '--profile_id', type=str, required=False, help='Pass a profile/layout id' )
    args = parser.parse_args()

    if not args.dataset_id and not args.profile_id:
        print("ERROR: You must specify either --dataset_id or --profile_id (or both)")
        sys.exit(1)

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    if args.dataset_id:
        reown_dataset(cursor, args.dataset_id, args.new_owner_id, args.include_displays)

    if args.profile_id:
        reown_profile(cursor, args.profile_id, args.new_owner_id, args.include_displays)

    cnx.commit()
    cursor.close()
    cnx.close()
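
The reown_dataset() and reown_profile() helpers aren't included in this listing; a minimal sketch of the first, consistent with the call above (the dataset_display table and its columns are assumptions), might be:

def reown_dataset(cursor, dataset_id, new_owner_id, include_displays):
    # Hypothetical sketch: reassign the dataset, and optionally its saved
    # displays, to the new owner.
    cursor.execute("UPDATE dataset SET owner_id = %s WHERE id = %s",
                   (new_owner_id, dataset_id))

    if include_displays:
        cursor.execute("UPDATE dataset_display SET user_id = %s WHERE dataset_id = %s",
                       (new_owner_id, dataset_id))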
Example #9
def main():
    parser = argparse.ArgumentParser(description='Lists datasets in the gEAR')
    parser.add_argument('-uid',
                        '--user_id',
                        type=int,
                        required=False,
                        help='Filter by user ID')
    args = parser.parse_args()

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    qry = """
          SELECT d.id, d.title, d.dtype, g.email, d.pubmed_id, d.geo_id, d.is_public, d.date_added
            FROM dataset d
                 JOIN guser g on d.owner_id=g.id
           WHERE d.marked_for_removal = 0
    """
    qry_args = []

    if args.user_id:
        qry += " AND d.owner_id = %s"
        qry_args.append(args.user_id)

    qry += " ORDER BY date_added DESC"

    cursor.execute(qry, qry_args)

    for (dataset_id, title, dtype, email, pubmed_id, geo_id, is_public,
         date_added) in cursor:
        visibility = 'Public' if is_public else 'Private'
        print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
            dataset_id, title, dtype, email, pubmed_id, geo_id, visibility,
            date_added))

    cursor.close()
    cnx.close()
Example #10
def main():
    parser = argparse.ArgumentParser(
        description='See the script name?  It does that.')
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an h5ad file to be read.')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Output file to be written.')
    parser.add_argument('-org',
                        '--organism',
                        type=int,
                        required=True,
                        help='Organism ID.')
    parser.add_argument('-r',
                        '--read_only',
                        action='store_true',
                        help='If set, only print and do not write to output.')

    args = parser.parse_args()

    adata = sc.read(args.input_file)
    (_, n_genes) = adata.shape

    ensembl_releases = [84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94]

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    query = """
        SELECT ensembl_id, gene_symbol
          FROM gene
         WHERE organism_id = %s
           AND ensembl_release = %s
    """

    best_release = None
    best_count = 0
    best_df = None

    # There are some cases where there are duplicate gene symbols.
    # We do not want to keep these, so we drop them here in order to
    # preserve the obs shape when joining ensembl IDs and reassigning
    # var later on.
    duplicated_genes = adata.var.index.duplicated()
    print('############################################################')
    print(f"File: {args.input_file}")
    print(f"Duplicated Genes: {duplicated_genes.sum()}")

    adata = adata[:, ~duplicated_genes]
    var = adata.var

    for release in ensembl_releases:
        print("INFO: comparing with ensembl release: {0} ... ".format(release),
              end='')
        cursor.execute(query, (args.organism, release))

        df = pd.DataFrame(cursor.fetchall(), columns=cursor.column_names)

        # Query can return different ensembl ids for a gene symbol,
        # we want to drop duplicates so we have only one ensembl id
        # per gene symbol
        df = df.drop_duplicates('gene_symbol')
        df = df.set_index('gene_symbol')

        merged_df = var.join(df, how='inner')
        (row_count, _) = merged_df.shape

        print(" found {0} matches".format(row_count))

        if row_count > best_count:
            best_count = row_count
            best_release = release
            best_df = merged_df

    print(f"Best release: {best_release}")
    print(f"Matches for release: {best_count}")
    print(f"Original # Genes: {n_genes}")
    print(f"Genes lost: {n_genes - best_count}")
    # Now we have our best release and ensembl ids for those gene symbols,
    # we want to make sure we filter our anndata object to reflect only the
    # gene symbols we have ids for
    gene_filter = var.index.isin(best_df.index)
    adata = adata[:, gene_filter]

    # If the data already had a 'gene_symbol' column, rename it to preserve the original
    if 'gene_symbol' in best_df.columns:
        print(
            "WARN: Found gene_symbol column already in dataset, renaming to gene_symbol_original"
        )
        best_df = best_df.rename(
            columns={"gene_symbol": "gene_symbol_original"})

    ensembl_id_var = (best_df.reset_index()
                             .rename(columns={"index": "gene_symbol"})
                             .set_index('ensembl_id'))

    # Currently creating a new AnnData object because of
    # trouble getting adata.var = merged_var to persist
    adata_with_ensembl_ids = ad.AnnData(adata.X,
                                        obs=adata.obs,
                                        var=ensembl_id_var)
    print('VAR\n')
    print(adata_with_ensembl_ids.var.head())
    print("OBS\n")
    print(adata_with_ensembl_ids.obs.head())
    if not args.read_only:
        adata_with_ensembl_ids.write(args.output_file)
    print('############################################################')
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to be read.')
    parser.add_argument('-org',
                        '--organism_id',
                        type=int,
                        required=True,
                        help='Organism ID')
    args = parser.parse_args()

    qry_ids = list()
    id_ref = dict()

    line_num = 0
    with open(args.input_file) as fh:
        for line in fh:
            line = line.rstrip()
            line_num += 1

            # skip the header line
            if line_num > 1:
                cols = line.split("\t")
                qry_ids.append(cols[0])

    # connect to db
    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    key_type = get_key_type(args.input_file)

    # get organism to filter out gene table
    qry = "SELECT {0}, ensembl_release FROM gene WHERE organism_id = %s".format(
        key_type)
    cursor.execute(qry, (args.organism_id, ))

    for (id, release_num) in cursor:
        if release_num not in id_ref:
            id_ref[release_num] = dict()

        id_ref[release_num][id] = 1

    best_release = None
    best_count = 0
    best_missing_count = 0

    for release_num in sorted(id_ref):
        cov_count, empty_count = get_count(qry_ids, id_ref[release_num])
        print("release:{0}\tcount:{1}".format(release_num, cov_count))

        if cov_count > best_count:
            best_count = cov_count
            best_release = release_num
            best_missing_count = empty_count

    print(
        "The best release is {0} with {1} of {2} genes unaccounted for.  Of these, {3} were empty"
        .format(best_release,
                len(qry_ids) - best_count, len(qry_ids), best_missing_count))

    cursor.close()
    cnx.close()
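
The get_count() and get_key_type() helpers used above aren't part of this listing; minimal sketches consistent with how they're called here (the ENS-prefix heuristic and return shapes are assumptions) might be:

def get_count(qry_ids, ref_ids):
    # Hypothetical sketch: count query IDs covered by the reference release,
    # and separately count query IDs that were empty strings.
    cov_count = 0
    empty_count = 0

    for qid in qry_ids:
        if not qid:
            empty_count += 1
        elif qid in ref_ids:
            cov_count += 1

    return cov_count, empty_count

def get_key_type(input_file):
    # Hypothetical sketch: peek at the first data row to decide whether the
    # input is keyed by Ensembl ID or by gene symbol.
    with open(input_file) as fh:
        fh.readline()  # skip the header line
        first_id = fh.readline().split("\t")[0]

    return 'ensembl_id' if first_id.startswith('ENS') else 'gene_symbol'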
Example #12
def main():
    parser = argparse.ArgumentParser(
        description='See the script name?  It does that.')
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-org',
                        '--organism',
                        type=int,
                        required=True,
                        help='Organism ID to use')
    parser.add_argument('-er',
                        '--ensembl_release',
                        type=int,
                        required=True,
                        help='Ensembl release ID to use')
    args = parser.parse_args()

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    query = """
        SELECT ensembl_id, gene_symbol
          FROM gene
         WHERE organism_id = %s
           AND ensembl_release = %s
    """

    gs_idx = dict()
    duplicate_gs = 0
    gs_not_found = 0

    cursor.execute(query, (args.organism, args.ensembl_release))
    for row in cursor:
        ensembl_id = row[0]
        gs = row[1]

        if gs not in gs_idx:
            gs_idx[gs] = ensembl_id
        else:
            duplicate_gs += 1

    print(
        "INFO: There were {0} instances where a gene symbol was associated with more than one Ensembl ID"
        .format(duplicate_gs),
        file=sys.stderr)

    cursor.close()
    cnx.close()

    line_num = 0

    ofh = open(args.output_file, 'w')

    for line in open(args.input_file):
        line_num += 1
        line = line.rstrip()

        if line_num == 1:
            print("Ensembl_ID\t" + line, file=ofh)
            continue

        cols = line.split("\t")

        if cols[0] in gs_idx:
            cols.insert(0, gs_idx[cols[0]])
            print("\t".join(cols), file=ofh)
        else:
            print("Gene not found: ({0})".format(cols[0]))
            gs_not_found += 1

    ofh.close()

    print(
        "INFO: There were {0} instances of rows skipped in output because their gene symbol wasn't found in the index"
        .format(gs_not_found),
        file=sys.stderr)
Example #13
def main():
    parser = argparse.ArgumentParser( description='Annotation loader for the gEAR')
    parser.add_argument('-i', '--input_dir', type=str, required=True, help='Path to an input directory to be read' )
    parser.add_argument('-id', '--organism_id', type=int, required=True, help='Database row ID for the organism being updated')
    parser.add_argument('-r', '--release_number', type=int, required=True, help='The number portion of the Ensembl release ID')
    args = parser.parse_args()

    files = os.listdir(args.input_dir)

    # For use when debugging
    DO_DB_LOAD = True

    if len(files) == 0:
        raise Exception("ERROR: No input files matching the expected naming convention found")

    skipped_biotypes = ['misc_RNA', 'pseudogene', 'ribozyme']

    cnx = geardb.Connection()
    cursor = cnx.get_cursor()

    go_idx = get_go_index(cursor)

    gene_insert_qry = """
        INSERT INTO gene (ensembl_id, ensembl_version, ensembl_release, genbank_acc, 
                          organism_id, molecule, start, stop, gene_symbol, product, 
                          biotype) 
             VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    go_insert_qry = """
        INSERT INTO gene_go_link (gene_id, go_id)
             VALUES (%s, %s)
    """

    dbxref_insert_qry = """
        INSERT INTO gene_dbxref (gene_id, dbxref)
             VALUES (%s, %s)
    """

    for file in sorted(files):
        if file.endswith('.dat.gz'):
            genes_inserted = 0
            
            fpath = "{0}/{1}".format(args.input_dir, file)
            print("Processing file: {0} ... ".format(fpath), file=sys.stderr, end='')

            # Each of the file names across organisms so far has something like '.chromosome.X.' in
            #  the name.  Let's use this to get a molecule name
            m = re.search(r"\.(chromosome|nonchromosomal)\.(\w+)", file)
            if m is None:
                raise Exception("Error: failed to pull chromosome name from file ({0}).  Expected to contain '.chromosome.X'".format(file))

            if m.group(1) == "chromosome":
                chr_name = m.group(2)
            else:
                chr_name = "nonchromosomal"

            with gzip.open(fpath, 'rt') as fh:
                for gb_record in SeqIO.parse(fh, "genbank"):
                    annotations = process_gb_record(gb_record)

                    for ann in annotations:
                        if 'genbank_acc' not in ann.other_attributes:
                            ann.other_attributes['genbank_acc'] = None

                        # make sure the product name isn't too long
                        if ann.product_name is not None and len(ann.product_name) > 255:
                            ann.product_name = ann.product_name[:255]

                        # make sure the gene product isn't too long
                        if ann.gene_symbol is not None and len(ann.gene_symbol) > 20:
                            ann.gene_symbol = ann.gene_symbol[:20]

                        if DO_DB_LOAD:
                            cursor.execute(gene_insert_qry, (ann.other_attributes['ensembl_id'], ann.other_attributes['ensembl_version'],
                                               args.release_number, ann.other_attributes['genbank_acc'],
                                               args.organism_id, chr_name, ann.other_attributes['loc']['fmin'],
                                               ann.other_attributes['loc']['fmax'], ann.gene_symbol, ann.product_name,
                                               ann.other_attributes['biotype']))
                            gene_id = cursor.lastrowid
                        else:
                            # keep gene_id defined when DB loading is disabled for debugging
                            gene_id = None

                        genes_inserted += 1

                        # The Ensembl records have many, many verified duplications in the annotation.  Keep this list
                        #  so we only load one of each.
                        go_ids_inserted = list()
                        for go_annot in ann.go_annotations:
                            if go_annot.go_id not in go_ids_inserted:
                                if DO_DB_LOAD:
                                    cursor.execute(go_insert_qry, (gene_id, "GO:{0}".format(go_annot.go_id)))
                                go_ids_inserted.append(go_annot.go_id)

                        dbxrefs_inserted = list()
                        for dbxref in ann.dbxrefs:
                            label = "{0}:{1}".format(dbxref.db, dbxref.identifier)
                            if label not in dbxrefs_inserted:
                                if DO_DB_LOAD:
                                    cursor.execute(dbxref_insert_qry, (gene_id, label))
                                dbxrefs_inserted.append(label)

            print("{0} genes added".format(genes_inserted), file=sys.stderr)
        cnx.commit()

    cursor.close()
    cnx.close()
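
The get_go_index() and process_gb_record() helpers aren't part of this listing. A minimal sketch of the first, consistent with its name (the go table name is an assumption), might be:

def get_go_index(cursor):
    # Hypothetical sketch: index the GO terms already present in the database
    # so later code can validate annotations without repeated queries.
    cursor.execute("SELECT go_id FROM go")
    return {row[0]: True for row in cursor}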
Example #14
    def save_to_mysql(self, status=None):
        """
        Saves metadata to gEAR MySQL table 'dataset'. If present, also saves tags.

        Notes:
            is_public = 0
                All datasets will save as private. Once the upload is complete,
                the user can change the dataset to public on the dataset manager.
            load_status = 'pending'
                All datasets will save as 'pending'.
        """
        if self.metadata is None:
            raise Exception(
                "No values to evaluate. Please load a metadata file first.")

        if status is None:
            status = 'pending'

        df = self.metadata

        cnx = geardb.Connection()
        cursor = cnx.get_cursor()

        dataset_uid = get_value_from_df(df, 'dataset_uid')
        owner_id = str(get_value_from_df(df, 'owner_id'))
        dataset_title = get_value_from_df(df, 'title')

        # Get organism gEAR ID using taxon_id
        organism_id = None
        organism_taxid = get_value_from_df(df, 'sample_taxid')
        organism_qry = ("SELECT id FROM organism WHERE taxon_id = %s")
        cursor.execute(organism_qry, (str(organism_taxid), ))
        for (id, ) in cursor:
            organism_id = id

        geo_id = str(get_value_from_df(df, 'geo_accession')).strip()

        # Make all datasets private if not specified in the metadata
        try:
            is_public = get_value_from_df(df, 'is_public')
        except Exception:
            is_public = 0

        ldesc = get_value_from_df(df, 'summary')
        dtype = get_value_from_df(df, 'dataset_type')
        schematic_image = get_value_from_df(df, 'schematic_image')
        share_uid = get_value_from_df(df, 'share_uid')
        default_data_format = 'raw'
        has_h5ad = 1

        pubmed_id = str(get_value_from_df(df, 'pubmed_id')).strip()

        # Users entering multiple pubmed IDs will cause failure.  Take the first
        # one instead and append the rest to the long description.
        pubmed_id = pubmed_id.replace(' ', ',')
        pubmed_ids = [i for i in pubmed_id.split(',') if len(i) > 3]
        pubmed_id = pubmed_ids.pop(0) if pubmed_ids else None

        if len(pubmed_ids):
            ldesc += "<br>Additional Pubmed IDS: {0}".format(
                ', '.join(pubmed_ids))

        platform_id = get_value_from_df(df, 'platform_id')
        instrument_model = get_value_from_df(df, 'instrument_model')
        library_selection = get_value_from_df(df, 'library_selection')
        library_source = get_value_from_df(df, 'library_source')
        library_strategy = get_value_from_df(df, 'library_strategy')
        contact_email = get_value_from_df(df, 'contact_email')
        contact_institute = get_value_from_df(df, 'contact_institute')
        contact_name = get_value_from_df(df, 'contact_name')
        annotation_source = get_value_from_df(df, 'annotation_source')
        annotation_release = get_value_from_df(df, 'annotation_release_number')
        default_plot_type = get_value_from_df(df, 'default_plot_type')

        add_dataset_sql = """
        INSERT INTO dataset (id, owner_id, title, organism_id, pubmed_id, geo_id, is_public, ldesc, date_added, dtype, schematic_image, share_id, math_default, load_status, has_h5ad, platform_id, instrument_model, library_selection, library_source, library_strategy, contact_email, contact_institute, contact_name, annotation_source, annotation_release, plot_default)
        VALUES              (%s, %s,       %s,    %s,          %s,        %s,     %s,        %s,    NOW(),      %s,    %s,              %s,       %s,           %s,          %s,       %s,          %s,               %s,                %s,             %s,               %s,            %s,                %s,           %s,                %s,                 %s)
        """

        # Insert dataset info to database
        try:
            cursor.execute(add_dataset_sql, (
                dataset_uid,
                owner_id,
                dataset_title,
                organism_id,
                pubmed_id,
                geo_id,
                is_public,
                ldesc,
                dtype,
                schematic_image,
                share_uid,
                default_data_format,
                status,
                has_h5ad,
                platform_id,
                instrument_model,
                library_selection,
                library_source,
                library_strategy,
                contact_email,
                contact_institute,
                contact_name,
                annotation_source,
                annotation_release,
                default_plot_type,
            ))
            cnx.commit()
        except mysql.connector.Error as err:
            raise Exception("Failed to insert metadata: {0}".format(err))

        #Handle any tags user might have included
        tags = get_value_from_df(df, 'tags')

        if is_na(tags) is False and tags is not None:
            tag_list = []

            if isinstance(tags, str):
                # handle comma-separated tags with or without spaces
                raw_tags = [t.strip() for t in tags.split(',')]
            elif isinstance(tags, list):
                raw_tags = tags
            else:
                raw_tags = []

            # Ensures duplicates are removed
            for tag in raw_tags:
                if tag not in tag_list:
                    tag_list.append(tag)

            #Get list of tags already in gEAR
            qry_get_tags = """
                SELECT label, id
                FROM tag
            """
            cached_tags = {}
            cursor.execute(qry_get_tags)
            for row in cursor:
                cached_tags[row[0].lower()] = row[1]

            add_tag_sql = """
                INSERT INTO tag (label)
                VALUES (%s)
            """
            add_datasettag_sql = """
                INSERT INTO dataset_tag (tag_id, dataset_id)
                VALUES (%s, %s)
            """
            for tag in tag_list:
                #Check if new tag is already in database
                if tag.lower() not in cached_tags:
                    #New tag. Add it to database and keep its id
                    cursor.execute(add_tag_sql, (tag, ))
                    cnx.commit()
                    tag_id = cursor.lastrowid
                else:
                    #Tag exists. Get its id
                    tag_id = cached_tags[tag.lower()]

                cursor.execute(add_datasettag_sql, (
                    tag_id,
                    dataset_uid,
                ))
                cnx.commit()

        cursor.close()
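
The get_value_from_df() and is_na() helpers aren't shown here; minimal sketches consistent with their usage above (the single-row metadata layout is an assumption) might be:

import pandas as pd

def get_value_from_df(df, field):
    # Hypothetical sketch: assumes the metadata sheet was read into a
    # DataFrame with one column per field and a single row of values.
    return df[field].iloc[0]

def is_na(value):
    # Hypothetical sketch: lists are treated as never-missing; scalars are
    # deferred to pandas (None, NaN, NaT).
    if isinstance(value, list):
        return False
    return bool(pd.isna(value))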