class PMID(Base): """This class holds a lookup table of rsids.""" __tablename__ = 'pmids' pmid = _Column(_String, primary_key=True) title = _Column(_String, nullable=True) pubdate = _Column(_Date, nullable=True) def __init__(self, pmid, add_study_data=True): """Get info from ID and set.""" self.pmid = pmid if add_study_data: try: title, pubdate = get_pubmed_info(pmid) self.title = title self.pubdate = pubdate except: pass def __repr__(self): """Display summary stats.""" return "PMID<pmid={pmid}, title={title}, pubdate={pubdate}>".format( pmid=self.pmid, title=self.title, pubdate=self.pubdate) def __len__(self): """Number of SNPs.""" return len(self.snps)
class Phenotype(Base): """An SQLAlchemy table to store the primary phenotype. Table Name: phenos Columns: phenotype: The string phenotype from the GRASP DB, unique. alias: A short representation of the phenotype, not unique. studies: A link to the studies table. Attributes: int: The ID number. str: The name of the phenotype. """ __tablename__ = "phenos" id = _Column(_Integer, primary_key=True, index=True) phenotype = _Column(_String, index=True, unique=True) alias = _Column(_String, index=True) studies = _relationship("Study", back_populates="phenotype") def __repr__(self): """Display information.""" return '{} <"{}" (alias: {})>'.format(self.id, self.phenotype, self.alias) def __int__(self): """Return ID number.""" return self.id def __str__(self): """Display phenotype name.""" return "{}".format(self.phenotype)
class Trait(Base): """This class holds a lookup table of traits.""" __tablename__ = 'traits' id = _Column(_Integer, primary_key=True) trait = _Column(_String, index=True, nullable=False) def __repr__(self): """Display summary stats.""" return "Trait<id={id}, trait={trait}>".format(id=self.id, trait=self.trait) def __len__(self): """Number of SNPs.""" return len(self.snps)
class PhenoCats(Base): """An SQLAlchemy table to store the lists of phenotype categories. Table Name: pheno_cats Columns: category: The category from the grasp database, unique. alias: An easy to use alias of the category, not unique. snps: A link to all SNPs in this category. studies: A link to all studies in this category. Attributes: int: The PhenoCat ID str: The category name """ __tablename__ = "pheno_cats" id = _Column(_Integer, primary_key=True, index=True) category = _Column(_String, index=True, unique=True) alias = _Column(_String, index=True) snps = _relationship("SNP", secondary=snp_pheno_assoc, back_populates="phenotype_cats") studies = _relationship("Study", secondary=study_pheno_assoc, back_populates="phenotype_cats") def __repr__(self): """Display information.""" return '{} <"{}" (alias: {})>'.format(self.id, self.category, self.alias) def __int__(self): """Return ID number.""" return self.id def __str__(self): """Display phenotype name.""" return "{}".format(self.category)
class Platform(Base): """An SQLAlchemy table to store the platform information. Table Name: platforms Columns: platform: The name of the platform from GRASP. studies: A link to all studies using this platform. Attributes: int: The ID number of this platform str: The name of the platform """ __tablename__ = "platforms" id = _Column(_Integer, primary_key=True, index=True) platform = _Column(_String, index=True, unique=True) studies = _relationship("Study", secondary=study_plat_assoc, back_populates="platforms") def __init__(self, platform): """Create self.""" self.platform = platform def __repr__(self): """Display information.""" return "{} <{}>".format(self.id, self.platform) def __int__(self): """Return ID number.""" return self.id def __str__(self): """Display platform name.""" return "{}".format(self.platform)
class Population(Base): """An SQLAlchemy table to store the platform information. Table Name: populations Columns: population: The name of the population. studies: A link to all studies in this population. snps: A link to all SNPs in this populations. Attributes: int: Population ID number str: The name of the population """ __tablename__ = "populations" id = _Column(_Integer, primary_key=True, index=True) population = _Column(_String, index=True, unique=True) def __init__(self, population): """Create self.""" self.population = population def __repr__(self): """Display information.""" return "{} <{}>".format(self.id, self.population) def __int__(self): """Return ID number.""" return self.id def __str__(self): """Display platform name.""" return "{}".format(self.population)
class SNP(Base): """An SQLAlchemy Talble for GRASP SNPs. Study and phenotype information are pushed to other tables to minimize table size and make querying easier. Table Name: snps Columns: Described in the columns attribute Attributes: int: The ID number of the SNP, usually the NHLBIkey str: SNP loction expressed as 'chr:pos' hvgs_ids: A list of HGVS IDs for this SNP columns: A dictionary of all columns 'column_name'=>('type', 'desc') """ __tablename__ = "snps" id = _Column(_BigInteger, primary_key=True, index=True) snpid = _Column(_String, index=True) chrom = _Column(_String(10), index=True) pos = _Column(_Integer, index=True) pval = _Column(_Float, index=True) NHLBIkey = _Column(_String, index=True) HUPfield = _Column(_String) LastCurationDate = _Column(_Date) CreationDate = _Column(_Date) population_id = _Column(_Integer, _ForeignKey('populations.id'), index=True) population = _relationship("Population", backref="snps") study_id = _Column(_Integer, _ForeignKey('studies.id'), index=True) study = _relationship("Study", back_populates="snps") study_snpid = _Column(_String, index=True) paper_loc = _Column(_String) phenotype_desc = _Column(_String, index=True) phenotype_cats = _relationship("PhenoCats", secondary=snp_pheno_assoc, back_populates="snps") InGene = _Column(_String, index=True) NearestGene = _Column(_String, index=True) InLincRNA = _Column(_String) InMiRNA = _Column(_String) InMiRNABS = _Column(_String) dbSNPfxn = _Column(_String) dbSNPMAF = _Column(_String) dbSNPinfo = _Column(_String) dbSNPvalidation = _Column(_String) dbSNPClinStatus = _Column(_String) ORegAnno = _Column(_String) ConservPredTFBS = _Column(_String) HumanEnhancer = _Column(_String) RNAedit = _Column(_String) PolyPhen2 = _Column(_String) SIFT = _Column(_String) LSSNP = _Column(_String) UniProt = _Column(_String) EqtlMethMetabStudy = _Column(_String) _Index('chrom_pos', 'chrom', 'pos') columns = _od([ ('id', ('BigInteger', 'NHLBIkey')), ('snpid', ('String', 'SNPid')), ('chrom', ('String', 'chr')), ('pos', ('Integer', 'pos')), ('pval', ('Float', 'Pvalue')), ('NHLBIkey', ('String', 'NHLBIkey')), ('HUPfield', ('String', 'HUPfield')), ('LastCurationDate', ('Date', 'LastCurationDate')), ('CreationDate', ('Date', 'CreationDate')), ('population_id', ('Integer', 'Primary')), ('population', ('relationship', 'Link')), ('study_id', ('Integer', 'Primary')), ('study', ('relationship', 'Link')), ('study_snpid', ('String', 'SNPid')), ('paper_loc', ('String', 'LocationWithinPaper')), ('phenotype_desc', ('String', 'Phenotype')), ('phenotype_cats', ('relationship', 'Link')), ('InGene', ('String', 'InGene')), ('NearestGene', ('String', 'NearestGene')), ('InLincRNA', ('String', 'InLincRNA')), ('InMiRNA', ('String', 'InMiRNA')), ('InMiRNABS', ('String', 'InMiRNABS')), ('dbSNPfxn', ('String', 'dbSNPfxn')), ('dbSNPMAF', ('String', 'dbSNPMAF')), ('dbSNPinfo', ('String', 'dbSNPalleles')), ('dbSNPvalidation', ('String', 'dbSNPvalidation')), ('dbSNPClinStatus', ('String', 'dbSNPClinStatus')), ('ORegAnno', ('String', 'ORegAnno')), ('ConservPredTFBS', ('String', 'ConservPredTFBS')), ('HumanEnhancer', ('String', 'HumanEnhancer')), ('RNAedit', ('String', 'RNAedit')), ('PolyPhen2', ('String', 'PolyPhen2')), ('SIFT', ('String', 'SIFT')), ('LSSNP', ('String', 'LS')), ('UniProt', ('String', 'UniProt')), ('EqtlMethMetabStudy', ('String', 'EqtlMethMetabStudy')), ]) """A description of all columns in this table.""" @property def snp_loc(self): """Return a simple string containing the SNP location.""" return "chr{}:{}".format(self.chrom, self.pos) @property def hvgs_ids(self): """The HVGS ID from myvariant.""" if not hasattr(self, '_hvgs_ids'): mv = myvariant.MyVariantInfo() self._hvgs_ids = [ i['_id'] for i in mv.query(self.snp_loc, fields='id')['hits'] ] return self._hvgs_ids def get_variant_info(self, fields="dbsnp", pandas=True): """Use the myvariant API to get info about this SNP. Note that this service can be very slow. It will be faster to query multiple SNPs. Args: fields: Choose fields to display from: `docs.myvariant.info/en/latest/doc/data.html#available-fields`_ Good choices are 'dbsnp', 'clinvar', or 'gwassnps' Can also use 'grasp' to get a different version of this info. pandas: Return a dataframe instead of dictionary. Returns: A dictionary or a dataframe. """ mv = myvariant.MyVariantInfo() return mv.getvariants(self.hvgs_ids, fields=fields, as_dataframe=pandas, df_index=True) def get_columns(self, return_as='list'): """Return all columns in the table nicely formatted. Display choices: list: A python list of column names dictionary: A python dictionary of name=>desc long_dict: A python dictionary of name=>(type, desc) Args: return_as: {table,tab,list,dictionary,long_dict,id_dict} Returns: A list or dictionary """ cols = self.columns if return_as == 'list': return [i[1] for i in cols.values()] elif return_as == 'dictionary': return {k: v[1] for k, v in cols.items()} elif return_as == 'long_dict': return cols else: raise Exception("'display_as' must be one of {table,tab,list}") def display_columns(self, display_as='table', write=False): """Return all columns in the table nicely formatted. Display choices: table: A formatted grid-like table tab: A tab delimited non-formatted version of table list: A string list of column names Args: display_as: {table,tab,list} write: If true, print output to console, otherwise return string. Returns: A formatted string or None """ cols = self.columns if display_as == 'table': out = _tb( [['Column', 'Description', 'Type']] +\ [[k, v[1], v[0]] for k, v in cols.items()], headers='firstrow', tablefmt='grid' ) elif display_as == 'tab': out = '\n'.join( ['\t'.join(['Column', 'Description', 'Type'])] +\ ['\t'.join([k, v[1], v[0]]) for k, v in cols.items()], ) elif display_as == 'list': out = '\n'.join([i[1] for i in cols.values()]) else: raise Exception("'display_as' must be one of {table,tab,list}") if write: print(out) else: return out def __repr__(self): """Display information about the table.""" return "{} ({}) <{}:{} pheno: {} study: {}".format( self.id, self.snpid, self.chrom, self.pos, self.phenotype_desc, self.study.title) def __int__(self): """Return ID number.""" return self.id def __str__(self): """Return coordinates.""" return self.snp_loc
class Study(Base): """An SQLAlchemy table to store study information. This table provides easy ways to query for SNPs by study information, including population and phenotype. Note: `disc_pop_flag` and `rep_pop_flag` are integer representations of a bitwise flag describing population, defined in ref.PopFlag. To see the string representation of this property, lookup `disc_pops` or `rep_pops`. Table Name: studies Columns: Described in the columns attribute. Attributes: int: The integer ID number, usually the PMID, unless not indexed. str: Summary data on this study. len: The number of individuals in this study. disc_pops: A string displaying the number of discovery poplations. rep_pops: A string displaying the number of replication poplations. columns: A dictionary of all columns 'column_name'=>('type', 'desc') population_information: A multi-line string describing the populations in this study. """ __tablename__ = "studies" id = _Column(_Integer, primary_key=True, index=True) pmid = _Column(_String(100), index=True) title = _Column(_String, index=True) journal = _Column(_String) author = _Column(_String) grasp_ver = _Column(_Integer, index=True) noresults = _Column(_Boolean) results = _Column(_Integer) qtl = _Column(_Boolean) snps = _relationship("SNP", back_populates='study') phenotype_id = _Column(_Integer, _ForeignKey('phenos.id'), index=True) phenotype = _relationship("Phenotype", back_populates="studies") phenotype_cats = _relationship("PhenoCats", secondary=study_pheno_assoc, back_populates="studies") datepub = _Column(_Date) in_nhgri = _Column(_Boolean) locations = _Column(_String) mf = _Column(_Boolean) mf_only = _Column(_Boolean) platforms = _relationship("Platform", secondary=study_plat_assoc, back_populates="studies") snp_count = _Column(_String) imputed = _Column(_Boolean) population_id = _Column(_Integer, _ForeignKey('populations.id'), index=True) population = _relationship("Population", backref="studies") total = _Column(_Integer) total_disc = _Column(_Integer) pop_flag = _Column(_Integer, index=True) # Will hold a bitwise flag disc_pop_flag = _Column(_Integer, index=True) # Will hold a bitwise flag european = _Column(_Integer) african = _Column(_Integer) east_asian = _Column(_Integer) south_asian = _Column(_Integer) hispanic = _Column(_Integer) native = _Column(_Integer) micronesian = _Column(_Integer) arab = _Column(_Integer) mixed = _Column(_Integer) unspecified = _Column(_Integer) filipino = _Column(_Integer) indonesian = _Column(_Integer) total_rep = _Column(_Integer) rep_pop_flag = _Column(_Integer, index=True) # Will hold a bitwise flag rep_european = _Column(_Integer) rep_african = _Column(_Integer) rep_east_asian = _Column(_Integer) rep_south_asian = _Column(_Integer) rep_hispanic = _Column(_Integer) rep_native = _Column(_Integer) rep_micronesian = _Column(_Integer) rep_arab = _Column(_Integer) rep_mixed = _Column(_Integer) rep_unspecified = _Column(_Integer) rep_filipino = _Column(_Integer) rep_indonesian = _Column(_Integer) sample_size = _Column(_String) # Maybe parse this better replication_size = _Column(_String) # Maybe parse this better columns = _od([ ('id', ('Integer', 'id')), ('pmid', ('String', 'PubmedID')), ('title', ('String', 'Study')), ('journal', ('String', 'Journal')), ('author', ('String', '1st_author')), ('grasp_ver', ('Integer', 'GRASPversion?')), ('noresults', ('Boolean', 'No results flag')), ('results', ('Integer', '#results')), ('qtl', ('Boolean', 'IsEqtl/meQTL/pQTL/gQTL/Metabolmics?')), ('snps', ('relationship', 'Link to all SNPs in this study')), ('phenotype_id', ('Integer', 'ID of primary phenotype in Phenotype table')), ('phenotype', ('relationship', 'A link to the primary phenotype in the Phenotype table')), ('phenotype_cats', ('relationship', 'A link to all phenotype categories assigned in the PhenoCats table') ), ('datepub', ('Date', 'DatePub')), ('in_nhgri', ('Boolean', 'In NHGRI GWAS catalog (8/26/14)?')), ('locations', ('String', 'Specific place(s) mentioned for samples')), ('mf', ('Boolean', 'Includes male/female only analyses in discovery and/or replication?' )), ('mf_only', ('Boolean', 'Exclusively male or female study?')), ('platforms', ('relationship', 'Link to platforms in the Platform table. Platform [SNPs passing QC]' )), ('snp_count', ('String', 'From "Platform [SNPs passing QC]"')), ('imputed', ('Boolean', 'From "Platform [SNPs passing QC]"')), ('population_id', ('Integer', 'Primary key of population table')), ('population', ('relationship', 'GWAS description, link to table')), ('total', ('Integer', 'Total Discovery + Replication sample size')), ('total_disc', ('Integer', 'Total discovery samples')), ('pop_flag', ('Integer', 'A bitwise flag that shows presence/absence of all populations (discovery and replication)' )), ('disc_pop_flag', ('Integer', 'A bitwise flag that shows presence/absence of discovery populations' )), ('european', ('Integer', 'European')), ('african', ('Integer', 'African ancestry')), ('east_asian', ('Integer', 'East Asian')), ('south_asian', ('Integer', 'Indian/South Asian')), ('hispanic', ('Integer', 'Hispanic')), ('native', ('Integer', 'Native')), ('micronesian', ('Integer', 'Micronesian')), ('arab', ('Integer', 'Arab/ME')), ('mixed', ('Integer', 'Mixed')), ('unpecified', ('Integer', 'Unspec')), ('filipino', ('Integer', 'Filipino')), ('indonesian', ('Integer', 'Indonesian')), ('total_rep', ('Integer', 'Total replication samples')), ('rep_pop_flag', ('Integer', 'A bitwise flag that shows presence/absence of replication populations' )), ('rep_european', ('Integer', 'European.1')), ('rep_african', ('Integer', 'African ancestry.1')), ('rep_east_asian', ('Integer', 'East Asian.1')), ('rep_south_asian', ('Integer', 'Indian/South Asian.1')), ('rep_hispanic', ('Integer', 'Hispanic.1')), ('rep_native', ('Integer', 'Native.1')), ('rep_micronesian', ('Integer', 'Micronesian.1')), ('rep_arab', ('Integer', 'Arab/ME.1')), ('rep_mixed', ('Integer', 'Mixed.1')), ('rep_unpecified', ('Integer', 'Unspec.1')), ('rep_filipino', ('Integer', 'Filipino.1')), ('rep_indonesian', ('Integer', 'Indonesian.1')), ('sample_size', ('String', 'Initial Sample Size, string description of integer population counts above.' )), ('replication_size', ('String', 'Replication Sample Size, string description of integer population counts above.' )), ]) """A description of all columns in this table.""" @property def disc_pops(self): """Convert disc_pop_flag to PopFlag.""" return _PopFlag(self.disc_pop_flag) @property def rep_pops(self): """Convert rep_pop_flag to PopFlag.""" return _PopFlag(self.rep_pop_flag) @property def pops(self): """Convert rep_pop_flag to PopFlag.""" return _PopFlag(self.pop_flag) @property def population_information(self): """Display a summary of population data.""" outstr = [ "Primary population: {}\n".format(self.population.population), "Individuals: {}\n".format(self.total), "Discovery populations: {}; Total: {}\n".format( self.disc_pops.to_simple_str(), self.total_disc) ] for pop in [ 'european', 'african', 'east_asian', 'south_asian', 'hispanic', 'native', 'micronesian', 'arab', 'unspecified', 'filipino', 'indonesian' ]: outstr.append('\t{}: {}\n'.format(pop, eval('self.' + pop))) outstr.append("Replication populations: {}; Total: {}\n".format( self.rep_pops.to_simple_str(), self.total_rep)) for pop in [ 'european', 'african', 'east_asian', 'south_asian', 'hispanic', 'native', 'micronesian', 'arab', 'unspecified', 'filipino', 'indonesian' ]: outstr.append('\t{}: {}\n'.format(pop, eval('self.rep_' + pop))) def get_columns(self, return_as='list'): """Return all columns in the table nicely formatted. Display choices: list: A python list of column names dictionary: A python dictionary of name=>desc long_dict: A python dictionary of name=>(type, desc) Args: return_as: {table,tab,list,dictionary,long_dict,id_dict} Returns: A list or dictionary """ cols = self.columns if return_as == 'list': return [i[1] for i in cols.values()] elif return_as == 'dictionary': return {k: v[1] for k, v in cols.items()} elif return_as == 'long_dict': return cols else: raise Exception("'display_as' must be one of {table,tab,list}") def display_columns(self, display_as='table', write=False): """Return all columns in the table nicely formatted. Display choices: table: A formatted grid-like table tab: A tab delimited non-formatted version of table list: A string list of column names Args: display_as: {table,tab,list} write: If true, print output to console, otherwise return string. Returns: A formatted string or None """ cols = self.columns if display_as == 'table': out = _tb( [['Column', 'Description', 'Type']] +\ [[k, v[1], v[0]] for k, v in cols.items()], headers='firstrow', tablefmt='grid' ) elif display_as == 'tab': out = '\n'.join( ['\t'.join(['Column', 'Description', 'Type'])] +\ ['\t'.join([k, v[1], v[0]]) for k, v in cols.items()], ) elif display_as == 'list': out = '\n'.join([i[1] for i in cols.values()]) else: raise Exception("'display_as' must be one of {table,tab,list}") if write: print(out) else: return out def __repr__(self): """Display informaertn about this study.""" return '{} <{}:{} "{}" ({}; Pop: {}; Disc Pops: {}; Rep Pops: {})>'.\ format(self.id, self.author, self.journal, self.title, self.phenotype.phenotype, self.population.population, self.disc_pops.to_simple_str(), self.rep_pops.to_simple_str()) def __str__(self): """Display refertnce.""" return "{}: {} ({})\nSNPS: {}\nInds: {}\n".format( self.journal, self.title, self.author, self.snp_count, self.total, ) + "Disc Pops: {}; Rep Pops: {}; EUR: {}; AFR: {}".format( self.disc_pops.to_simple_str, self.rep_pops.to_simple_str, self.european, self.african) def __int__(self): """Return ID number.""" return self.id def __len__(self): """Return total individual count.""" return int(self.total)
# Flags from .ref import PopFlag as _PopFlag __all__ = ["SNP", "Phenotype", "PhenoCats", "Platform", "Population"] Base = _declarative_base() """The SQLAlchemy Base for the database.""" ############################################################################### # Association Tables # ############################################################################### snp_pheno_assoc = _Table( 'snp_pheno_association', Base.metadata, _Column('snp_id', _BigInteger, _ForeignKey('snps.id')), _Column('pheno_id', _Integer, _ForeignKey('pheno_cats.id'))) study_pheno_assoc = _Table( 'study_pheno_association', Base.metadata, _Column('study_id', _Integer, _ForeignKey('studies.id')), _Column('pheno_id', _Integer, _ForeignKey('pheno_cats.id'))) study_plat_assoc = _Table( 'study_platform_association', Base.metadata, _Column('study_id', _Integer, _ForeignKey('studies.id')), _Column('platform_id', _Integer, _ForeignKey('platforms.id'))) ############################################################################### # Database Tables # ###############################################################################
def __new__(cls, t, *args, **kwargs): if 'sqlalchemy' not in t.__module__: t = Enum(t) return _Column(t, *args, **kwargs)
class RiskAllele(Base): """This class holds the SQLAlchemy definition of the risk allele table. Always Populated Columns: id (int): A unique ID pmid (int): The pubmed ID of the study trait (str): A string describing the trait of interest rsID (str): The rsid of the SNP population (str): The population the data is for risk_allele (str): The risk allele Optional Columns: chrom (str): Chromosome string (hg19) position (int): Position on chromosome (hg19) A1 (str): The A1 allele A2 (str): The A2 allele OR (float): The odds ratio for A1 B (float): The beta (effect size) for A1 OR_B (float): Either the OR or B P (float): p-value of the SNP-trait pair N (int): Number of individuals supporting the SNP N_cases (int): Number of of individuals in the case group N_controls (int): Number of of individuals in the control group source (str): Which database this record is from grasp_id (int): The ID of this SNP-trait pair in the GRASP database population_info (str): Extra population information """ __tablename__ = 'risk_alleles' id = _Column(_Integer, primary_key=True) pmid = _Column(_String, _ForeignKey('pmids.pmid'), nullable=False) paper = _relationship('PMID', backref='snps') trait_id = _Column(_Integer, _ForeignKey('traits.id'), nullable=False) trait = _relationship('Trait', backref='snps') rsID = _Column(_String, index=True, nullable=False) population = _Column(_String, index=True, nullable=False) risk_allele = _Column(_String(1), nullable=False) chrom = _Column(_String, index=True) position = _Column(_Integer, index=True) A1 = _Column(_String) A2 = _Column(_String) OR = _Column(_Float, index=True) B = _Column(_Float) OR_B = _Column(_Float) P = _Column(_Float(precision=128), index=True) cases_MAF = _Column(_Float) controls_MAF = _Column(_Float) N = _Column(_Integer) N_cases = _Column(_Integer) N_controls = _Column(_Integer) source = _Column(_String) grasp_id = _Column(_Integer) population_info = _Column(_String) pmid_rsid = _Index('pmid_rsid', pmid, rsID) loc = _Index('loc', chrom, position) # Make pmid, trait, rsID, population, risk_allele unique _Unique(pmid, trait_id, rsID, population, risk_allele) def __repr__(self): """Better display of data.""" out = ( "RiskAllele<id={id}, pmid={pmid}, rsID={rsid}, trait={trait}, " "risk_allele={risk_allele}>") return out.format(id=self.id, pmid=self.pmid, rsid=self.rsID, trait=self.trait.trait, risk_allele=self.risk_allele)