def cufflinks_gene_diff_import(experiment_id, filename):
    '''Import a cufflinks gene_exp.diff file into
    :class:`~data.models.GeneExperimentData` objects.

    The file is read as tab-delimited text with a header row.  One
    ``GeneExperimentData`` row is saved per line.  The ``gene`` column is
    looked up as the primary key of ``Gene``; when no such gene exists a
    new ``Gene`` with that name is created and saved first.

    :param experiment_id: primary key of an existing ``mRNASeqExperiment``.
    :param filename: path of the gene_exp.diff file to read.
    :returns: a summary string giving the number of measurements added and
        the number of genes that had to be created.
    :raises mRNASeqExperiment.DoesNotExist: if ``experiment_id`` is unknown.
    :raises KeyError: if an expected column is missing from the file.
    '''
    experiment = mRNASeqExperiment.objects.get(pk=experiment_id)
    new_genes = 0
    updated_genes = 0
    with open(filename, 'r') as inputfile:
        for row in csv.DictReader(inputfile, delimiter='\t'):
            # Only the lookup can legitimately raise Gene.DoesNotExist, so
            # it is the only statement guarded by the try block.
            try:
                gene = Gene.objects.get(pk=row['gene'])
            except Gene.DoesNotExist:
                # create a new gene with that name
                gene = Gene(name=row['gene'])
                gene.save()
                is_new_gene = True
            else:
                is_new_gene = False

            datum = GeneExperimentData(
                experiment=experiment,
                gene=gene,
                fold_change=row['log2(fold_change)'],
                p_value=row['p_value'],
                q_value=row['q_value'],
                locus=row['locus'],
                internal_id=row['test_id'],
                sample_1=row['sample_1'],
                sample_2=row['sample_2'],
                amount_1=row['value_1'],
                amount_2=row['value_2'],
                status=row['status'],
                test_statistic=row['test_stat'],
                significant=row['significant'])
            datum.save()

            # Count only after the measurement has actually been stored.
            if is_new_gene:
                new_genes += 1
            else:
                updated_genes += 1
    return "Added %i measurements and created %i new genes." % (
        updated_genes + new_genes, new_genes)
def cufflinks_gene_diff_import(experiment_id, filename):
    '''Import the data from a gene_exp.diff file into
    :class:`~data.models.GeneExperimentData` objects.

    Every line of the tab-delimited file (header row required) becomes one
    saved ``GeneExperimentData``.  The value in the ``gene`` column is used
    as the ``Gene`` primary key; a gene that cannot be found is created
    with that name before the measurement is stored.

    :param experiment_id: primary key of an existing ``mRNASeqExperiment``.
    :param filename: path of the gene_exp.diff file to read.
    :returns: summary string with the total number of measurements added
        and the number of newly created genes.
    :raises mRNASeqExperiment.DoesNotExist: if ``experiment_id`` is unknown.
    :raises KeyError: if an expected column is missing from the file.
    '''
    experiment = mRNASeqExperiment.objects.get(pk=experiment_id)
    new_genes = 0
    updated_genes = 0
    with open(filename, 'r') as inputfile:
        for row in csv.DictReader(inputfile, delimiter='\t'):
            # Resolve the gene first; the try block covers only the lookup
            # so unrelated errors are never mistaken for a missing gene.
            created = False
            try:
                gene = Gene.objects.get(pk=row['gene'])
            except Gene.DoesNotExist:
                # create a new gene with that name
                gene = Gene(name=row['gene'])
                gene.save()
                created = True

            # The measurement is built identically for existing and new
            # genes, so it is constructed in exactly one place.
            GeneExperimentData(
                experiment=experiment,
                gene=gene,
                fold_change=row['log2(fold_change)'],
                p_value=row['p_value'],
                q_value=row['q_value'],
                locus=row['locus'],
                internal_id=row['test_id'],
                sample_1=row['sample_1'],
                sample_2=row['sample_2'],
                amount_1=row['value_1'],
                amount_2=row['value_2'],
                status=row['status'],
                test_statistic=row['test_stat'],
                significant=row['significant']).save()

            if created:
                new_genes += 1
            else:
                updated_genes += 1
    return "Added %i measurements and created %i new genes." % (
        updated_genes + new_genes, new_genes)
def test_created_new_gene(self):
    '''Check that a :class:`~genes.models.Gene` can be built and saved.

    After saving, the primary key is expected to equal the gene name.
    '''
    gene_fields = dict(
        name="Pikfyve",
        ensemblID="ENSMUSG00000025949",
        chromosome=1,
        start=65186750,
        end=65274012,
        strand=1,
        band="C2",
        transcript_count=1,
        type="protein_coding",
        status="KNOWN",
    )
    test_gene = Gene(**gene_fields)
    test_gene.save()
    # presumes no genes loaded in fixture data
    self.assertEqual(test_gene.pk, "Pikfyve")
def test_created_new_gene(self):
    """Verify that a :class:`~genes.models.Gene` can be created and saved.

    The saved gene's primary key is compared against its name.
    """
    attributes = {
        "name": "Pikfyve",
        "ensemblID": "ENSMUSG00000025949",
        "chromosome": 1,
        "start": 65186750,
        "end": 65274012,
        "strand": 1,
        "band": "C2",
        "transcript_count": 1,
        "type": "protein_coding",
        "status": "KNOWN",
    }
    test_gene = Gene(**attributes)
    test_gene.save()
    # presumes no genes loaded in fixture data
    self.assertEqual(test_gene.pk, "Pikfyve")
def __init__(self, filename=None):
    """Create an empty gene-set collection and optionally load a GMT source.

    Each input line is split on tabs into ``gsid``, ``name`` and genes.  The
    genes are either the remaining tab-separated columns (more than three
    columns) or a single space-separated third column (exactly three
    columns).  Parsing stops at the first line with fewer than three
    columns.

    For every gene of every parsed set, ``Gene``, ``Organism``, ``Geneset``,
    ``Geneset_membership`` and ``OrganismGS`` objects are built and saved.

    :param filename: ``None`` (load nothing), a path string (the file is
        opened and closed here), or any iterable of text lines such as an
        already-open file object.
    """
    self._genesets = defaultdict(set)
    self._setnames = {}
    self._genes = set()

    if not filename:
        return

    # A plain string is treated as a path.  Iterating over the string
    # itself would yield single characters and silently load nothing.
    opened_here = isinstance(filename, str)
    gmtfile = open(filename) if opened_here else filename
    try:
        for line in gmtfile:
            tok = line.strip().split('\t')
            if len(tok) > 3:
                # tab delim genes column
                gsid, name, genes = tok[0], tok[1], tok[2:]
            elif len(tok) == 3:
                # space delim genes column
                gsid, name, genes = tok[0], tok[1], tok[2].strip().split(" ")
            else:
                break  # TODO: probs dont need this

            self._genesets[gsid] = set(genes)
            self._setnames[gsid] = name
            self._genes |= self._genesets[gsid]

            # create many2many relationships btwn gene, gs, and org
            for g in genes:
                g = Gene(entrezid=g)
                o = Organism(taxonomy_id=9606)
                gs = Geneset(id=gsid, GStype="GO Biological Process",
                             organism=o, setname=name)
                # NOTE(review): these two models are built with positional
                # arguments; confirm the field order matches (g, gs) and
                # (o, gs) in the model definitions.
                gs_membership = Geneset_membership(g, gs)
                org_membership = OrganismGS(o, gs)
                # str() is required: concatenating a str with a model
                # instance raises TypeError.
                print("gs objects= " + str(gs))
                # Save referenced objects before the objects that refer to
                # them: the organism before the gene set that points at it,
                # and both before the membership rows.
                g.save()
                o.save()
                gs.save()
                gs_membership.save()
                org_membership.save()
    finally:
        if opened_here:
            gmtfile.close()
def handle(self, *args, **options):
    """Load or refresh Gene and CrossRef records from a gene_info file.

    Options read (all through ``options.get``):

    * ``taxonomy_id`` -- taxonomy id used to look up the ``Organism``.
    * ``geneinfo`` -- path of the tab-delimited gene_info file.
    * ``symbol_col``, ``systematic_col``, ``alias_col`` -- column indexes
      (converted with ``int``) of the symbol, systematic name and aliases.
    * ``systematic_xrdb`` -- optional cross-reference database name under
      which the systematic name is also recorded (only for lines that
      carry cross references).
    * ``gi_tax_id`` -- optional taxonomy id to match against column 0 of
      the file when it differs from ``taxonomy_id``.

    For every line belonging to the organism, the matching ``Gene`` is
    updated if any tracked attribute changed, or created if its entrez id
    is not yet in the database.  Missing ``CrossRef`` rows are created for
    cross-reference databases that exist as ``CrossRefDB`` rows.  Genes in
    the database that are absent from the file are marked obsolete.

    If fewer than ten lines matched the organism, the command logs an
    error and exits with status 1 *before* marking anything obsolete, so
    that a wrong organism id cannot flag every existing gene.

    If ``taxonomy_id`` or ``geneinfo`` is missing, an error is logged and
    nothing is loaded.
    """
    # Load the organism.
    tax_id = options.get('taxonomy_id')
    org = Organism.objects.get(taxonomy_id=tax_id)

    # geneinfo file information.
    geneinfo_filename = options.get('geneinfo')
    symb_col = int(options.get('symbol_col'))
    syst_col = int(options.get('systematic_col'))
    alias_col = int(options.get('alias_col'))
    systematic_xrdb = options.get('systematic_xrdb')

    # Without both inputs there is nothing to load.  Checking the filename
    # here (rather than a file handle that may never have been bound)
    # makes this error path reachable.
    if not (tax_id and geneinfo_filename):
        logger.error('Couldn\'t load geneinfo %s for org %s.',
                     options.get('geneinfo'), tax_id,
                     exc_info=sys.exc_info(), extra={'options': options})
        return

    # yeast has a taxonomy_id that changed, in this case when we look at
    # the id from NCBI we have to use the new one.
    gi_tax_id = options.get('gi_tax_id') or tax_id

    # Get all genes for this organism from the database.
    entrez_in_db = set(
        Gene.objects.filter(organism=org).values_list('entrezid',
                                                      flat=True))

    # Get all cross reference pairs that refer to a gene from this
    # organism.
    xr_in_db = set()
    for x in CrossRef.objects.filter(
            gene__entrezid__in=entrez_in_db).prefetch_related(
            'crossrefdb', 'gene'):
        xr_in_db.add((x.crossrefdb.name, x.xrid, x.gene.entrezid))

    # Store all the genes seen thus far so we can remove obsolete
    # entries.
    entrez_seen = set()

    # Cache of cross reference databases, which saves hits to DB.
    xrdb_cache = {}

    # Check to make sure the organism matched so that we don't mass-
    # delete for no reason.
    org_matches = 0
    entrez_found = 0  # Found from before.
    entrez_updated = 0  # Found from before and updated.
    entrez_created = 0  # Didn't exist, added.

    # The context manager guarantees the file is closed, even on error.
    with open(geneinfo_filename) as geneinfo_fh:
        for line in geneinfo_fh:
            toks = line.strip().split('\t')
            if toks[symb_col] == "NEWENTRY":
                logger.info("NEWENTRY line skipped")
                continue

            if not (toks[0] == gi_tax_id):
                # From wrong organism, skip.
                continue

            org_matches += 1  # Count lines that came from this organism.

            # Grab requested fields from tab delimited file.
            entrezid = int(toks[1])
            standard_name = toks[symb_col]
            systematic_name = toks[syst_col]
            aliases = toks[alias_col]
            crossrefs = toks[5]
            description = toks[8]
            status = toks[9]
            chromosome = toks[6]

            # This column only gets filled in for certain organisms.
            if (not systematic_name) or (systematic_name == '-'):
                systematic_name = standard_name

            # Gene is actually mitochondrial, change symbol to avoid
            # duplicates (analogous to what GeneCards does).
            if chromosome == "MT":
                if not systematic_name.startswith('MT'):
                    logger.debug(
                        "Renaming %s to %s, mitochondrial version",
                        systematic_name, "MT-" + systematic_name)
                    systematic_name = "MT-" + systematic_name

            alias_str = ""
            alias_num = 0
            if aliases and (aliases != '-'):
                alias_list = [unicode(x) for x in aliases.split('|')]
                alias_num = len(alias_list)
                alias_str = ' '.join(alias_list)

            # Handle cross references.
            xref_tuples = []
            if crossrefs and (crossrefs != '-'):
                xref_tuples = set()
                if (systematic_xrdb):
                    xref_tuples.add((unicode(systematic_xrdb),
                                     unicode(systematic_name)))

                xrefs = [unicode(x) for x in crossrefs.split('|')]
                for x in xrefs:
                    # NOTE(review): an entry with more than one ':' yields
                    # a tuple longer than two; only elements 0 and 1 are
                    # used below.  Confirm this is the intended id.
                    xref_tuples.add(tuple(x.split(':')))

            xref_num = len(xref_tuples)

            # Arbitrary weight for search results.
            # The principle of weighting is that we think people are more
            # likely to want a gene that occurs in more databases or has
            # more aliases b/c it is better-known.  This helps break
            # ordering ties where gene names are identical.
            weight = 2 * xref_num + alias_num

            # We also assume that people are much more likely to want
            # protein coding genes.  In the long term we could measure
            # actual selections and estimate weight per gene.
            if status == 'protein-coding':
                weight = weight * 2

            gene_object = None
            entrez_seen.add(entrezid)
            if entrezid in entrez_in_db:  # This existed already.
                logger.debug("Entrez %s existed already.", entrezid)
                entrez_found += 1
                gene_object = Gene.objects.get(entrezid=entrezid,
                                               organism=org)
                changed = False

                # Update characteristics that may have changed.
                tracked = (
                    ('systematic_name', systematic_name),
                    ('standard_name', standard_name),
                    ('description', description),
                    ('aliases', alias_str),
                    ('weight', weight),
                )
                for attr_name, new_value in tracked:
                    if getattr(gene_object, attr_name) != new_value:
                        setattr(gene_object, attr_name, new_value)
                        changed = True

                # If the gene was marked obsolete but occurs in the
                # gene_info file, then it's not obsolete.
                if gene_object.obsolete:
                    gene_object.obsolete = False
                    changed = True

                if changed:
                    entrez_updated += 1
                    # To save time, only call save() if something has been
                    # changed.
                    gene_object.save()

            else:  # New entrezid observed.
                logger.debug(
                    "Entrez %s did not exist and will be created.",
                    entrezid)
                # aliases is stored on creation too, matching what the
                # update branch maintains for existing genes.
                gene_object = Gene(entrezid=entrezid, organism=org,
                                   systematic_name=systematic_name,
                                   standard_name=standard_name,
                                   description=description,
                                   aliases=alias_str,
                                   obsolete=False, weight=weight)
                gene_object.save()
                entrez_created += 1

            # Add crossreferences.
            for xref_tuple in xref_tuples:
                try:
                    xrdb = xrdb_cache[xref_tuple[0]]
                except KeyError:
                    try:
                        xrdb = CrossRefDB.objects.get(name=xref_tuple[0])
                    except CrossRefDB.DoesNotExist:
                        xrdb = None
                    xrdb_cache[xref_tuple[0]] = xrdb
                if xrdb is None:  # Don't understand crossrefdb, skip.
                    logger.warning(
                        "We encountered an xrdb (%s) not in our"
                        " database for pair %s.",
                        xref_tuple[0], xref_tuple)
                    continue
                logger.debug('Found crossreference pair %s.', xref_tuple)
                # If the record doesn't exist in database, create it and
                # remember it so a repeat later in the file is not
                # inserted a second time.
                xr_key = (xref_tuple[0], xref_tuple[1], entrezid)
                if xr_key not in xr_in_db:
                    xr_obj = CrossRef(crossrefdb=xrdb,
                                      xrid=xref_tuple[1],
                                      gene=gene_object)
                    xr_obj.save()
                    xr_in_db.add(xr_key)

    logger.info(
        "%s entrez identifiers existed in the database and "
        "were found in the new gene_info file", entrez_found)
    logger.info(
        "%s entrez identifiers existed in the database and "
        "were changed in the new gene_info file", entrez_updated)
    logger.info(
        "%s entrez identifiers did not exist and were created "
        "in the new gene_info file", entrez_created)

    # Bail out BEFORE flagging anything obsolete: with a wrong organism id
    # no line matches, entrez_seen stays empty, and every existing gene
    # would otherwise be marked obsolete.
    if org_matches < 10:
        logger.error('Less than ten matches were encountered for '
                     'this organism.  Check the organism ID.')
        sys.exit(1)

    # Update "obsolete" attribute for entrez records that are in the
    # database but not in input file.
    for stale_entrezid in entrez_in_db - entrez_seen:
        gene_object = Gene.objects.get(entrezid=stale_entrezid,
                                       organism=org)
        if not gene_object.obsolete:
            gene_object.obsolete = True
            gene_object.save()
def setUp(self):
    """Populate the test database with organisms, genes and cross references.

    Creates one factory-built organism plus two explicit ones, two
    cross-reference databases, genes with entrez ids 1, 2, 4, 5, 101 and
    102, and four cross references attached to genes 1 and 2.
    """
    def persist(instance):
        # Save the freshly built model instance and hand it back.
        instance.save()
        return instance

    first_org = factory.create(Organism)

    asdf_db = persist(CrossRefDB(name="ASDF", url="http://www.example.com"))
    second_db = persist(
        CrossRefDB(name="XRDB2", url="http://www.example.com/2"))

    # g1 and g2 have both standard and systematic names.
    gene_one = persist(Gene(
        entrezid=1,
        systematic_name="g1",
        standard_name="G1",
        description="asdf",
        organism=first_org,
        aliases="gee1 GEE1",
    ))
    gene_two = persist(Gene(
        entrezid=2,
        systematic_name="g2",
        standard_name="G2",
        description="asdf",
        organism=first_org,
        aliases="gee2 GEE2",
    ))

    persist(CrossRef(crossrefdb=asdf_db, gene=gene_one, xrid="XRID1"))
    persist(CrossRef(crossrefdb=second_db, gene=gene_two, xrid="XRID1"))
    persist(CrossRef(crossrefdb=asdf_db, gene=gene_one, xrid="XRRID1"))
    persist(CrossRef(crossrefdb=asdf_db, gene=gene_two, xrid="XRID2"))

    mouse_org = persist(Organism(
        taxonomy_id=1234,
        common_name="Computer mouse",
        scientific_name="Mus computurus",
        slug="mus-computurus",
    ))
    screen_org = persist(Organism(
        taxonomy_id=4321,
        common_name="Computer screen",
        scientific_name="Monitorus computurus",
        slug="monitorus-computurus",
    ))

    # Make systematic and standard name the same for the following genes,
    # but make organisms different. Skip entrezid 3 since that is used by
    # other tests.
    persist(Gene(
        entrezid=4,
        systematic_name="acdc",
        standard_name="ACDC",
        description="asdf",
        organism=mouse_org,
        aliases="gee4 GEE4",
    ))
    persist(Gene(
        entrezid=5,
        systematic_name="acdc",
        standard_name="ACDC",
        description="asdf",
        organism=screen_org,
        aliases="gee5 GEE5",
    ))

    # g101 has standard name, but no systematic name.
    persist(Gene(entrezid=101, standard_name="std_101", organism=mouse_org))

    # g102 has systematic name, but no standard name.
    persist(Gene(entrezid=102, systematic_name="sys_102",
                 organism=mouse_org))