Exemple #1
0
    def test_empty_standard_and_systematic_names(self):
        """
        Saving a gene must raise ValueError whenever its standard and
        systematic names are both missing, empty, or made up solely of
        whitespace characters (space, tab, new line, etc).
        """
        organism = factory.create(Organism)

        # Each entry holds the name-related keyword arguments for one
        # unnamed gene; every one of them must be rejected on save().
        nameless_cases = (
            # Neither standard_name nor systematic_name is given.
            {},
            # standard_name is whitespace only; systematic_name is left at
            # its field default because it is not passed explicitly.
            {"standard_name": "\t  \n"},
            # Both names are empty strings.
            {"standard_name": "", "systematic_name": ""},
            # Both names are whitespace only.
            {"standard_name": "  ", "systematic_name": "\t  \n "},
        )

        for name_kwargs in nameless_cases:
            candidate = Gene(entrezid=999, organism=organism, **name_kwargs)
            self.assertRaises(ValueError, candidate.save)
def cufflinks_gene_diff_import(experiment_id, filename):
    '''This function imports the data from a gene_exp.diff into :class:`~data.models.GeneExperimentData` objects.

    This function requires a valid experiment_id and a file.

    Every row of the tab-delimited file becomes one saved
    :class:`~data.models.GeneExperimentData`.  The row's ``gene`` column is
    looked up as a ``Gene`` primary key; when no such gene exists, a new
    ``Gene`` with that name is created and saved first.

    :param experiment_id: primary key of the ``mRNASeqExperiment`` the
        measurements belong to.
    :param filename: path of the tab-delimited gene_exp.diff file.
    :returns: a summary string with the number of measurements added and the
        number of genes created.
    '''

    experiment = mRNASeqExperiment.objects.get(pk=experiment_id)
    new_genes = 0
    updated_genes = 0
    with open(filename, 'r') as inputfile:
        for row in csv.DictReader(inputfile, delimiter='\t'):
            # Only the lookup can raise Gene.DoesNotExist, so only the
            # lookup is guarded; the measurement is then built once.
            try:
                gene = Gene.objects.get(pk=row['gene'])
            except Gene.DoesNotExist:
                # create a new gene with that name
                gene = Gene(name=row['gene'])
                gene.save()
                new_genes += 1
            else:
                updated_genes += 1

            datum = GeneExperimentData(
                experiment=experiment,
                gene=gene,
                fold_change=row['log2(fold_change)'],
                p_value=row['p_value'],
                q_value=row['q_value'],
                locus=row['locus'],
                internal_id=row['test_id'],
                sample_1=row['sample_1'],
                sample_2=row['sample_2'],
                amount_1=row['value_1'],
                amount_2=row['value_2'],
                status=row['status'],
                test_statistic=row['test_stat'],
                significant=row['significant'])
            datum.save()
    return "Added %i measurements and created %i new genes." % (
        updated_genes + new_genes, new_genes)
def _measurement_fields(row):
    '''Map one gene_exp.diff row onto GeneExperimentData keyword arguments.'''
    return {
        'fold_change': row['log2(fold_change)'],
        'p_value': row['p_value'],
        'q_value': row['q_value'],
        'locus': row['locus'],
        'internal_id': row['test_id'],
        'sample_1': row['sample_1'],
        'sample_2': row['sample_2'],
        'amount_1': row['value_1'],
        'amount_2': row['value_2'],
        'status': row['status'],
        'test_statistic': row['test_stat'],
        'significant': row['significant'],
    }


def cufflinks_gene_diff_import(experiment_id, filename):
    '''This function imports the data from a gene_exp.diff into :class:`~data.models.GeneExperimentData` objects.

    This function requires a valid experiment_id and a file.

    Each row of the tab-delimited file is saved as one
    :class:`~data.models.GeneExperimentData`.  The ``gene`` column is used as
    a ``Gene`` primary key; if the lookup fails, a ``Gene`` with that name is
    created and saved before the measurement is stored.

    :param experiment_id: primary key of the ``mRNASeqExperiment`` to attach
        the measurements to.
    :param filename: path of the tab-delimited gene_exp.diff file.
    :returns: a summary string reporting how many measurements were added
        and how many genes had to be created.
    '''

    experiment = mRNASeqExperiment.objects.get(pk=experiment_id)
    new_genes = 0
    updated_genes = 0
    with open(filename, 'r') as inputfile:
        for row in csv.DictReader(inputfile, delimiter='\t'):
            # Guard only the lookup: it is the one call that raises
            # Gene.DoesNotExist.
            try:
                gene = Gene.objects.get(pk=row['gene'])
            except Gene.DoesNotExist:
                # create a new gene with that name
                gene = Gene(name=row['gene'])
                gene.save()
                new_genes += 1
            else:
                updated_genes += 1

            datum = GeneExperimentData(experiment=experiment,
                                       gene=gene,
                                       **_measurement_fields(row))
            datum.save()
    return "Added %i measurements and created %i new genes." % (
        updated_genes + new_genes, new_genes)
                
 def test_researcher_absolute_url(self):
     """Check that the absolute url of a gene is **/gene/<name>**."""
     # The gene is only built in memory; it is never saved here.
     pikfyve_fields = {
         "name": "Pikfyve",
         "ensemblID": "ENSMUSG00000025949",
         "chromosome": 1,
         "start": 65186750,
         "end": 65274012,
         "strand": 1,
         "band": "C2",
         "transcript_count": 1,
         "type": "protein_coding",
         "status": "KNOWN",
     }
     gene = Gene(**pikfyve_fields)
     self.assertEqual(gene.get_absolute_url(), "/gene/Pikfyve")
 def test_gene_unicode(self):
     """Check that the unicode representation of a :class:`~genes.models.Gene` is its name."""
     # The gene is only built in memory; it is never saved here.
     pikfyve_fields = {
         "name": "Pikfyve",
         "ensemblID": "ENSMUSG00000025949",
         "chromosome": 1,
         "start": 65186750,
         "end": 65274012,
         "strand": 1,
         "band": "C2",
         "transcript_count": 1,
         "type": "protein_coding",
         "status": "KNOWN",
     }
     gene = Gene(**pikfyve_fields)
     self.assertEqual(gene.__unicode__(), "Pikfyve")
 def test_created_new_gene(self):
     """Check that a :class:`~genes.models.Gene` can be created and saved."""
     pikfyve_fields = {
         "name": "Pikfyve",
         "ensemblID": "ENSMUSG00000025949",
         "chromosome": 1,
         "start": 65186750,
         "end": 65274012,
         "strand": 1,
         "band": "C2",
         "transcript_count": 1,
         "type": "protein_coding",
         "status": "KNOWN",
     }
     gene = Gene(**pikfyve_fields)
     gene.save()
     # presumes no genes loaded in fixture data
     self.assertEqual(gene.pk, "Pikfyve")
    def test_researcher_absolute_url(self):
        """Verify that a gene's absolute url has the form **/gene/<name>**."""
        # Genomic position attributes, grouped to keep the constructor short.
        location = dict(chromosome=1, start=65186750, end=65274012,
                        strand=1, band="C2")
        gene = Gene(name="Pikfyve", ensemblID="ENSMUSG00000025949",
                    transcript_count=1, type="protein_coding",
                    status="KNOWN", **location)

        url = gene.get_absolute_url()
        self.assertEqual(url, "/gene/Pikfyve")
    def test_gene_unicode(self):
        """Verify that the unicode representation of a :class:`~genes.models.Gene` is its name."""
        # Genomic position attributes, grouped to keep the constructor short.
        location = dict(chromosome=1, start=65186750, end=65274012,
                        strand=1, band="C2")
        gene = Gene(name="Pikfyve", ensemblID="ENSMUSG00000025949",
                    transcript_count=1, type="protein_coding",
                    status="KNOWN", **location)

        rendered = gene.__unicode__()
        self.assertEqual(rendered, "Pikfyve")
    def test_created_new_gene(self):
        """Verify that a :class:`~genes.models.Gene` can be created and saved."""
        # Genomic position attributes, grouped to keep the constructor short.
        location = dict(chromosome=1, start=65186750, end=65274012,
                        strand=1, band="C2")
        gene = Gene(name="Pikfyve", ensemblID="ENSMUSG00000025949",
                    transcript_count=1, type="protein_coding",
                    status="KNOWN", **location)
        gene.save()

        # presumes no genes loaded in fixture data
        self.assertEqual(gene.pk, "Pikfyve")
Exemple #10
0
    def __init__(self, filename=None):
        """
        Build the in-memory gene-set lookups and persist their contents.

        Three lookups are always initialised: ``_genesets`` (set id -> set of
        gene ids), ``_setnames`` (set id -> set name) and ``_genes`` (every
        gene id seen).  When ``filename`` is given, its GMT lines are parsed
        into those lookups and the matching database rows are saved.

        :param filename: either a path to a GMT file, which is opened and
            closed here, or an already-open iterable of GMT lines.  Falsy
            values leave the instance empty.
        """
        self._genesets = defaultdict(set)
        self._setnames = {}
        self._genes = set()

        if not filename:
            return

        if isinstance(filename, str):
            # Iterating a path string would walk its characters, not the
            # lines of the file, so open it first.
            with open(filename) as gmtfile:
                self._load_gmt(gmtfile)
        else:
            self._load_gmt(filename)

    def _load_gmt(self, gmtfile):
        """Parse GMT lines from ``gmtfile`` and save the rows they describe."""
        for line in gmtfile:
            tok = line.strip().split('\t')
            if len(tok) > 3:  # tab delim genes column
                gsid, name, genes = tok[0], tok[1], tok[2:]
            elif len(tok) == 3:  # space delim genes column
                gsid, name, genes = tok[0], tok[1], tok[2].strip().split(" ")
            else:
                # Parsing stops at the first line with fewer than three
                # columns; any later lines are not read.
                break
            # TODO: probs dont need this
            self._genesets[gsid] = set(genes)
            self._setnames[gsid] = name
            self._genes |= self._genesets[gsid]

            # create many2many relationships btwn gene, gs, and org
            for entrez in genes:
                # Save each object before anything that refers to it.
                organism = Organism(taxonomy_id=9606)
                organism.save()
                gene = Gene(entrezid=entrez)
                gene.save()
                geneset = Geneset(id=gsid,
                                  GStype="GO Biological Process",
                                  organism=organism,
                                  setname=name)
                # %-formatting, because str + model instance is a TypeError.
                print("gs objects= %s" % geneset)
                geneset.save()

                gs_membership = Geneset_membership(gene, geneset)
                gs_membership.save()
                org_membership = OrganismGS(organism, geneset)
                org_membership.save()
    def handle(self, *args, **options):
        """
        Create or refresh the genes of one organism from a tab-delimited
        gene_info file.

        Options read: ``taxonomy_id``, ``geneinfo`` (file path),
        ``symbol_col``, ``systematic_col``, ``alias_col`` (column indexes),
        ``systematic_xrdb`` and ``gi_tax_id``.

        For every line whose first column equals the organism's tax id, the
        gene identified by column 1 (entrez id) is created, or updated if any
        of its names, description, aliases, weight or obsolete flag changed.
        Cross-references from column 5 are stored when their database is
        known.  Genes already in the database but absent from the file are
        marked obsolete.

        Calls ``sys.exit(1)`` when fewer than ten lines matched the organism.
        Logs an error and returns when the tax id or file name is missing.
        """
        # Load the organism.
        tax_id = options.get('taxonomy_id')
        org = Organism.objects.get(taxonomy_id=tax_id)

        # geneinfo file information.
        geneinfo_filename = options.get('geneinfo')
        symb_col = int(options.get('symbol_col'))
        syst_col = int(options.get('systematic_col'))
        alias_col = int(options.get('alias_col'))
        systematic_xrdb = options.get('systematic_xrdb')

        # yeast has a taxonomy_id that changed, in this case when we look at
        # the id from NCBI we have to use the new one.
        gi_tax_id = tax_id
        if options.get('gi_tax_id'):
            gi_tax_id = options.get('gi_tax_id')

        # Without both a tax id and a file there is nothing to load.  Checking
        # the file name (not a handle) keeps this path free of NameError when
        # no geneinfo option was supplied.
        if not (tax_id and geneinfo_filename):
            logger.error('Couldn\'t load geneinfo %s for org %s.',
                         options.get('geneinfo'),
                         tax_id,
                         exc_info=sys.exc_info(),
                         extra={'options': options})
            return

        # Get all genes for this organism from the database.
        entrez_in_db = set(
            Gene.objects.filter(organism=org).values_list('entrezid',
                                                          flat=True))

        # Get all cross reference pairs that refer to a gene from this
        # organism.
        xr_in_db = set()
        for x in CrossRef.objects.filter(
                gene__entrezid__in=entrez_in_db).prefetch_related(
                    'crossrefdb', 'gene'):
            xr_in_db.add((x.crossrefdb.name, x.xrid, x.gene.entrezid))

        # Store all the genes seen thus far so we can remove obsolete
        # entries.
        entrez_seen = set()
        # Cache of cross reference databases, which saves hits to DB.
        xrdb_cache = {}
        # Check to make sure the organism matched so that we don't mass-
        # delete for no reason.
        org_matches = 0
        entrez_found = 0  # Found from before.
        entrez_updated = 0  # Found from before and updated.
        entrez_created = 0  # Didn't exist, added.

        # The context manager closes the file even if a line fails to parse.
        with open(geneinfo_filename) as geneinfo_fh:
            for line in geneinfo_fh:
                toks = line.strip().split('\t')
                if toks[symb_col] == "NEWENTRY":
                    logger.info("NEWENTRY line skipped")
                    continue

                # NOTE(review): this compares a string column against the
                # option value as given; confirm the option is not an int.
                if not (toks[0] == gi_tax_id):  # From wrong organism, skip.
                    continue

                org_matches += 1  # Count lines that came from this organism.
                # Grab requested fields from tab delimited file.
                (entrezid, standard_name, systematic_name, aliases, crossrefs,
                 description, status,
                 chromosome) = (int(toks[1]), toks[symb_col], toks[syst_col],
                                toks[alias_col], toks[5], toks[8], toks[9],
                                toks[6])

                # This column only gets filled in for certain organisms.
                if (not systematic_name) or (systematic_name == '-'):
                    systematic_name = standard_name
                # Gene is actually mitochondrial, change symbol to avoid
                # duplicates (analogous to what GeneCards does).
                if chromosome == "MT":
                    if not systematic_name.startswith('MT'):
                        logger.debug(
                            "Renaming %s to %s, mitochondrial version",
                            systematic_name, "MT-" + systematic_name)
                        systematic_name = "MT-" + systematic_name

                alias_str = ""
                alias_num = 0
                if aliases and (aliases != '-'):
                    alias_list = [unicode(x) for x in aliases.split('|')]
                    alias_num = len(alias_list)
                    alias_str = ' '.join(alias_list)

                # Handle cross references.
                xref_tuples = []
                if crossrefs and (crossrefs != '-'):
                    xref_tuples = set()
                    if (systematic_xrdb):
                        xref_tuples.add((unicode(systematic_xrdb),
                                         unicode(systematic_name)))

                    xrefs = [unicode(x) for x in crossrefs.split('|')]
                    for x in xrefs:
                        xref_tuples.add(tuple(x.split(':')))

                xref_num = len(xref_tuples)
                # Arbitrary weight for search results.
                # The principle of weighting is that we think people are more
                # likely to want a gene that occurs in more databases or has
                # more aliases b/c it is better-known.  This helps break
                # ordering ties where gene names are identical.
                weight = 2 * xref_num + alias_num

                # We also assume that people are much more likely to want
                # protein coding genes.  In the long term we could measure
                # actual selections and estimate weight per gene.
                if status == 'protein-coding':
                    weight = weight * 2

                gene_object = None
                entrez_seen.add(entrezid)
                if entrezid in entrez_in_db:  # This existed already.
                    logger.debug("Entrez %s existed already.", entrezid)
                    entrez_found += 1
                    gene_object = Gene.objects.get(entrezid=entrezid,
                                                   organism=org)
                    changed = False
                    # The following lines update characteristics that may have
                    # changed.
                    if gene_object.systematic_name != systematic_name:
                        gene_object.systematic_name = systematic_name
                        changed = True
                    if gene_object.standard_name != standard_name:
                        gene_object.standard_name = standard_name
                        changed = True
                    if gene_object.description != description:
                        gene_object.description = description
                        changed = True
                    if gene_object.aliases != alias_str:
                        gene_object.aliases = alias_str
                        changed = True
                    if gene_object.weight != weight:
                        gene_object.weight = weight
                        changed = True
                    # If the gene was marked obsolete but occurs in the
                    # gene_info file, then it's not obsolete.
                    if gene_object.obsolete:
                        gene_object.obsolete = False
                        changed = True
                    if changed:
                        entrez_updated += 1
                        # To save time, only call save() if something has been
                        # changed.
                        gene_object.save()

                else:  # New entrezid observed.
                    logger.debug(
                        "Entrez %s did not exist and will be created.",
                        entrezid)
                    # Aliases are stored at creation too, so a new gene does
                    # not need a second run to receive them.
                    gene_object = Gene(entrezid=entrezid,
                                       organism=org,
                                       systematic_name=systematic_name,
                                       standard_name=standard_name,
                                       description=description,
                                       aliases=alias_str,
                                       obsolete=False,
                                       weight=weight)
                    gene_object.save()
                    entrez_created += 1

                # Add crossreferences.
                for xref_tuple in xref_tuples:
                    try:
                        xrdb = xrdb_cache[xref_tuple[0]]
                    except KeyError:
                        try:
                            xrdb = CrossRefDB.objects.get(name=xref_tuple[0])
                        except CrossRefDB.DoesNotExist:
                            xrdb = None
                        xrdb_cache[xref_tuple[0]] = xrdb
                    if xrdb is None:  # Don't understand crossrefdb, skip.
                        logger.warning(
                            "We encountered an xrdb (%s) not in our"
                            " database for pair %s.", xref_tuple[0],
                            xref_tuple)
                        continue
                    logger.debug('Found crossreference pair %s.', xref_tuple)
                    # If the record doesn't exist in database, create it, and
                    # remember it so a repeat later in the file is skipped.
                    xref_key = (xref_tuple[0], xref_tuple[1], entrezid)
                    if xref_key not in xr_in_db:
                        xr_obj = CrossRef(crossrefdb=xrdb,
                                          xrid=xref_tuple[1],
                                          gene=gene_object)
                        xr_obj.save()
                        xr_in_db.add(xref_key)

        # Update "obsolete" attribute for entrez records that are in the
        # database but not in input file.
        for stale_entrezid in entrez_in_db:
            if stale_entrezid not in entrez_seen:
                gene_object = Gene.objects.get(entrezid=stale_entrezid,
                                               organism=org)
                if not gene_object.obsolete:
                    gene_object.obsolete = True
                    gene_object.save()

        logger.info(
            "%s entrez identifiers existed in the database and "
            "were found in the new gene_info file", entrez_found)
        logger.info(
            "%s entrez identifiers existed in the database and "
            "were changed in the new gene_info file", entrez_updated)
        logger.info(
            "%s entrez identifiers did not exist and were created "
            "in the new gene_info file", entrez_created)
        if org_matches < 10:
            logger.error('Less than ten matches were encountered for '
                         'this organism.  Check the organism ID.')
            sys.exit(1)
Exemple #12
0
    def setUp(self):
        """
        Populate the test database with three organisms, two cross-reference
        databases, six genes and four cross-references.
        """
        base_org = factory.create(Organism)

        asdf_db = CrossRefDB(name="ASDF", url="http://www.example.com")
        asdf_db.save()
        second_db = CrossRefDB(name="XRDB2", url="http://www.example.com/2")
        second_db.save()

        # The first two genes carry both standard and systematic names.
        gene_one = Gene(entrezid=1, systematic_name="g1", standard_name="G1",
                        description="asdf", organism=base_org,
                        aliases="gee1 GEE1")
        gene_one.save()
        gene_two = Gene(entrezid=2, systematic_name="g2", standard_name="G2",
                        description="asdf", organism=base_org,
                        aliases="gee2 GEE2")
        gene_two.save()

        # Cross-references are saved in this exact order.
        xref_specs = (
            (asdf_db, gene_one, "XRID1"),
            (second_db, gene_two, "XRID1"),
            (asdf_db, gene_one, "XRRID1"),
            (asdf_db, gene_two, "XRID2"),
        )
        for xref_db, xref_gene, xref_id in xref_specs:
            link = CrossRef(crossrefdb=xref_db, gene=xref_gene, xrid=xref_id)
            link.save()

        mouse_org = Organism(taxonomy_id=1234,
                             common_name="Computer mouse",
                             scientific_name="Mus computurus",
                             slug="mus-computurus")
        mouse_org.save()
        screen_org = Organism(taxonomy_id=4321,
                              common_name="Computer screen",
                              scientific_name="Monitorus computurus",
                              slug="monitorus-computurus")
        screen_org.save()

        # These two genes share systematic and standard names but belong to
        # different organisms. Entrezid 3 is skipped since that is used by
        # other tests.
        mouse_acdc = Gene(entrezid=4, systematic_name="acdc",
                          standard_name="ACDC", description="asdf",
                          organism=mouse_org, aliases="gee4 GEE4")
        mouse_acdc.save()
        screen_acdc = Gene(entrezid=5, systematic_name="acdc",
                           standard_name="ACDC", description="asdf",
                           organism=screen_org, aliases="gee5 GEE5")
        screen_acdc.save()

        # Standard name only, no systematic name.
        standard_only = Gene(entrezid=101, standard_name="std_101",
                             organism=mouse_org)
        standard_only.save()

        # Systematic name only, no standard name.
        systematic_only = Gene(entrezid=102, systematic_name="sys_102",
                               organism=mouse_org)
        systematic_only.save()