def clean_genome(self):
    """Validate the 'genome' form field and return the fetched genome record.

    The submitted value is stripped of surrounding whitespace and passed to
    entrez_utils.get_genome. Whatever that call returns becomes the cleaned
    value of the field.

    Raises:
        forms.ValidationError: if entrez_utils.get_genome raises
            entrez_utils.EntrezException for the given accession.
    """
    accession = self.cleaned_data['genome'].strip()
    try:
        return entrez_utils.get_genome(accession)
    except entrez_utils.EntrezException:
        # Surface lookup failures as a user-facing form error.
        raise forms.ValidationError("Invalid RefSeq accession number.")
def fill_gene_type_field():
    """Populate the gene_type field of every stored gene.

    For each Genome in the database, the genome record and its gene list are
    fetched again through entrez_utils. Every Gene row of that genome whose
    locus tag appears in the fetched list gets its gene_type overwritten and
    is saved; locus tags that cannot be matched are reported on stdout.
    """
    for genome in models.Genome.objects.all():
        # Progress output: one line per genome being processed.
        print(genome.genome_accession)

        record = entrez_utils.get_genome(genome.genome_accession)
        # locus_tag -> gene_type lookup built from the fetched gene list.
        type_by_locus_tag = dict(
            (fetched['locus_tag'], fetched['gene_type'])
            for fetched in entrez_utils.get_genes(record))

        stored_genes = models.Gene.objects.filter(genome=genome).all()
        for gene in stored_genes:
            if gene.locus_tag not in type_by_locus_tag:
                print("Missing locus tag %s" % (gene.locus_tag,))
                continue
            gene.gene_type = type_by_locus_tag[gene.locus_tag]
            gene.save()
def clean_genome_accession_helper(self, genome_accession):
    """Validate a genome accession and import the genome when it is new.

    The accession must contain a version suffix (a '.') and start with
    'NC_'. If no Genome with that accession exists in the database, the
    genome record, its taxonomy information and its gene list are fetched
    through entrez_utils, and the results are handed to new_taxonomy and
    new_genome (presumably these persist them to the CollecTF database;
    confirm against their definitions).

    Gets called for all genome accession fields.

    Returns:
        The accession with surrounding whitespace stripped.

    Raises:
        forms.ValidationError: if the accession is malformed, or if any of
            the entrez_utils lookups raises entrez_utils.EntrezException.
    """
    accession = genome_accession.strip()

    # Cheap format checks first, before touching the database or the network.
    if '.' not in accession:
        raise forms.ValidationError(""" Please enter RefSeq accession number with the version number.""")
    if not accession.startswith('NC_'):
        raise forms.ValidationError(
            "RefSeq genome accession number should start with 'NC_'")

    # Nothing more to do when the genome is already stored.
    try:
        Genome.objects.get(genome_accession=accession)
    except Genome.DoesNotExist:
        pass
    else:
        return accession

    # Genome record.
    try:
        genome_record = entrez_utils.get_genome(accession)
    except entrez_utils.EntrezException:
        raise forms.ValidationError(""" Can not fetch genome record from NCBI. Check accession number. """)

    # Taxonomy record: the result is discarded, the call only has to succeed.
    try:
        entrez_utils.get_organism_taxon(genome_record)
    except entrez_utils.EntrezException:
        raise forms.ValidationError(
            "Can not fetch strain taxonomy information.")

    # Gene list.
    try:
        gene_list = entrez_utils.get_genes(genome_record)
    except entrez_utils.EntrezException:
        raise forms.ValidationError(""" Can't retrieve list of genes. Check genome accession number.""")

    # Create the taxonomy entry first; the genome creation takes it as input.
    taxon = new_taxonomy(genome_record)
    new_genome(genome_record, gene_list, taxon)
    return accession
def batch_refseq_accession(genome_accession):
    """Retrieve the product (NP/YP/WP) accession numbers of a genome.

    The genome record and its gene list are fetched through entrez_utils.
    Only genes carrying exactly one protein id contribute; each of them is
    indexed by its locus tag and by every one of its old locus tags.

    Returns:
        Dictionary {locus_tag: RefSeq_acc}.
    """
    print("Downloading %s" % (genome_accession,))
    record = entrez_utils.get_genome(genome_accession)

    accession_by_tag = {}
    for gene in entrez_utils.get_genes(record):
        protein_ids = gene['protein_id']
        # Skip genes without a product or with more than one product.
        if not protein_ids or len(protein_ids) != 1:
            continue
        # The accession is stored as given; any version suffix is kept.
        accession = protein_ids[0]
        accession_by_tag[gene['locus_tag']] = accession
        for former_tag in gene['old_locus_tag']:
            accession_by_tag[former_tag] = accession
    return accession_by_tag
def batch_refseq_accession(genome_accession):
    """Build a locus-tag lookup of protein accessions for one genome.

    Fetches the genome and its genes via entrez_utils and keeps the genes
    that have exactly one protein id. Current and old locus tags of such a
    gene all map to that single protein id (the NP/YP/WP accession).

    Returns:
        Dictionary {locus_tag: RefSeq_acc}.
    """
    print("Downloading %s" % (genome_accession,))
    genome_record = entrez_utils.get_genome(genome_accession)
    fetched_genes = entrez_utils.get_genes(genome_record)

    lookup = {}
    for entry in fetched_genes:
        ids = entry['protein_id']
        if ids and len(ids) == 1:
            # Kept verbatim: the version part of the accession is not removed.
            product = ids[0]
            # Current tag first, then the old tags, all pointing at the same
            # product accession.
            for tag in [entry['locus_tag']] + list(entry['old_locus_tag']):
                lookup[tag] = product
    return lookup