def handle(self, *args, **options):
    """Load Disease Ontology genesets annotated through OMIM gene links.

    Steps, in order:
      1. Look up the creating user (``options['user']``) and the human
         organism record; log an error and exit if either is missing.
      2. Download the Disease Ontology OBO file from ``DO_URL``, parse it,
         and collect the ``xref: OMIM:`` ids listed under each DOID term.
      3. Download OMIM's ``mim2gene.txt`` and ``genemap`` files and build a
         mapping from OMIM disease id to (gene id, status) tuples.
      4. Annotate the ontology terms with the matching genes and propagate
         the annotations through the ontology.
      5. For every term with annotations, create or update a public
         ``Geneset`` and store a new ``Version`` when the annotations
         differ from the most recent stored version.

    Raises:
        SystemExit: if the user or the organism cannot be found.
    """
    user_name = options.get('user')
    try:
        user = User.objects.get(username=user_name)
    except User.DoesNotExist:
        logger.error('The user %s did not exist.', user_name,
                     extra={'options': options})
        sys.exit()

    # Only exists for human
    organism_name = 'Homo sapiens'
    try:
        org = Organism.objects.get(scientific_name=organism_name)
    except Organism.DoesNotExist:
        logger.error('The organism %s did not exist.', organism_name,
                     extra={'options': options})
        sys.exit()

    obo_r = requests.get(DO_URL)
    disease_ontology = go()
    disease_ontology.parse(StringIO(obo_r.text))

    # Collect DOID -> set of OMIM ids from the raw OBO text.
    # Loop from Dima @ Princeton, rewritten as a single forward pass so a
    # file that ends inside a [Term] stanza no longer raises IndexError.
    doid_omim = {}
    in_term = False
    doid = None
    for line in obo_r.text.splitlines():
        if not in_term:
            if line == '[Term]':
                in_term = True
                doid = None
            continue
        if line == '':
            # A blank line closes the current stanza.
            in_term = False
            continue
        if line.startswith('id:'):
            doid_match = re.search('DOID:[0-9]+', line)
            doid = doid_match.group(0) if doid_match else None
        if line.startswith('xref: OMIM:') and doid is not None:
            omim = re.search('[0-9]+', line).group(0)
            doid_omim.setdefault(doid, set()).add(omim)

    # MIM number -> gene id, restricted to the entry types in LIMIT_TYPE.
    # NOTE(review): `retr` is not part of the stock requests.Session API;
    # presumably an FTP adapter is patched in elsewhere -- confirm.
    # NOTE(review): the anonymous-FTP password looks redacted in this copy
    # of the file -- confirm the intended value.
    mim_gene = {}
    s = requests.Session()
    mim2gene_list = s.retr(
        OMIM_FTP + 'mim2gene.txt',
        auth=("anonymous", "*****@*****.**")).text.splitlines()
    for line in mim2gene_list:  # Loop from Dima @ Princeton
        toks = line.split('\t')
        if len(toks) < 3:
            # Blank or non-tabular line: nothing to map.
            continue
        mim, gtype, gid = toks[0], toks[1], toks[2]
        if gtype in LIMIT_TYPE:
            if mim in mim_gene:
                logger.warning("MIM already exists: %s", mim)
            mim_gene[mim] = gid

    # OMIM disease id -> mim_disease record with its (gene id, status) tuples.
    mimdiseases = {}
    genemap_list = s.retr(
        OMIM_FTP + "genemap",
        auth=("anonymous", "*****@*****.**")).text.splitlines()
    for genemap_line in genemap_list:  # Loop from Dima @ Princeton
        # The choice of fields relies on info from the genemap.key file
        # from omim
        fields = genemap_line.split('|')
        if len(fields) < 16:
            # Too few columns to hold the status/gene/disorder fields.
            continue
        status = fields[6].strip()
        mim_geneid = fields[9].strip()
        # Fields 14 and 15 are continuations of the disorder field.
        disorders = ' '.join(
            part for part in (fields[13].strip(), fields[14].strip(),
                              fields[15].strip())
            if part != '')
        if (disorders == '' or status not in LIMIT_STATUS
                or mim_geneid not in mim_gene):
            continue

        tuple_gid_status = (mim_gene[mim_geneid], status)
        for disorder in disorders.split(';'):
            # TODO: Make sure to include ? and [
            if '[' in disorder or '?' in disorder:
                continue
            mim_info = re.search(FIND_MIMID, disorder)
            if not mim_info:
                continue
            info_split = mim_info.group(0).split(' ')
            mim_disease_id = info_split[1].strip()
            mim_phetype = info_split[2].strip()
            if mim_phetype != LIMIT_PHENO:
                continue
            if mim_disease_id not in mimdiseases:
                entry = mim_disease()
                entry.mimid = mim_disease_id
                entry.phe_mm = mim_phetype
                mimdiseases[mim_disease_id] = entry
            entry = mimdiseases[mim_disease_id]
            if '{' in disorder:
                entry.is_susceptibility = 1
            if tuple_gid_status not in entry.genetuples:
                entry.genetuples.append(tuple_gid_status)

    logger.debug(disease_ontology.go_terms)

    # Attach genes to ontology terms; cache Entrez id -> Gene pk so each
    # gene is fetched from the database only once.
    entrez_gid = {}
    for doid, omim_ids in doid_omim.items():
        term = disease_ontology.get_term(doid)
        if term is None:
            continue
        logger.info("Processing %s", term)
        for omim_id in omim_ids:
            if omim_id not in mimdiseases:
                continue
            for gene_tuple in mimdiseases[omim_id].genetuples:
                entrez = int(gene_tuple[0])
                if entrez not in entrez_gid:
                    gene = Gene.objects.get(entrezid=entrez)
                    entrez_gid[entrez] = gene.id
                term.add_annotation(gid=entrez_gid[entrez], ref=None)

    disease_ontology.populated = True  # mark annotated
    disease_ontology.propagate()  # prop annotations

    for (term_id, term) in disease_ontology.go_terms.items():
        if not term.annotations:
            continue
        logger.info("Creating %s", term)
        # make first 50 chars into a slug
        slug = slugify(' '.join(
            (term.go_id, org.scientific_name, term.full_name)))[:50]
        doid = term.go_id
        do_num = doid.split(':')[1]
        # construct title
        title = 'DO' + '-' + do_num + ':' + term.full_name

        # construct abstract; the OMIM ids are sorted so the text is
        # identical from run to run.
        omim_clause = ''
        omim_list = sorted(doid_omim.get(doid, ()))
        if omim_list:
            omim_clause = (' Annotations directly to this term are provided'
                           ' by the OMIM disease ID')
            if len(omim_list) == 1:
                omim_clause = omim_clause + ' ' + omim_list[0]
            else:
                omim_clause = (omim_clause + 's ' + ', '.join(omim_list[:-1])
                               + ' and ' + omim_list[-1])
            omim_clause = omim_clause + '.'
        abstract = ''
        if term.description:
            abstract += term.description
        abstract += (' Annotations from child terms in the disease ontology'
                     ' are propagated through transitive closure.'
                     + omim_clause)
        logger.info(abstract)

        # get geneset; `changed` flags whether we need to call save
        changed = False
        try:
            gs_obj = Geneset.objects.get(slug=slug, creator=user)
        except Geneset.DoesNotExist:
            gs_obj = Geneset(title=title, slug=slug, creator=user,
                             organism=org, public=True, abstract=abstract)
            changed = True
        else:
            # all these genesets should be public
            if not gs_obj.public:
                gs_obj.public = True
                changed = True
            if gs_obj.title != title:
                gs_obj.title = title
                changed = True
            if gs_obj.abstract != abstract:
                gs_obj.abstract = abstract
                changed = True
        if changed:
            gs_obj.save()

        # load annotations
        most_recent_versions = Version.objects.filter(
            geneset=gs_obj).order_by('-commit_date')[:1]
        annots = set((annotation.gid, annotation.ref)
                     for annotation in term.annotations)
        version_description = ''
        most_recent_version = None
        if most_recent_versions:
            most_recent_version = most_recent_versions[0]
            if most_recent_version.commit_date > timezone.now():
                logger.error('Version from the future: %s.',
                             most_recent_version)
            new = annots - most_recent_version.annotations
            removed = most_recent_version.annotations - annots
            if new or removed:
                version_description = (
                    'Added ' + str(len(new)) + ' and removed '
                    + str(len(removed))
                    + ' annotations from OMIM and the disease ontology.')
        else:
            version_description = (
                'Created with ' + str(len(annots))
                + ' annotations from OMIM and the disease ontology.')

        # An empty description means nothing changed: store no new version.
        if version_description:
            v_obj = Version(geneset=gs_obj, creator=user,
                            parent=most_recent_version,
                            commit_date=timezone.now())
            v_obj.description = version_description
            v_obj.annotations = annots
            v_obj.save()
def handle(self, *args, **options):
    """Load KEGG record genesets for one organism.

    Options read:
        user: username of the geneset/version creator.
        organism: scientific name of the organism to load.
        kegg_record_types: optional comma-separated record types; when
            absent, ``KEGG_RECORD_TYPES`` is used.
        gene_id: optional gene identifier type of the KEGG members
            (``'systematic_name'`` or ``'standard_name'``); when absent,
            members are matched on Entrez id.

    For every record of every requested type, the matching ``Geneset`` is
    created or has its title/abstract refreshed, and a new ``Version`` is
    stored when the annotations differ from the most recent version.

    Returns:
        False when ``gene_id`` names an unsupported identifier type;
        otherwise None.

    Raises:
        SystemExit: if the user or the organism cannot be found.
    """
    user_name = options.get('user')
    try:
        user = User.objects.get(username=user_name)
    except User.DoesNotExist:
        logger.error('The user %s did not exist.', user_name,
                     extra={'options': options})
        sys.exit()

    organism_name = options.get('organism')
    try:
        org = Organism.objects.get(scientific_name=organism_name)
    except Organism.DoesNotExist:
        logger.error('The organism %s did not exist.', organism_name,
                     extra={'options': options})
        sys.exit()

    version = get_kegg_version(KEGG_URL_BASE)
    if version is None:
        logger.error('The KEGG api may have changed. Release no longer '
                     'starts with "Release".')
        # Keep going, but use a printable label: concatenating None into
        # the version description below would raise TypeError.
        version_label = 'unknown'
    else:
        logger.info('Working with KEGG version %s.', version)
        version_label = version

    requested_types = options.get('kegg_record_types')
    if requested_types:
        kegg_record_types = tuple(
            requested_types.replace(" ", "").split(","))
        logger.info('Requested KEGG Record Types are: %s',
                    str(kegg_record_types))
    else:
        kegg_record_types = KEGG_RECORD_TYPES
        logger.info('Using pre-set KEGG Record Types')

    # Decide once which Gene field the KEGG members are matched against.
    # Validating here, before the loops, means an unsupported value fails
    # before any geneset has been fetched or saved.
    gene_id = options.get('gene_id')
    if not gene_id:
        gene_lookup = 'entrezid__in'
    elif gene_id == 'systematic_name':
        gene_lookup = 'systematic_name__in'
    elif gene_id == 'standard_name':
        gene_lookup = 'standard_name__in'
    else:
        logger.error('gene_id entered is not supported (yet)')
        return False

    for record_type in kegg_record_types:
        logger.info('Starting record type %s.', record_type)
        record_members = get_kegg_members(
            KEGG_URL_BASE, KEGG_NAMES[org.scientific_name], record_type)
        for (record, members) in record_members.items():
            if record_type == 'Module':
                # for modules, they are prefixed with species_
                record = record.split('_')[-1]
            slug = slugify(org.scientific_name + ' ' + record)
            gs_info = get_kegg_info(KEGG_URL_BASE, record)
            # make title more search friendly
            gs_info['title'] = ('KEGG-' + record_type + '-' + record + ': '
                                + gs_info['title'])

            try:
                geneset = Geneset.objects.get(slug=slug)
            except Geneset.DoesNotExist:
                geneset = Geneset(creator=user, title=gs_info['title'],
                                  organism=org,
                                  abstract=gs_info['abstract'],
                                  slug=slug, public=True)
                geneset.save()
            else:
                changed = False
                if geneset.title != gs_info['title']:
                    geneset.title = gs_info['title']
                    changed = True
                if geneset.abstract != gs_info['abstract']:
                    geneset.abstract = gs_info['abstract']
                    changed = True
                if changed:
                    geneset.save()

            annots = frozenset(
                (gene.pk, None)
                for gene in Gene.objects.filter(**{gene_lookup: members}))

            most_recent_versions = Version.objects.filter(
                geneset=geneset).order_by('-commit_date')[:1]
            description = ''
            most_recent_version = None
            if most_recent_versions:
                most_recent_version = most_recent_versions[0]
                if most_recent_version.commit_date > timezone.now():
                    logger.error('Version from the future: %s.',
                                 most_recent_version)
                new = annots - most_recent_version.annotations
                removed = most_recent_version.annotations - annots
                if new or removed:
                    description = ('Added ' + str(len(new))
                                   + ' and removed ' + str(len(removed))
                                   + ' annotations from KEGG version '
                                   + version_label + '.')
            else:
                description = ('Created with ' + str(len(annots))
                               + ' annotations from KEGG version '
                               + version_label + '.')

            # An empty description means nothing changed: store no new
            # version.
            if description:
                v_obj = Version(geneset=geneset, creator=user,
                                parent=most_recent_version,
                                commit_date=timezone.now())
                v_obj.description = description
                v_obj.annotations = annots
                v_obj.save()
def handle(self, *args, **options):
    """Load Gene Ontology genesets for one organism.

    Options read:
        user: username of the geneset/version creator.
        organism: scientific name of the organism to load.
        evcodes: optional comma-separated evidence codes; when given, only
            annotations carrying one of them are kept.
        remote: when not None, the OBO and annotation files are fetched
            from ``GO_OBO_URL`` / ``GO_ASSOC_FTP``; otherwise the local
            paths in ``obo`` and ``annot`` are used.
        tair, only_wb, leading, pseudomonas: organism-specific switches
            that change how gene identifiers are taken from each line.
        initial: when set, one ``Version`` per annotation date is built
            instead of a single version dated now.

    Raises:
        SystemExit: if the user or organism cannot be found, or the OBO
            file cannot be loaded.
    """
    user_name = options.get('user')
    try:
        user = User.objects.get(username=user_name)
    except User.DoesNotExist:
        logger.error('The user %s did not exist.', user_name,
                     extra={'options': options})
        sys.exit()

    organism_name = options.get('organism')
    try:
        org = Organism.objects.get(scientific_name=organism_name)
    except Organism.DoesNotExist:
        logger.error('The organism %s did not exist.', organism_name,
                     extra={'options': options})
        sys.exit()

    accepted_evcodes = None
    if options.get('evcodes'):
        accepted_evcodes = set(options.get('evcodes').split(','))

    gene_ontology = go()
    remote = options.get('remote') is not None
    obo_location = GO_OBO_URL if remote else options.get('obo')
    loaded_obo = gene_ontology.load_obo(obo_location,
                                        remote_location=remote, timeout=5)
    if not loaded_obo:
        logger.error("Couldn't load OBO file %s with remote equal to %s.",
                     obo_location, remote)
        sys.exit()

    # Read the whole gzipped annotation file into memory, then close the
    # source handle even if the read fails.
    if remote:
        annot_zip_fh = urllib2.urlopen(
            GO_ASSOC_FTP + '.'.join((GO_ASSOC_PREFIX,
                                     GO_NAMES[org.scientific_name],
                                     GO_ASSOC_SUFFIX)),
            timeout=5)
    else:
        # Binary mode: the file holds gzip-compressed bytes.
        annot_zip_fh = open(options.get('annot'), 'rb')
    try:
        annot_fh = gzip.GzipFile(fileobj=io.BytesIO(annot_zip_fh.read()))
    finally:
        annot_zip_fh.close()

    # Compile the TAIR locus pattern once rather than once per line.
    tair_regex = None
    if options.get('tair'):
        import re
        tair_regex = re.compile('AT[0-9MC]G[0-9][0-9][0-9][0-9][0-9]')
    only_wb = options.get('only_wb')
    strip_leading = options.get('leading') is not None

    annots = []
    load_pairs = {}  # cross-reference db name -> list of ids seen
    pubs = set()     # PMIDs referenced by the kept annotations
    for line in annot_fh:
        if line.startswith('!'):
            # Header/comment line.
            continue
        toks = line.strip().split('\t')
        (xrdb, xrid, details, goid, ref, ev, date) = (
            toks[0], toks[1], toks[3], toks[4], toks[5], toks[6], toks[13])

        if tair_regex is not None:
            # Use the first of columns 2, 9 and the first alias in
            # column 10 that matches the TAIR pattern as the identifier.
            first_alias = toks[10].split('|')[0]
            if tair_regex.match(toks[2]):
                xrid = toks[2]
            elif tair_regex.match(toks[9]):
                xrid = toks[9]
            elif tair_regex.match(first_alias):
                xrid = first_alias

        if only_wb and (toks[0] != 'WB'):
            continue
        if details == 'NOT':
            continue
        if accepted_evcodes is not None and ev not in accepted_evcodes:
            continue

        if strip_leading:
            xrid = xrid.split(':')[1]

        load_pairs.setdefault(xrdb, []).append(xrid)
        for ref_item in ref.split('|'):
            if ref_item.startswith('PMID:'):
                pubs.add(ref_item.split(':')[1])
            else:
                logger.info("Unknown publication key %s", ref_item)
        annots.append((xrdb, xrid, goid, ref, date))

    # Resolve every (db, id) pair to a gene with one query per database.
    xref_cache = {}
    if options.get('pseudomonas'):
        logger.info('Pseudomonas entered')
        for (xrdb, xrids) in load_pairs.items():
            gene_objs = Gene.objects.filter(systematic_name__in=xrids)
            logger.info(
                "Mapped %s Pseudomonas genes from the database using gene "
                "systematic name.", gene_objs.count())
            for gene_obj in gene_objs:
                xref_cache[(xrdb, gene_obj.systematic_name)] = gene_obj
    else:
        for (xrdb, xrids) in load_pairs.items():
            if xrdb in DB_REMAP:
                xrdb = DB_REMAP[xrdb]
            try:
                xrdb_obj = CrossRefDB.objects.get(name=xrdb)
            except CrossRefDB.DoesNotExist:
                logger.warning("Couldn't find the cross reference DB %s.",
                               xrdb)
                continue
            xrid_objs = CrossRef.objects.filter(
                crossrefdb=xrdb_obj).filter(xrid__in=xrids)
            logger.info("Mapped %s cross references from %s",
                        xrid_objs.count(), xrdb)
            for xrid_obj in xrid_objs:
                xref_cache[(xrdb, xrid_obj.xrid)] = xrid_obj.gene

    load_pmids(pubs)
    pub_cache = {}
    pub_values = Publication.objects.filter(pmid__in=pubs).only(
        'id', 'pmid').values()
    for pub in pub_values:
        pub_cache[pub['pmid']] = pub['id']

    for (xrdb, xrid, goid, ref, date) in annots:
        if xrdb in DB_REMAP:
            xrdb = DB_REMAP[xrdb]
        try:
            gene = xref_cache[(xrdb, xrid)]
        except KeyError:
            logger.debug("Couldn't find xrid %s in xrdb %s.", xrid, xrdb)
            logger.info("Couldn't find xrid %s in xrdb %s.", xrid, xrdb)
            continue
        # The last PMID on the line decides the publication reference.
        pub = None
        for ref_item in ref.split('|'):
            if ref_item.startswith('PMID:'):
                try:
                    pub = pub_cache[int(ref_item.split(':')[1])]
                except KeyError:
                    pub = None
        gene_ontology.add_annotation(go_id=goid, gid=gene.pk, ref=pub,
                                     date=date, direct=True)

    gene_ontology.populated = True  # mark annotated
    gene_ontology.propagate()  # prop annotations

    # Write the evidence filter as a sentence. It is the same for every
    # term, so build it once. accepted_evcodes is None when no filter was
    # requested; the codes are sorted so the text is stable between runs.
    evlist = sorted(accepted_evcodes) if accepted_evcodes else []
    evclause = ''
    if evlist:
        evclause = ' Only annotations with evidence coded as '
        if len(evlist) == 1:
            evclause = evclause + evlist[0]
        else:
            evclause = (evclause + ', '.join(evlist[:-1]) + ' or '
                        + evlist[-1])
        evclause = evclause + ' are included.'
    propagation_note = (' Annotations are propagated through transitive '
                        'closure as recommended by the GO Consortium.')

    for (term_id, term) in gene_ontology.go_terms.items():
        if not term.annotations:
            continue
        # make first 50 chars into a slug
        slug = slugify(' '.join(
            (term.go_id, org.scientific_name, term.full_name)))[:50]
        namespace = GO_NAMESPACE_MAP[term.get_namespace()]
        go_id = term.go_id.split(':')[1]
        # construct title
        title = ('GO' + '-' + namespace + '-' + go_id + ':'
                 + term.full_name)

        # construct abstract; always assigned, so a term without a
        # description can never reuse a value left from an earlier term.
        if term.description:
            abstract = term.description + propagation_note + evclause
        else:
            logger.info("No description on term %s", term)
            abstract = (propagation_note + evclause).strip()

        # get geneset; `changed` flags whether we need to call save
        changed = False
        try:
            gs_obj = Geneset.objects.get(slug=slug, creator=user)
        except Geneset.DoesNotExist:
            gs_obj = Geneset(title=title, slug=slug, creator=user,
                             organism=org, public=True, abstract=abstract)
            changed = True
        else:
            # all these genesets should be public
            if not gs_obj.public:
                gs_obj.public = True
                changed = True
            if gs_obj.title != title:
                gs_obj.title = title
                changed = True
            if gs_obj.abstract != abstract:
                gs_obj.abstract = abstract
                changed = True
        if changed:
            gs_obj.save()

        if options.get('initial'):
            # disable commit field's auto_now_add, allows us to set a prior
            # annotation date
            commit_date = Version._meta.get_field_by_name('commit_date')[0]
            commit_date.auto_now_add = False
            try:
                logger.info(
                    'Initial load. Need to construct versions of %s from '
                    'annotation date.', term.go_id)
                date_annots = {}
                for annotation in term.annotations:
                    annot_date = timezone.make_aware(
                        datetime.strptime(annotation.date, '%Y%m%d'),
                        timezone.get_default_timezone())
                    date_annots.setdefault(annot_date, []).append(annotation)

                annots_as_of_date = set()
                prior_annots = set()
                prior_version = None
                for annot_date in sorted(date_annots):
                    dated_annots = date_annots[annot_date]
                    annots_as_of_date.update(
                        (annotation.gid, annotation.ref)
                        for annotation in dated_annots)
                    if annots_as_of_date == prior_annots:
                        # if nothing changed, continue
                        continue
                    v_obj = Version(geneset=gs_obj, creator=user,
                                    parent=prior_version,
                                    commit_date=annot_date)
                    v_obj.description = (
                        "Added " + str(len(dated_annots))
                        + " annotations from GO based on the dates "
                        "provided in the GO annotation file.")
                    v_obj.annotations = annots_as_of_date
                    v_obj.save()
                    prior_version = v_obj
                    prior_annots = annots_as_of_date.copy()
            finally:
                # re-enable auto_now_add, even if a save above failed
                commit_date.auto_now_add = True
        else:
            # load annotations
            most_recent_versions = Version.objects.filter(
                geneset=gs_obj).order_by('-commit_date')[:1]
            term_annots = set((annotation.gid, annotation.ref)
                              for annotation in term.annotations)
            version_description = ''
            most_recent_version = None
            if most_recent_versions:
                most_recent_version = most_recent_versions[0]
                if most_recent_version.commit_date > timezone.now():
                    logger.error('Version from the future: %s.',
                                 most_recent_version)
                new = term_annots - most_recent_version.annotations
                removed = most_recent_version.annotations - term_annots
                if new or removed:
                    version_description = (
                        'Added ' + str(len(new)) + ' and removed '
                        + str(len(removed)) + ' annotations from GO.')
            else:
                version_description = ('Created with '
                                       + str(len(term_annots))
                                       + ' annotations from GO.')

            # An empty description means nothing changed: store no new
            # version.
            if version_description:
                v_obj = Version(geneset=gs_obj, creator=user,
                                parent=most_recent_version,
                                commit_date=timezone.now())
                v_obj.description = version_description
                v_obj.annotations = term_annots
                v_obj.save()