def ExtractExperimentIDs_tax(taxid):
    ''' Extract experiments which have runs associated from taxid

    Queries the ENA warehouse twice: once with '&resultcount' to learn how
    many experiment records exist for the taxonomy subtree, then page by
    page to collect the experiment accessions themselves.

    :param taxid: NCBI taxonomy ID (string or int) used in tax_tree(...)
    :return: list of experiment accession strings

    >>> ExtractExperimentIDs_tax('211968')
    ['SRX1308653', 'SRX1308716', 'SRX1308789', 'SRX1308879', 'SRX337751']
    '''
    ena_url = ('http://www.ebi.ac.uk/ena/data/warehouse/search?'
               'query="tax_tree(%s)"&'
               'result=read_experiment')%(taxid)
    countquery = '&resultcount'
    display = '&display=report&fields=experiment_accession'
    # Find number of entries for the provided taxid
    count = 0
    with openurl(ena_url+countquery) as u:
        for l in u:
            l = l.strip()
            if ':' in l:
                tmp = l.split(':')
                if tmp[0] == 'Number of results':
                    count = int(tmp[1].replace(',',''))
    # Extract experiment IDs
    experiments = []
    if count > 0:
        length = 100000
        # int() guards against math.ceil returning a float on Python 2;
        # range replaces xrange, which does not exist on Python 3.
        pages = int(ceil(count/float(length)))
        for p in range(pages):
            # ENA offsets are 1-based, hence p*length+1
            page_offset = '&offset=%s&length=%s'%(p*length+1, length)
            with openurl(ena_url+display+page_offset) as u:
                header = u.readline()  # skip the report header line
                for l in u:
                    l = l.strip()
                    # acctypes maps accession prefixes (e.g. 'SRX') to types
                    if l[:3] in acctypes and acctypes[l[:3]] == 'experiment':
                        experiments.append(l)
                    else:
                        print("Unknown Experiment ID: %s (taxid=%s)"%(l,taxid))
    return experiments
def ExtractTaxIDfromSearchTerm(query):
    ''' Extract taxonomy ID from NCBI taxonomy search

    Fetches the NCBI taxonomy search report for *query*, strips HTML tags
    from each line and interprets the remaining text as a taxonomy ID.
    If multiple numeric lines appear, the last one wins (original behavior).

    :param query: search term, e.g. an organism name
    :return: integer taxonomy ID, or None if nothing numeric was found

    >>> ExtractTaxIDfromSearchTerm('Salmonella')
    590
    '''
    ncbi_url = 'http://www.ncbi.nlm.nih.gov/taxonomy/?term=%s&report=taxid'%(
        query)
    # Find number of entries for the provided taxid
    taxid = None
    with openurl(ncbi_url) as u:
        for l in u:
            # remove html tags
            l = re.sub('<.+?>', '', l)
            l = l.strip()
            if l == '':
                continue
            try:
                taxid = int(l)
            except ValueError:
                # Narrowed from a bare except: a non-numeric line is the
                # only expected, reportable failure of int() here.
                print("Error: Unhandled result from taxid search! (%s)"%l)
    return taxid
def ExtractExperimentIDs_acc(sample_accession):
    ''' Extract experiments which have runs associated

    Downloads the SRA runinfo CSV for *sample_accession*, finds the
    "Experiment" column and collects the distinct experiment accessions.

    :param sample_accession: sample accession, e.g. 'ERS397989'
    :return: keys view/list of unique experiment accession strings

    >>> ExtractExperimentIDs_acc('ERS397989')
    ['ERX385098']
    >>> ExtractExperimentIDs_acc('SRS331977')
    ['SRX146831', 'SRX365746', 'SRX146834', 'SRX146829', 'SRX146822', 'SRX146814', 'SRX146806']
    '''
    experiments = {}
    sra_url = 'http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=%s'
    with openurl(sra_url%(sample_accession)) as u:
        headers = u.readline()
        try:
            idx = headers.split(',').index("Experiment")
        except ValueError:
            # list.index raises ValueError when the column is missing;
            # print() (function form) keeps this valid on Python 3 too —
            # the original 'print headers' statement is Python-2-only.
            print(headers)
        else:
            for l in u:
                l = l.strip()
                if l == '':
                    continue
                if l[0] == '#':
                    continue
                exp = l.split(',')[idx].strip()
                # dict used as an ordered-insertion unique set
                if not exp in experiments:
                    experiments[exp] = 1
    return experiments.keys()
def ExtractData(self, query):
    ''' Extract Sample Metadata

    Populates self.accessions and the sample metadata fields from two SRA
    endpoints (the runinfo CSV at self.sra_url1 and the text report at
    self.sra_url2), maps host/isolation-source text onto the ontology,
    records the run IDs, and notifies curators by mail about platforms and
    sources that were not recognized.

    :param query: accession used to fill both SRA URL templates
    '''
    new_platforms = []
    new_seqtypes = []
    # New approach using runinfo list
    with openurl(self.sra_url1%(query)) as u:
        headers = u.readline().split(',')
        indexes = [(x, headers.index(x)) for x in [
            "Run", "Experiment", "Sample", "SRAStudy", "BioSample",
            "Platform", "LibraryLayout", "SampleName", "ScientificName",
            "CenterName"]]
        for l in u:
            l = l.strip()
            if l == '':
                continue
            if l[0] == '#':
                continue
            d = l.split(',')
            self.accessions['run'] = d[indexes[0][1]]
            self.accessions['experiment'] = d[indexes[1][1]]
            self.accessions['sample'] = d[indexes[2][1]]
            self.accessions['study'] = d[indexes[3][1]]
            self.accessions['biosample'] = d[indexes[4][1]]
            platform = d[indexes[5][1]].lower()
            if platform in platforms:
                self['sequencing_platform'] = platforms[platform]
            else:
                self['sequencing_platform'] = 'unknown'
                # remember unrecognized platforms for the curator mail
                if not platform in new_platforms:
                    new_platforms.append(platform)
            seqtype = d[indexes[6][1]].lower()
            if seqtype in sequencing_types:
                self['sequencing_type'] = sequencing_types[seqtype]
            else:
                self['sequencing_type'] = 'unknown'
                if not seqtype in new_seqtypes:
                    new_seqtypes.append(seqtype)
            self['sample_name'] = d[indexes[7][1]]
            self['organism'] = d[indexes[8][1]]
            self['collected_by'] = d[indexes[9][1]]
            self['biosample'] = self.accessions['biosample']
            break # Just use the first entry!
            # Should be fixed to handle different query sequences!!!
    with openurl(self.sra_url2%(query)) as u:
        qdata = u.read()
    # Extract sample attributes
    match = re.findall(r'Sample Attributes: (.+)\n', qdata)
    lcs = {} # location parts
    host = None
    source = None
    for answer in match:
        for attributes in answer.split(';'):
            stat = attributes.split('=')
            if len(stat) < 2:
                # BUG FIX: fragments without '=' previously raised an
                # uncaught IndexError on stat[1]; skip them instead.
                continue
            att = stat[0].strip('/ ').lower().replace('\'', '')
            val = stat[1].strip('\' ').replace('\'', '\`')
            if att in ['geo_loc_name', 'geographic location']:
                self.__interpret_loc(val)
            elif att == 'serovar':
                self['subtype']['serovar'] = val
            elif att == 'mlst':
                self['subtype']['mlst'] = val
            elif att in ['scientific_name', 'scientific name']:
                self['organism'] = val
            elif att == 'strain':
                self['strain'] = val
            elif att in ['isolation_source', 'isolation source']:
                source = val
            elif att in ['host', 'specific_host', 'specific host']:
                host = val
            elif att == 'biosample':
                # BUG FIX: att is lower-cased above, so the original
                # comparison against 'BioSample' could never match.
                self['biosample'] = val
            elif att in ['collection_date', 'collection date']:
                self['collection_date'] = self.__format_date(
                    *self.__interpret_date(val)
                )
                if self['collection_date'] == '':
                    _logger.warning(
                        'Date Empty: %s, %s', val, query
                    )
            elif att in ['collected_by', 'collected by']:
                self['collected_by'] = val
            elif att in ['country', 'region', 'city', 'zip_code']:
                lcs[att] = val
            else:
                # anything unrecognized is appended to free-text notes
                self['notes'] = '%s %s: %s,' % (
                    self['notes'], att, val)
    if lcs != {}:
        # assemble location parts in a fixed order before interpreting
        h = ['country', 'region', 'city', 'zip_code']
        self.__interpret_loc(
            ','.join([lcs[x] for x in h if x in lcs]))
    # Handle Isolation source: try host first, then isolation_source text
    cats = []
    if host is not None:
        for d in ontology:
            cats = [d[k][0] for k in d.keys() if k in host.lower()]
            if cats:
                break
        if not cats and host not in self.new_ontologies:
            self.new_ontologies[host] = query
    if (not cats or cats[0] == 'unknown') and source is not None:
        for d in ontology:
            cats = [d[k][0] for k in d.keys() if k in source.lower()]
            if cats:
                break
        if not cats and source not in self.new_ontologies:
            self.new_ontologies[source] = query
    if cats:
        self['isolation_source'] = cats[0]
        _logger.warning(
            'Source identified: %s (%s, %s), %s',
            self['isolation_source'], host, source, query
        )
    else:
        if host is None:
            host = 'unknown'
        if source is None:
            source = 'unknown'
        _logger.warning(
            'Source not identified: (%s, %s), %s', host, source, query
        )
        self['source_note'] = source
    # Extract Run IDs associated with the sample
    #Run #1: ERR276921, 1356661 spots, 271332200 bases
    self.runIDs = re.findall(r'Run #\d+: (.+?),.+', qdata)
    # Notify Curators By Email
    _logger.info('Make mail? %s'%(mail is not None))
    if mail is not None:
        _logger.info('Any unknowns? %s'%(len(self.new_ontologies) > 0))
        if len(self.new_ontologies) > 0:
            _logger.debug(mail.test(
                'New isolation source...',
                'Sources not identified:\n%s\n'%(
                    '\n'.join(map(', '.join, self.new_ontologies.items()))
                )
            ))
            mail.send(
                'New isolation source...',
                'Sources not identified:\n%s\n'%(
                    '\n'.join(map(', '.join, self.new_ontologies.items()))
                )
            )
        if len(new_platforms) > 0:
            _logger.debug(mail.test(
                'New platforms...',
                'Platforms not accounted for:\n%s\n'%(
                    '\n'.join(new_platforms)
                )
            ))
            mail.send(
                'New platforms...',
                'Platforms not accounted for:\n%s\n'%(
                    '\n'.join(new_platforms)
                )
            )
    elif len(self.new_ontologies) > 0:
        # BUG FIX: the original passed the details as a lazy-format arg to a
        # message with no placeholder, which makes logging raise a
        # formatting error; '%s' carries the details into the message.
        _logger.debug(
            "NO MAIL! %s",
            'Sources not identified:\n%s\n'%(
                '\n'.join(map(', '.join, self.new_ontologies.items()))
            )
        )