def ExtractData(self, query):
    '''Extract sample metadata for an SRA accession.

    Fetches the runinfo CSV (self.sra_url1) to fill in accession IDs,
    sequencing platform/type and basic sample fields, then scrapes the
    text report (self.sra_url2) for "Sample Attributes" key=value pairs
    (location, serovar, host, isolation source, dates, ...) and the run
    IDs associated with the sample.  Unknown platforms and isolation
    sources are collected and, when a mail handler is configured,
    reported to the curators.

    :param query: SRA accession, substituted into sra_url1/sra_url2.

    Side effects: updates self.accessions, item-assigns metadata onto
    self, sets self.runIDs, and extends self.new_ontologies.
    '''
    new_platforms = []
    new_seqtypes = []  # collected but currently not mailed to curators
    # --- Part 1: runinfo CSV -------------------------------------------
    with openurl(self.sra_url1 % (query)) as u:
        headers = u.readline().split(',')
        # (column name, column index) pairs for the fields we need
        indexes = [(x, headers.index(x)) for x in
                   ["Run", "Experiment", "Sample", "SRAStudy",
                    "BioSample", "Platform", "LibraryLayout",
                    "SampleName", "ScientificName", "CenterName"]]
        for line in u:
            line = line.strip()
            if line == '':
                continue
            if line[0] == '#':
                continue
            d = line.split(',')
            self.accessions['run'] = d[indexes[0][1]]
            self.accessions['experiment'] = d[indexes[1][1]]
            self.accessions['sample'] = d[indexes[2][1]]
            self.accessions['study'] = d[indexes[3][1]]
            self.accessions['biosample'] = d[indexes[4][1]]
            platform = d[indexes[5][1]].lower()
            if platform in platforms:
                self['sequencing_platform'] = platforms[platform]
            else:
                # Unknown platform: remember it so curators get notified
                self['sequencing_platform'] = 'unknown'
                if platform not in new_platforms:
                    new_platforms.append(platform)
            seqtype = d[indexes[6][1]].lower()
            if seqtype in sequencing_types:
                self['sequencing_type'] = sequencing_types[seqtype]
            else:
                self['sequencing_type'] = 'unknown'
                if seqtype not in new_seqtypes:
                    new_seqtypes.append(seqtype)
            self['sample_name'] = d[indexes[7][1]]
            self['organism'] = d[indexes[8][1]]
            self['collected_by'] = d[indexes[9][1]]
            self['biosample'] = self.accessions['biosample']
            break  # Just use the first entry!
            # TODO: should be fixed to handle different query sequences
    # --- Part 2: sample attributes from the text report ----------------
    with openurl(self.sra_url2 % (query)) as u:
        qdata = u.read()
    match = re.findall(r'Sample Attributes: (.+)\n', qdata)
    lcs = {}  # location parts (country/region/city/zip_code)
    host = None
    source = None
    for answer in match:
        for attribute in answer.split(';'):
            # maxsplit=1: values may themselves contain '='
            stat = attribute.split('=', 1)
            if len(stat) != 2:
                # BUG FIX: entries without '=' previously raised
                # IndexError on stat[1]; skip them instead.
                continue
            att = stat[0].strip('/ ').lower().replace('\'', '')
            val = stat[1].strip('\' ').replace('\'', '\`')
            if att in ['geo_loc_name', 'geographic location']:
                self.__interpret_loc(val)
            elif att == 'serovar':
                self['subtype']['serovar'] = val
            elif att == 'mlst':
                self['subtype']['mlst'] = val
            elif att in ['scientific_name', 'scientific name']:
                self['organism'] = val
            elif att == 'strain':
                self['strain'] = val
            elif att in ['isolation_source', 'isolation source']:
                source = val
            elif att in ['host', 'specific_host', 'specific host']:
                host = val
            elif att == 'biosample':
                # BUG FIX: att is lowercased above, so the previous
                # comparison against 'BioSample' could never match.
                self['biosample'] = val
            elif att in ['collection_date', 'collection date']:
                self['collection_date'] = self.__format_date(
                    *self.__interpret_date(val)
                )
                if self['collection_date'] == '':
                    _logger.warning('Date Empty: %s, %s', val, query)
            elif att in ['collected_by', 'collected by']:
                self['collected_by'] = val
            elif att in ['country', 'region', 'city', 'zip_code']:
                lcs[att] = val
            else:
                # Unrecognised attribute: keep it in the notes field
                self['notes'] = '%s %s: %s,' % (self['notes'], att, val)
    if lcs != {}:
        # Combine the individual location parts into one lookup string
        h = ['country', 'region', 'city', 'zip_code']
        self.__interpret_loc(','.join([lcs[x] for x in h if x in lcs]))
    # --- Part 3: map host / isolation source onto the ontology ---------
    cats = []
    if host is not None:
        for d in ontology:
            cats = [d[k][0] for k in d.keys() if k in host.lower()]
            if cats:
                break
        if not cats and host not in self.new_ontologies:
            self.new_ontologies[host] = query
    if (not cats or cats[0] == 'unknown') and source is not None:
        # Host lookup failed or was inconclusive: try isolation source
        for d in ontology:
            cats = [d[k][0] for k in d.keys() if k in source.lower()]
            if cats:
                break
        if not cats and source not in self.new_ontologies:
            self.new_ontologies[source] = query
    if cats:
        self['isolation_source'] = cats[0]
        _logger.warning('Source identified: %s (%s, %s), %s',
                        self['isolation_source'], host, source, query)
    else:
        if host is None:
            host = 'unknown'
        if source is None:
            source = 'unknown'
        _logger.warning('Source not identified: (%s, %s), %s',
                        host, source, query)
        self['source_note'] = source
    # --- Part 4: run IDs associated with the sample --------------------
    # Report lines look like: "Run #1: ERR276921, 1356661 spots, ..."
    self.runIDs = re.findall(r'Run #\d+: (.+?),.+', qdata)
    # --- Part 5: notify curators by email ------------------------------
    _logger.info('Make mail? %s' % (mail is not None))
    if mail is not None:
        _logger.info('Any unknowns? %s' % (len(self.new_ontologies) > 0))
        if len(self.new_ontologies) > 0:
            body = 'Sources not identified:\n%s\n' % (
                '\n'.join(map(', '.join, self.new_ontologies.items())))
            _logger.debug(mail.test('New isolation source...', body))
            mail.send('New isolation source...', body)
        if len(new_platforms) > 0:
            body = 'Platforms not accounted for:\n%s\n' % (
                '\n'.join(new_platforms))
            _logger.debug(mail.test('New platforms...', body))
            mail.send('New platforms...', body)
    elif len(self.new_ontologies) > 0:
        # BUG FIX: the message was previously passed as a stray
        # %-argument to a format string with no placeholder, which the
        # logging module reports as a formatting error and drops.
        _logger.debug('NO MAIL! Sources not identified:\n%s\n',
                      '\n'.join(map(', '.join, self.new_ontologies.items())))
def __interpret_loc(self, val):
    '''Geocode a free-text location string and cache the result.

    Looks *val* (lowercased) up via the Google geocoder and stores a
    dict with country/region/city/zip_code/longitude/latitude/
    location_note in the module-level location_hash cache.  When the
    geocoder fails, a placeholder tuple ('', '', '', '', val) is cached
    instead.  Cached values are never recomputed.

    NOTE(review): results are only written to location_hash, not onto
    self — presumably callers read the cache afterwards; verify.

    :param val: free-text location, e.g. "Denmark,Copenhagen".
    '''
    geo_dict = {
        'country': '', 'region': '', 'city': '', 'zip_code': '',
        'longitude': '', 'latitude': '', 'location_note': ''
    }
    # Google address-component types -> our field names
    type_map = {
        'country': 'country',
        'postal_code': 'zip_code',
        'administrative_area_level_1': 'region',
        'locality': 'city'
    }
    val = val.lower()
    if val in location_hash:
        return  # already resolved (or known-bad); keep cached value
    try:
        g = geocoder.google(val)
        if g.status != 'OK':
            if ',' in val:
                # Retry with the first component only (assumed country)
                val2 = val.split(',')[0]
                _logger.warning(('Geocoder failed (%s)!,'
                                 'trying with country only... (%s)'),
                                val, val2)
                g = geocoder.google(val2)
                if g.status != 'OK':
                    raise Exception(g.status)
            else:
                raise Exception(g.status)
    except Exception:
        # BUG FIX: the log call previously referenced the undefined
        # name 'query', raising a NameError inside this handler.
        _logger.warning('Geocoder error %s', val)
        location_hash[val] = ('', '', '', '', val)
    else:
        geo_dict['longitude'] = g.lng
        geo_dict['latitude'] = g.lat
        geo_dict['location_note'] = g.location
        try:
            # Preferred path: structured address components
            results = g.content['results'][0]
            for component in results['address_components']:
                for a_type in component['types']:
                    if a_type in type_map:
                        geo_dict[type_map[a_type]] = component['long_name']
                        break
        except Exception:
            # Fallback: parse the formatted address string.
            # BUG FIX: this branch previously tested the undefined name
            # 'address_tmp' (the variable is 'a_tmp'); the resulting
            # NameError was swallowed, so the fallback never populated
            # anything.
            try:
                a_tmp = g.address.split(',')
                if len(a_tmp) > 2:
                    geo_dict['city'] = a_tmp[0]
                    geo_dict['region'] = a_tmp[-2]
                    geo_dict['country'] = a_tmp[-1]
                elif len(a_tmp) == 2:
                    geo_dict['city'] = a_tmp[0]
                    geo_dict['country'] = a_tmp[-1]
                elif len(a_tmp) == 1:
                    geo_dict['country'] = a_tmp[0]
            except Exception:
                pass  # best-effort only; leave fields empty
        else:
            # The city field may start with a numeric postal code
            try:
                geo_dict['zip_code'] = int(geo_dict['city'].split(' ')[0])
            except ValueError:
                pass  # no leading numeric zip code in the city string
        location_hash[val] = geo_dict