Example no. 1
0
 def ExtractData(self, query):
     '''Extract sample metadata for the given SRA query accession.

     Fills in self.accessions (run/experiment/sample/study/biosample) and
     the sample fields (sequencing platform/type, sample name, organism,
     location, isolation source, run IDs) from the SRA runinfo CSV and the
     SRA summary page, then notifies the curators by mail about any
     platforms or isolation sources that could not be recognised.
     '''
     new_platforms = []
     new_seqtypes = []
     # New approach using runinfo list
     with openurl(self.sra_url1%(query)) as u:
         headers = u.readline().split(',')
         # Map the column names we care about to their CSV indexes
         indexes = [(x, headers.index(x)) for x in ["Run", "Experiment",
             "Sample", "SRAStudy", "BioSample", "Platform", "LibraryLayout",
             "SampleName", "ScientificName", "CenterName"]]
         for l in u:
             l = l.strip()
             if l == '': continue
             if l[0] == '#': continue  # skip comment lines
             d = l.split(',')
             self.accessions['run'] = d[indexes[0][1]]
             self.accessions['experiment'] = d[indexes[1][1]]
             self.accessions['sample'] = d[indexes[2][1]]
             self.accessions['study'] = d[indexes[3][1]]
             self.accessions['biosample'] = d[indexes[4][1]]
             platform = d[indexes[5][1]].lower()
             if platform in platforms:
                 self['sequencing_platform'] = platforms[platform]
             else:
                 # Unrecognised platform: remember it so curators get mailed
                 self['sequencing_platform'] = 'unknown'
                 if platform not in new_platforms:
                     new_platforms.append(platform)
             seqtype = d[indexes[6][1]].lower()
             if seqtype in sequencing_types:
                 self['sequencing_type'] = sequencing_types[seqtype]
             else:
                 self['sequencing_type'] = 'unknown'
                 if seqtype not in new_seqtypes:
                     new_seqtypes.append(seqtype)
             self['sample_name'] = d[indexes[7][1]]
             self['organism'] = d[indexes[8][1]]
             self['collected_by'] = d[indexes[9][1]]
             self['biosample'] = self.accessions['biosample']
             break # Just use the first entry!
             # Should be fixed to handle different query sequences!!!
     with openurl(self.sra_url2%(query)) as u:
         qdata = u.read()
     # Extract sample attributes
     match = re.findall(r'Sample Attributes: (.+)\n', qdata)
     lcs = {} # location parts
     host = None
     source = None
     for answer in match:
         for attributes in answer.split(';'):
             # Split on the first '=' only: values may themselves contain
             # '=', and attributes without any '=' are silently skipped
             # (the old unbounded split raised IndexError on those).
             stat = attributes.split('=', 1)
             if len(stat) != 2:
                 continue
             att = stat[0].strip('/ ').lower().replace('\'', '')
             val = stat[1].strip('\' ').replace('\'', '\`')
             if att in ['geo_loc_name', 'geographic location']:
                 self.__interpret_loc(val)
             elif att == 'serovar':
                 self['subtype']['serovar'] = val
             elif att == 'mlst':
                 self['subtype']['mlst'] = val
             elif att in ['scientific_name', 'scientific name']:
                 self['organism'] = val
             elif att == 'strain':
                 self['strain'] = val
             elif att in ['isolation_source', 'isolation source']:
                 source = val
             elif att == 'biosample':
                 # att was lowercased above, so compare lowercase
                 # (the old 'BioSample' comparison could never match)
                 self['biosample'] = val
             elif att in ['host', 'specific_host', 'specific host']:
                 host = val
             elif att in ['collection_date', 'collection date']:
                 self['collection_date'] = self.__format_date(
                     *self.__interpret_date(val)
                 )
                 if self['collection_date'] == '':
                     _logger.warning(
                         'Date Empty: %s, %s',
                         val, query
                     )
             elif att in ['collected_by', 'collected by']:
                 self['collected_by'] = val
             elif att in ['country', 'region', 'city', 'zip_code']:
                 lcs[att] = val
             else:
                 # Anything unrecognised is appended to the free-text notes
                 self['notes'] = '%s %s: %s,' % (
                     self['notes'], att, val)
         if lcs != {}:
             h = ['country', 'region', 'city', 'zip_code']
             self.__interpret_loc( ','.join([lcs[x] for x in h if x in lcs]))
     # Handle Isolation source: try to map the host first, then the source
     cats = []
     if host is not None:
         for d in ontology:
             cats = [d[k][0] for k in d.keys() if k in host.lower()]
             if cats:
                 break
     
     # Guard on host is not None: without it a None key ends up in
     # new_ontologies and later crashes the ', '.join in the mail body.
     if not cats and host is not None and host not in self.new_ontologies:
         self.new_ontologies[host] = query
     
     if (not cats or cats[0] == 'unknown') and source is not None:
         for d in ontology:
             cats = [d[k][0] for k in d.keys() if k in source.lower()]
             if cats:
                 break
         
         if not cats and source not in self.new_ontologies:
             self.new_ontologies[source] = query
     
     if cats:
         self['isolation_source'] = cats[0]
         _logger.warning(
             'Source identified: %s (%s, %s), %s',
             self['isolation_source'], host, source, query
         )
     else:
         if host is None:   host   = 'unknown'
         if source is None: source = 'unknown'
         _logger.warning(
             'Source not identified: (%s, %s), %s',
             host, source, query
         )
     self['source_note'] = source
     
     # Extract Run IDs associated with the sample, e.g.:
     #Run #1: ERR276921, 1356661 spots, 271332200 bases
     self.runIDs = re.findall(r'Run #\d+: (.+?),.+', qdata)
     
     # Notify Curators By Email
     _logger.info('Make mail? %s'%(mail is not None))
     if mail is not None:
         _logger.info('Any unknowns? %s'%(len(self.new_ontologies) > 0))
         if len(self.new_ontologies) > 0:
             _logger.debug(mail.test(
                 'New isolation source...',
                 'Sources not identified:\n%s\n'%(
                     '\n'.join(map(', '.join, self.new_ontologies.items()))
                     )
                 ))
             mail.send(
                 'New isolation source...',
                 'Sources not identified:\n%s\n'%(
                     '\n'.join(map(', '.join, self.new_ontologies.items()))
                     )
                 )
         if len(new_platforms) > 0:
             _logger.debug(mail.test(
                 'New platforms...',
                 'Platforms not accounted for:\n%s\n'%(
                     '\n'.join(new_platforms)
                     )
                 ))
             mail.send(
                 'New platforms...',
                 'Platforms not accounted for:\n%s\n'%(
                     '\n'.join(new_platforms)
                     )
                 )
     elif len(self.new_ontologies) > 0:
         _logger.debug(
             "NO MAIL!",
             'Sources not identified:\n%s\n'%(
                 '\n'.join(map(', '.join, self.new_ontologies.items()))
                 )
             )
Example no. 2
0
 def __interpret_loc(self, val):
     '''Resolve a free-text location string via the Google geocoder.

     Results are cached in the module-level location_hash keyed by the
     lower-cased input, so each distinct location string is geocoded at
     most once. On success the cache entry is a geo_dict (country, region,
     city, zip_code, longitude, latitude, location_note); on failure it is
     a 5-tuple placeholder -- NOTE(review): the two cache shapes are
     inconsistent, downstream consumers must handle both.
     '''
     geo_dict = {
         'country': '',
         'region': '',
         'city': '',
         'zip_code': '',
         'longitude': '',
         'latitude': '',
         'location_note': ''
     }
     # Google address-component types -> our geo_dict keys
     type_map = {
         'country': 'country',
         'postal_code': 'zip_code',
         'administrative_area_level_1': 'region',
         'locality': 'city'
     }
     val = val.lower()
     if val not in location_hash:
         try:
             g = geocoder.google(val)
             if g.status != 'OK':
                 if ',' in val:
                     # Try with only country
                     val2 = val.split(',')[0]
                     _logger.warning(
                         ('Geocoder failed (%s)!,'
                          'trying with country only... (%s)'), val, val2)
                     g = geocoder.google(val2)
                     if g.status != 'OK':
                         raise Exception(g.status)
                 else:
                     raise Exception(g.status)
         except Exception:
             # Log the failing value (the old code logged the undefined
             # name 'query', which itself raised NameError here).
             _logger.warning('Geocoder error %s', val)
             location_hash[val] = ('', '', '', '', val)
         else:
             geo_dict['longitude'] = g.lng
             geo_dict['latitude'] = g.lat
             geo_dict['location_note'] = g.location
             try:
                 # Preferred path: parse the structured address components
                 results = g.content['results'][0]
                 for x in results['address_components']:
                     for a_type in x['types']:
                         if a_type in type_map:
                             m_type = type_map[a_type]
                             geo_dict[m_type] = x['long_name']
                             break
             except Exception:
                 # Fallback: split the formatted address string.
                 # (The old code tested the undefined name 'address_tmp'
                 # instead of 'a_tmp', so this branch always failed
                 # silently and never populated geo_dict.)
                 try:
                     a_tmp = g.address.split(',')
                     if len(a_tmp) > 2:
                         geo_dict['city'] = a_tmp[0]
                         geo_dict['region'] = a_tmp[-2]
                         geo_dict['country'] = a_tmp[-1]
                     elif len(a_tmp) == 2:
                         geo_dict['city'] = a_tmp[0]
                         geo_dict['country'] = a_tmp[-1]
                     elif len(a_tmp) == 1:
                         geo_dict['country'] = a_tmp[0]
                 except Exception:
                     pass
                 else:
                     # A leading number in the city part is taken as a zip
                     try:
                         geo_dict['zip_code'] = int(
                             geo_dict['city'].split(' ')[0]
                         )
                     except ValueError:
                         pass
             location_hash[val] = geo_dict