Example #1
0
def ExtractExperimentIDs_tax(taxid):
    ''' Extract experiments which have runs associated from taxid

    The ENA warehouse is first asked how many read experiments exist in the
    taxonomic tree of *taxid*; the experiment accessions are then fetched
    page by page.

    Returns a list with the experiment accessions (empty if the count query
    reported no results). Lines which are not recognised as an experiment
    accession are reported on stdout and left out of the result.

    >>> ExtractExperimentIDs_tax('211968')
    ['SRX1308653', 'SRX1308716', 'SRX1308789', 'SRX1308879', 'SRX337751']
    '''
    ena_url = ('http://www.ebi.ac.uk/ena/data/warehouse/search?'
               'query="tax_tree(%s)"&'
               'result=read_experiment')%(taxid,)
    countquery = '&resultcount'
    display = '&display=report&fields=experiment_accession'
    # Find number of entries for the provided taxid
    count = 0
    with openurl(ena_url+countquery) as u:
        for l in u:
            key, sep, value = l.strip().partition(':')
            if sep and key == 'Number of results':
                # The count is written with thousands separators
                count = int(value.replace(',', ''))
    # Extract experiment IDs
    experiments = []
    if count > 0:
        length = 100000
        # Integer ceiling division: the page count must be an int to be
        # usable as a range bound
        pages = (count + length - 1) // length
        for p in range(pages):
            page_offset = '&offset=%s&length=%s'%(p*length+1, length)
            with openurl(ena_url+display+page_offset) as u:
                u.readline() # Skip the header line
                for l in u:
                    l = l.strip()
                    # Blank lines are not IDs and should not be reported
                    if l == '': continue
                    if l[:3] in acctypes and acctypes[l[:3]] == 'experiment':
                        experiments.append(l)
                    else:
                        print("Unknown Experiment ID: %s (taxid=%s)"%(l,taxid))
    return experiments
Example #2
0
def ExtractTaxIDfromSearchTerm(query):
    ''' Extract taxonomy ID from NCBI taxonomy search

    The search is requested with the 'taxid' report and every non-empty
    line of the answer (after removal of html tags) is interpreted as an
    integer taxonomy ID.

    Returns the last taxonomy ID found as an int, or None if no line could
    be interpreted. Lines which are not integers are reported on stdout.

    >>> ExtractTaxIDfromSearchTerm('Salmonella')
    590
    '''
    ncbi_url = 'http://www.ncbi.nlm.nih.gov/taxonomy/?term=%s&report=taxid'%(
        query)
    # Find the taxonomy ID in the search result
    taxid = None
    with openurl(ncbi_url) as u:
        for l in u:
            # remove html tags
            l = re.sub(r'<.+?>', '', l).strip()
            if l == '': continue
            # Only a failed integer conversion is an expected error here
            try: taxid = int(l)
            except ValueError:
                print("Error: Unhandled result from taxid search! (%s)"%l)
    return taxid
Example #3
0
def ExtractExperimentIDs_acc(sample_accession):
    ''' Extract experiments which have runs associated

    The run-info table of *sample_accession* is downloaded and the values of
    its "Experiment" column are collected.

    Returns a list of unique experiment accessions in the order they first
    appear in the table. If the table has no "Experiment" column, the header
    line is printed and an empty list is returned.

    >>> ExtractExperimentIDs_acc('ERS397989')
    ['ERX385098']
    >>> sorted(ExtractExperimentIDs_acc('SRS331977'))
    ['SRX146806', 'SRX146814', 'SRX146822', 'SRX146829', 'SRX146831', 'SRX146834', 'SRX365746']
    '''
    experiments = []
    seen = set()
    sra_url = 'http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=%s'
    with openurl(sra_url%(sample_accession)) as u:
        # Strip the line ending so the last column name can be matched too
        headers = u.readline().strip()
        try:
            idx = headers.split(',').index("Experiment")
        except ValueError:
            # Not a run-info table, show what was received instead
            print(headers)
        else:
            for l in u:
                l = l.strip()
                if l == '' or l[0] == '#': continue
                exp = l.split(',')[idx].strip()
                if exp not in seen:
                    seen.add(exp)
                    experiments.append(exp)
    return experiments
Example #4
0
 def ExtractData(self, query):
     ''' Extract Sample Metadata

     Two documents are downloaded for *query*:
       * the run-info table (self.sra_url1), from which the accessions, the
         sequencing platform/type, the sample name, the organism and the
         center name are taken (first data row only), and
       * the report at self.sra_url2, from which the 'Sample Attributes'
         lines and the 'Run #' lines are parsed.

     The results are stored on the object (self[...], self.accessions and
     self.runIDs). Hosts and isolation sources which could not be matched
     against the ontology are recorded in self.new_ontologies, and are
     mailed to the curators together with unknown platforms when the
     module-level `mail` object is not None.
     '''
     new_platforms = []
     # NOTE: unknown sequencing types are collected but not reported
     new_seqtypes = []
     # New approach using runinfo list
     with openurl(self.sra_url1%(query)) as u:
         # Strip the line ending so the last column name can be matched too
         headers = u.readline().strip().split(',')
         col = dict((x, headers.index(x)) for x in ["Run", "Experiment",
             "Sample", "SRAStudy", "BioSample", "Platform", "LibraryLayout",
             "SampleName", "ScientificName", "CenterName"])
         for l in u:
             l = l.strip()
             if l == '' or l[0] == '#': continue
             d = l.split(',')
             self.accessions['run'] = d[col["Run"]]
             self.accessions['experiment'] = d[col["Experiment"]]
             self.accessions['sample'] = d[col["Sample"]]
             self.accessions['study'] = d[col["SRAStudy"]]
             self.accessions['biosample'] = d[col["BioSample"]]
             platform = d[col["Platform"]].lower()
             if platform in platforms:
                 self['sequencing_platform'] = platforms[platform]
             else:
                 self['sequencing_platform'] = 'unknown'
                 if platform not in new_platforms:
                     new_platforms.append(platform)
             seqtype = d[col["LibraryLayout"]].lower()
             if seqtype in sequencing_types:
                 self['sequencing_type'] = sequencing_types[seqtype]
             else:
                 self['sequencing_type'] = 'unknown'
                 if seqtype not in new_seqtypes:
                     new_seqtypes.append(seqtype)
             self['sample_name'] = d[col["SampleName"]]
             self['organism'] = d[col["ScientificName"]]
             self['collected_by'] = d[col["CenterName"]]
             self['biosample'] = self.accessions['biosample']
             break # Just use the first entry!
             # Should be fixed to handle different query sequences!!!
     with openurl(self.sra_url2%(query)) as u: qdata = u.read()
     # Extract sample attributes
     match = re.findall(r'Sample Attributes: (.+)\n', qdata)
     lcs = {} # location parts
     host = None
     source = None
     for answer in match:
         for attributes in answer.split(';'):
             # Fragments without a value (eg. after a trailing ';') carry
             # no information
             if '=' not in attributes: continue
             # Split on the first '=' only, the value may contain more
             att, val = attributes.split('=', 1)
             att = att.strip('/ ').lower().replace('\'', '')
             val = val.strip('\' ').replace('\'', '\\`')
             if att in ['geo_loc_name', 'geographic location']:
                 self.__interpret_loc(val)
             elif att == 'serovar':
                 self['subtype']['serovar'] = val
             elif att == 'mlst':
                 self['subtype']['mlst'] = val
             elif att in ['scientific_name', 'scientific name']:
                 self['organism'] = val
             elif att == 'strain':
                 self['strain'] = val
             elif att in ['isolation_source', 'isolation source']:
                 source = val
             elif att in ['host', 'specific_host', 'specific host']:
                 host = val
             elif att == 'biosample':
                 # att is lower-cased above, so compare in lower case
                 self['biosample'] = val
             elif att in ['collection_date', 'collection date']:
                 self['collection_date'] = self.__format_date(
                     *self.__interpret_date(val)
                 )
                 if self['collection_date'] == '':
                     _logger.warning(
                         'Date Empty: %s, %s',
                         val, query
                     )
             elif att in ['collected_by', 'collected by']:
                 self['collected_by'] = val
             elif att in ['country', 'region', 'city', 'zip_code']:
                 lcs[att] = val
             else:
                 self['notes'] = '%s %s: %s,' % (
                     self['notes'], att, val)
         if lcs != {}:
             h = ['country', 'region', 'city', 'zip_code']
             self.__interpret_loc( ','.join([lcs[x] for x in h if x in lcs]))

     def find_categories(term):
         # Categories of the first ontology level with a key found in term
         term = term.lower()
         for level in ontology:
             found = [level[k][0] for k in level.keys() if k in term]
             if found:
                 return found
         return []

     # Handle Isolation source
     cats = []
     if host is not None:
         cats = find_categories(host)
         # Only an actual host value is worth reporting; a None key would
         # break the joining of the report lines below
         if not cats and host not in self.new_ontologies:
             self.new_ontologies[host] = query

     if (not cats or cats[0] == 'unknown') and source is not None:
         cats = find_categories(source)
         if not cats and source not in self.new_ontologies:
             self.new_ontologies[source] = query

     if cats:
         self['isolation_source'] = cats[0]
         _logger.warning(
             'Source identified: %s (%s, %s), %s',
             self['isolation_source'], host, source, query
         )
     else:
         if host is None:   host   = 'unknown'
         if source is None: source = 'unknown'
         _logger.warning(
             'Source not identified: (%s, %s), %s',
             host, source, query
         )
     self['source_note'] = source

     # Extract Run IDs associated with the sample
     #Run #1: ERR276921, 1356661 spots, 271332200 bases
     self.runIDs = re.findall(r'Run #\d+: (.+?),.+', qdata)

     def unknown_sources():
         # One 'term, query' line per unidentified host/source
         return '\n'.join(map(', '.join, self.new_ontologies.items()))

     def notify(subject, body):
         # Log the test output of the mail, then send it
         _logger.debug(mail.test(subject, body))
         mail.send(subject, body)

     # Notify Curators By Email
     _logger.info('Make mail? %s', mail is not None)
     if mail is not None:
         _logger.info('Any unknowns? %s', len(self.new_ontologies) > 0)
         if len(self.new_ontologies) > 0:
             notify(
                 'New isolation source...',
                 'Sources not identified:\n%s\n'%(unknown_sources())
                 )
         if len(new_platforms) > 0:
             notify(
                 'New platforms...',
                 'Platforms not accounted for:\n%s\n'%(
                     '\n'.join(new_platforms)
                     )
                 )
     elif len(self.new_ontologies) > 0:
         # The report is passed as a logging argument, so the message
         # needs a placeholder for it
         _logger.debug(
             "NO MAIL! Sources not identified:\n%s\n",
             unknown_sources()
             )