def fetch_annotations_for_ids(annotations_dir, ref_ids, proxy=None):
    """Download GenBank annotations for the given reference ids.

    Each entry of ``ref_ids`` is expanded with ``__range_of_ref_ids`` and the
    results are flattened into one list of ids.  For every id the record is
    fetched from the Entrez ``nucleotide`` database, written to
    ``<annotations_dir>/<ref_id>.gb``, parsed back with ``SeqIO`` and a short
    summary (description, number of genes and CDS features) is logged.

    If no ids are supplied, the user is prompted to enter them manually
    (whitespace separated).

    Args:
        annotations_dir: directory to save the ``.gb`` files into; created
            if it does not exist.
        ref_ids: iterable of lines, each passed to ``__range_of_ref_ids``.
        proxy: optional ``'host:port'`` string handed to
            ``setup_http_proxy``.

    Returns:
        0 on success; 1 if an id could not be expanded, no ids are available,
        or the user interrupted the download; -1 on an HTTP error.
    """
    ref_ids = list(chain(*[__range_of_ref_ids(line) for line in ref_ids]))
    # A None among the expanded ids presumably marks a line the helper could
    # not interpret -- TODO confirm against __range_of_ref_ids.
    if None in ref_ids:
        return 1

    if not isdir(annotations_dir):
        mkdir(annotations_dir)

    if proxy:
        setup_http_proxy(*proxy.split(':'))

    if not ref_ids:
        log.info(' No references have been found.')
        # Use the manually entered ids for the download; previously they were
        # read into a separate variable and then silently ignored.
        ref_ids = raw_input(' Put reference ids manually:').split()
        if not ref_ids:
            log.error(' No references :(')
            return 1

    log.info(' IDs: %s' % ', '.join(ref_ids))

    last_index = len(ref_ids) - 1
    for i, ref_id in enumerate(ref_ids):
        log.info(' Fetching annotations for %s...' % ref_id)
        try:
            try:
                fetch_handle = Entrez.efetch(db='nucleotide', id=ref_id,
                                             retmode='text',
                                             rettype='gbwithparts')
            except urllib2.HTTPError as e:
                log.error(' Error: cannot fetch data for reference id ' + ref_id)
                log.error(' Http error code: %s, reason: %s'
                          % (str(e.code), str(e.reason)))
                return -1

            gb_fpath = join(annotations_dir, ref_id + '.gb')
            try:
                with open(gb_fpath, 'w') as gb_file:
                    gb_file.write(fetch_handle.read())
            finally:
                # Release the network handle even if writing fails.
                fetch_handle.close()

            rec = SeqIO.read(gb_fpath, 'genbank')
            genes_number = sum(1 for f in rec.features if f.type == 'gene')
            cds_number = sum(1 for f in rec.features if f.type == 'CDS')

            log.info(' ' + rec.description)
            log.info(' %d genes, %d coding regions found.'
                     % (genes_number, cds_number))
            log.info(' saved %s' % gb_fpath)
            # Blank separator line between records, but not after the last.
            if i < last_index:
                log.info('')
        except KeyboardInterrupt:
            return 1

    return 0
def fetch_annotations_for_species_from_ftp(save_dir, species_names, proxy=None, clip=None):
    """Download ``.gbk`` annotation files for species from the NCBI FTP site.

    Connects to ``ftp.ncbi.nih.gov``, lists ``genomes/Bacteria`` and, for
    every remote directory whose name starts with one of the requested
    species names, downloads each ``.gbk`` file into ``save_dir``.  Saved
    files are named ``<dir index>_<species index>_<remote basename>``.  Each
    downloaded file is parsed with ``SeqIO`` and its description is logged.

    Args:
        save_dir: directory to save the files into; created if missing.
        species_names: iterable of species names.  Names are stripped and
            spaces are replaced by underscores before matching; blank
            entries and entries starting with ``#`` are skipped.
        proxy: optional ``'host:port'`` string handed to
            ``setup_http_proxy``.
        clip: currently unused.

    Returns:
        1 if ``species_names`` is empty, 0 otherwise.
    """
    if not species_names:
        log.error(' No species names')
        return 1

    if proxy:
        setup_http_proxy(*proxy.split(':'))

    if not isdir(save_dir):
        mkdir(save_dir)

    # Normalise the names once instead of once per remote directory.  The
    # original position of each name is kept because it is part of the
    # destination file name.  Strip *before* replacing spaces so surrounding
    # whitespace does not turn into leading/trailing underscores (which would
    # also hide an indented '#' comment marker).
    wanted = []
    for sp_i, sp in enumerate(species_names):
        sp = sp.strip().replace(' ', '_')
        if sp == '' or sp[0] == '#':
            continue
        wanted.append((sp_i, sp))

    ftp = FTP('ftp.ncbi.nih.gov')
    try:
        log.debug(' Logging in ' + ftp.login())
        ftp.cwd('genomes/Bacteria')
        for i, dirname in enumerate(ftp.nlst()):
            for sp_i, sp in wanted:
                if not dirname.startswith(sp):
                    continue

                log.debug(' Scanning ' + dirname)
                for remote_fpath in ftp.nlst(dirname):
                    if not remote_fpath.endswith('.gbk'):
                        continue

                    dest_fname = str(i) + '_' + str(sp_i) + '_' + basename(remote_fpath)
                    dest_fpath = join(save_dir, dest_fname)
                    # Close (and thereby flush) the file before it is read
                    # back below.
                    with open(dest_fpath, 'wb') as dest_file:
                        ftp.retrbinary('RETR ' + remote_fpath, dest_file.write)

                    rec = SeqIO.read(dest_fpath, 'gb')
                    log.info(' Definition: ' + rec.description)
                    # Records whose description mentions 'plasmid' are kept
                    # as well; removing them was considered but is disabled.
                    log.info(' saved ' + dest_fpath)
                    log.info('')
    finally:
        # Always release the connection, even if a transfer or parse fails.
        ftp.close()

    return 0