def fetch_annotations_for_ids(annotations_dir, ref_ids, proxy=None):
    """Download GenBank annotation records from NCBI for the given reference ids.

    Expands each entry of ref_ids via __range_of_ref_ids, fetches each record
    with Entrez.efetch, saves it to <annotations_dir>/<ref_id>.gb, and logs a
    short summary (description, gene/CDS counts).

    Parameters:
        annotations_dir: directory to save .gb files into (created if missing).
        ref_ids: iterable of reference-id lines; each may expand to a range.
        proxy: optional 'host:port' string for an HTTP proxy.

    Returns:
        0 on success, 1 on bad/missing ids or user interrupt, -1 on an HTTP
        fetch error (kept as -1 for backward compatibility with callers).
    """
    ref_ids = list(chain(*[__range_of_ref_ids(line) for line in ref_ids]))
    if None in ref_ids:
        # __range_of_ref_ids signals a malformed line by yielding None
        return 1

    if not isdir(annotations_dir):
        mkdir(annotations_dir)

    if proxy:
        setup_http_proxy(*proxy.split(':'))

    if ref_ids == []:
        log.info('   No references have been found.')
        ids = raw_input('   Put reference ids manually:').split()
        if ids == []:
            log.error('   No references :(')
            return 1
        # Bug fix: previously the manually entered ids were discarded and the
        # fetch loop below never ran. Use them as the working id list.
        ref_ids = ids

    log.info('   IDs: %s' % ', '.join(ref_ids))

    for i, ref_id in enumerate(ref_ids):
        log.info('   Fetching annotations for %s...' % ref_id)

        try:
            try:
                fetch_handle = Entrez.efetch(db='nucleotide', id=ref_id,
                                             retmode='text', rettype='gbwithparts')

            except urllib2.HTTPError as e:
                log.error('   Error: cannot fetch data for reference id ' + ref_id)
                log.error('   Http error code: %s, reason: %s' % (str(e.code), str(e.reason)))
                return -1
            else:
                gb_fpath = join(annotations_dir, ref_id + '.gb')
                # Renamed from 'file' to avoid shadowing the builtin.
                with open(gb_fpath, 'w') as gb_file:
                    gb_file.write(fetch_handle.read())

                # Re-read the saved record to report a summary of its contents.
                rec = SeqIO.read(gb_fpath, 'genbank')
                genes_number = 0
                cds_number = 0
                for f in rec.features:
                    if f.type == 'gene':
                        genes_number += 1
                    if f.type == 'CDS':
                        cds_number += 1
                log.info('       ' + rec.description)
                log.info('       %d genes, %d coding regions found.' % (genes_number, cds_number))
                log.info('       saved %s' % gb_fpath)
                if i + 1 < len(ref_ids):
                    # Blank line between records for readable log output.
                    log.info('')

        except KeyboardInterrupt:
            # Ctrl-C aborts the whole batch; 'e' was unused, and this form is
            # valid in both Python 2 and 3 (the original ', e' form is not).
            return 1

    # Explicit success code, consistent with fetch_annotations_for_species_from_ftp.
    return 0
def fetch_annotations_for_species_from_ftp(save_dir, species_names, proxy=None, clip=None):
    """Download .gbk annotation files for the given species from the NCBI FTP.

    Scans genomes/Bacteria on ftp.ncbi.nih.gov for directories whose name
    starts with a species name (spaces replaced by underscores), downloads
    every .gbk file found, and saves it as <dir_idx>_<species_idx>_<basename>
    in save_dir.

    Parameters:
        save_dir: directory to save downloaded files into (created if missing).
        species_names: iterable of species name strings; blank lines and lines
            starting with '#' are skipped as comments.
        proxy: optional 'host:port' string for an HTTP proxy.
        clip: NOTE(review): accepted but unused in this function — TODO confirm
            whether callers rely on it or it can be dropped.

    Returns:
        0 on success, 1 when species_names is empty.
    """
    if not species_names:
        log.error('   No species names')
        return 1

    if proxy:
        setup_http_proxy(*proxy.split(':'))

    if not isdir(save_dir):
        mkdir(save_dir)

    ftp = FTP('ftp.ncbi.nih.gov')
    try:
        log.debug('   Logging in ' + ftp.login())
        ftp.cwd('genomes/Bacteria')
        for i, dirname in enumerate(ftp.nlst()):
            for sp_i, sp in enumerate(species_names):
                sp = sp.replace(' ', '_').strip()
                # Guard clauses: skip blank/comment lines and non-matching dirs.
                if sp == '' or sp[0] == '#':
                    continue
                if not dirname.startswith(sp):
                    continue

                log.debug('   Scanning ' + dirname)
                for remote_fpath in ftp.nlst(dirname):
                    if not remote_fpath.endswith('.gbk'):
                        continue
                    dest_fname = str(i) + '_' + str(sp_i) + '_' + basename(remote_fpath)
                    dest_fpath = join(save_dir, dest_fname)
                    # Fix: the file object was previously never closed
                    # (open(...).write passed straight to retrbinary), which
                    # can lose buffered data on error; 'with' closes it.
                    with open(dest_fpath, 'wb') as dest_file:
                        ftp.retrbinary('RETR ' + remote_fpath, dest_file.write)

                    rec = SeqIO.read(dest_fpath, 'gb')
                    log.info('       Definition: ' + rec.description)
                    log.info('       saved ' + dest_fpath)
                    log.info('')
    finally:
        # Fix: release the FTP connection even if a transfer fails.
        ftp.close()
    return 0