def collect_proteomes_and_annotaitons(input_dir):
    """Sort the regular files in ``input_dir`` into proteomes and annotations.

    A file is first selected by its extension and then confirmed by actually
    parsing it with ``SeqIO`` (presumably Biopython's ``Bio.SeqIO`` -- confirm
    against the file's imports):

    * ``.fasta``, ``.faa``, ``.fa``, ``.fsa`` -- accepted as a proteome when
      at least one FASTA record can be read from it;
    * ``.gb``, ``.genbank``, ``.gbk`` -- accepted as an annotation when
      ``SeqIO.read`` succeeds on it.

    Sub-directories and files with any other extension are ignored.
    ``interrupt`` is called when the directory listing is empty.

    Args:
        input_dir: Path of the directory to scan (not recursive).

    Returns:
        A ``(proteomes, annotations)`` tuple of lists holding the paths
        (``input_dir`` joined with the file name) of the accepted files.
    """
    fasta_exts = ('.fasta', '.faa', '.fa', '.fsa')
    genbank_exts = ('.gb', '.genbank', '.gbk')

    proteomes = []
    annotations = []

    files = listdir(input_dir)
    if not files:
        interrupt('Directory contains no files.')

    for name in files:
        path = join(input_dir, name)
        if not isfile(path):
            continue

        ext = splitext(path)[1]

        if ext in fasta_exts:
            try:
                log.debug(' Checking if %s is fasta.' % path)
                next(SeqIO.parse(path, 'fasta'))
            except (ValueError, StopIteration):
                # ValueError: the parser rejected the content.
                # StopIteration: the file holds no records at all; without
                # catching it an empty file would abort the whole scan.
                pass
            else:
                proteomes.append(path)

        elif ext in genbank_exts:
            try:
                log.debug(' Checking if %s is genbank.' % path)
                SeqIO.read(path, 'genbank')
            except Exception as e:
                # Best effort: an unreadable file is logged and skipped.
                log.debug(str(e) + ', ' + path)
            else:
                annotations.append(path)

    # Callers unpack the result as ``proteomes, annotations``.
    return proteomes, annotations
def run(starting_from_here=False):
    """Prepare proteome input from the source selected by the options in ``p``.

    NOTE(review): a second definition of ``run`` with the same logic appears
    later in this file and replaces this one at import time.

    The first matching mode wins:

    1. ``p.species_list`` -- read the list, fetch annotations by species name
       and build proteomes from them.
    2. ``p.ids_list`` -- read the list, fetch annotations by id and build
       proteomes from them.
    3. Otherwise -- collect local files from ``p.proteomes`` and/or
       ``p.annotations``. Collected annotations are copied into
       ``config.annotations_dir`` and turned into proteomes; failing that,
       collected proteomes are passed to ``adjust_proteomes``, optionally
       after fetching annotations for them when ``p.download_anno`` is set.

    Args:
        starting_from_here: Not used by this function.

    Returns:
        ``4`` when modes 1 or 2 find no Entrez connection; a non-zero fetch
        result when fetching fails; otherwise whatever ``make_proteomes`` or
        ``adjust_proteomes`` returns. ``None`` when mode 3 collected neither
        proteomes nor annotations.
    """
    if p.species_list:
        if not test_entrez_conn():
            log.error(' No internet connection: cannot fetch annotations.')
            return 4

        log.debug(' Using species list: ' + str(p.species_list))
        gb_ids = read_list(p.species_list)
        log.debug('species_list: ' + str(gb_ids))
        res = fetch_annotations_species_name_entrez(
            config.annotations_dir, gb_ids, p.proxy)
        if res != 0:
            return res
        return make_proteomes(config.annotations_dir, config.proteomes_dir)

    if p.ids_list:
        if not test_entrez_conn():
            log.error('No internet connection: cannot fetch annotations.')
            return 4

        log.debug(' Using ref ids: ' + str(p.ids_list))
        ref_ids = read_list(p.ids_list)
        # Forward the proxy here as well, matching the other call to
        # fetch_annotations_for_ids below; previously it was dropped.
        res = fetch_annotations_for_ids(
            config.annotations_dir, ref_ids, p.proxy)
        if res != 0:
            return res
        return make_proteomes(config.annotations_dir, config.proteomes_dir)

    # Local-files mode.
    proteomes, annotations = [], []

    if p.proteomes:
        proteomes, annotations = collect_proteomes_and_annotaitons(p.proteomes)
        if not proteomes:
            interrupt('No fasta found in ' + p.proteomes)

    if p.annotations:
        # Results from p.annotations replace anything collected above.
        proteomes, annotations = collect_proteomes_and_annotaitons(p.annotations)
        if not annotations:
            interrupt('No gb files found in ' + p.annotations)

    # Annotations take precedence when both kinds of files were collected.
    if annotations:
        if not isdir(config.annotations_dir):
            mkdir(config.annotations_dir)
        for annotation in annotations:
            copy(annotation, config.annotations_dir)
        return make_proteomes(config.annotations_dir, config.proteomes_dir)

    if proteomes:
        if not isdir(config.proteomes_dir):
            mkdir(config.proteomes_dir)

        if p.download_anno:
            if not test_entrez_conn():
                # Not fatal in this mode: log it and carry on without
                # fetching annotations.
                log.error(' Warning: no internet connection, cannot fetch annotations. '
                          'A reduced version of orthogroups.txt with no annotations will be produced.')
            else:
                # The proteome file names (without extension) serve as ids.
                gb_ids = [splitext(basename(prot_file))[0]
                          for prot_file in proteomes]
                log.debug('ids_list: ' + str(gb_ids))
                res = fetch_annotations_for_ids(
                    config.annotations_dir, gb_ids, p.proxy)
                if res > 0:
                    return res
                if res == -1:
                    # A result of -1 switches annotation download off for
                    # the rest of the run instead of aborting.
                    p.download_anno = False

        return adjust_proteomes(proteomes, config.proteomes_dir, p.prot_id_field)

    # Nothing was collected; the original fell off the end here too.
    return None
def run(starting_from_here=False):
    """Prepare proteome input from the source selected by the options in ``p``.

    NOTE(review): an earlier definition of ``run`` with the same logic
    precedes this one in the file; this later definition is the one that
    stays bound at import time.

    The first matching mode wins:

    1. ``p.species_list`` -- read the list, fetch annotations by species name
       and build proteomes from them.
    2. ``p.ids_list`` -- read the list, fetch annotations by id and build
       proteomes from them.
    3. Otherwise -- collect local files from ``p.proteomes`` and/or
       ``p.annotations``. Collected annotations are copied into
       ``config.annotations_dir`` and turned into proteomes; failing that,
       collected proteomes are passed to ``adjust_proteomes``, optionally
       after fetching annotations for them when ``p.download_anno`` is set.

    Args:
        starting_from_here: Not used by this function.

    Returns:
        ``4`` when modes 1 or 2 find no Entrez connection; a non-zero fetch
        result when fetching fails; otherwise whatever ``make_proteomes`` or
        ``adjust_proteomes`` returns. ``None`` when mode 3 collected neither
        proteomes nor annotations.
    """
    if p.species_list:
        if not test_entrez_conn():
            log.error(' No internet connection: cannot fetch annotations.')
            return 4

        log.debug(' Using species list: ' + str(p.species_list))
        gb_ids = read_list(p.species_list)
        log.debug('species_list: ' + str(gb_ids))
        res = fetch_annotations_species_name_entrez(
            config.annotations_dir, gb_ids, p.proxy)
        if res != 0:
            return res
        return make_proteomes(config.annotations_dir, config.proteomes_dir)

    if p.ids_list:
        if not test_entrez_conn():
            log.error('No internet connection: cannot fetch annotations.')
            return 4

        log.debug(' Using ref ids: ' + str(p.ids_list))
        ref_ids = read_list(p.ids_list)
        # Forward the proxy here as well, matching the other call to
        # fetch_annotations_for_ids below; previously it was dropped.
        res = fetch_annotations_for_ids(
            config.annotations_dir, ref_ids, p.proxy)
        if res != 0:
            return res
        return make_proteomes(config.annotations_dir, config.proteomes_dir)

    # Local-files mode.
    proteomes, annotations = [], []

    if p.proteomes:
        proteomes, annotations = collect_proteomes_and_annotaitons(p.proteomes)
        if not proteomes:
            interrupt('No fasta found in ' + p.proteomes)

    if p.annotations:
        # Results from p.annotations replace anything collected above.
        proteomes, annotations = collect_proteomes_and_annotaitons(p.annotations)
        if not annotations:
            interrupt('No gb files found in ' + p.annotations)

    # Annotations take precedence when both kinds of files were collected.
    if annotations:
        if not isdir(config.annotations_dir):
            mkdir(config.annotations_dir)
        for annotation in annotations:
            copy(annotation, config.annotations_dir)
        return make_proteomes(config.annotations_dir, config.proteomes_dir)

    if proteomes:
        if not isdir(config.proteomes_dir):
            mkdir(config.proteomes_dir)

        if p.download_anno:
            if not test_entrez_conn():
                # Not fatal in this mode: log it and carry on without
                # fetching annotations.
                log.error(' Warning: no internet connection, cannot fetch annotations. '
                          'A reduced version of orthogroups.txt with no annotations will be produced.')
            else:
                # The proteome file names (without extension) serve as ids.
                gb_ids = [splitext(basename(prot_file))[0]
                          for prot_file in proteomes]
                log.debug('ids_list: ' + str(gb_ids))
                res = fetch_annotations_for_ids(
                    config.annotations_dir, gb_ids, p.proxy)
                if res > 0:
                    return res
                if res == -1:
                    # A result of -1 switches annotation download off for
                    # the rest of the run instead of aborting.
                    p.download_anno = False

        return adjust_proteomes(proteomes, config.proteomes_dir, p.prot_id_field)

    # Nothing was collected; the original fell off the end here too.
    return None