def run(starting_from_here=False):
    """Prepare proteomes from whichever input source ``p`` selects.

    The input modes are checked in this order:

    1. ``p.species_list`` -- read the list, fetch annotations by species
       name, then build proteomes from the annotations directory.
    2. ``p.ids_list`` -- read the list, fetch annotations for those ids,
       then build proteomes from the annotations directory.
    3. Otherwise use local files collected from ``p.proteomes`` and/or
       ``p.annotations``. Annotations take priority over proteomes when
       both were collected.

    Args:
        starting_from_here: Not used by this function; kept so the
            signature stays as callers expect.

    Returns:
        4 when the Entrez connection test fails in modes 1 or 2; a
        non-zero fetch result when fetching fails; otherwise the result of
        ``make_proteomes`` or ``adjust_proteomes``. ``None`` when mode 3
        ends up with neither annotations nor proteomes.
    """
    if p.species_list:
        if not test_entrez_conn():
            log.error(' No internet connection: cannot fetch annotations.')
            return 4

        log.debug(' Using species list: ' + str(p.species_list))
        gb_ids = read_list(p.species_list)
        log.debug('species_list: ' + str(gb_ids))
        res = fetch_annotations_species_name_entrez(
            config.annotations_dir, gb_ids, p.proxy)
        if res != 0:
            return res
        return make_proteomes(config.annotations_dir, config.proteomes_dir)

    if p.ids_list:
        if not test_entrez_conn():
            log.error('No internet connection: cannot fetch annotations.')
            return 4

        log.debug(' Using ref ids: ' + str(p.ids_list))
        ref_ids = read_list(p.ids_list)
        # Forward the proxy like the other fetch calls in this function do;
        # without it this mode ignored the configured proxy.
        res = fetch_annotations_for_ids(
            config.annotations_dir, ref_ids, p.proxy)
        if res != 0:
            return res
        return make_proteomes(config.annotations_dir, config.proteomes_dir)

    # Local-files mode.
    proteomes, annotations = [], []

    if p.proteomes:
        proteomes, annotations = collect_proteomes_and_annotaitons(
            p.proteomes)
        if not proteomes:
            # NOTE(review): interrupt() presumably aborts the run -- confirm.
            interrupt('No fasta found in ' + p.proteomes)

    if p.annotations:
        # When both options are given this overwrites what was collected
        # from p.proteomes above (behaviour kept as in the original).
        proteomes, annotations = collect_proteomes_and_annotaitons(
            p.annotations)
        if not annotations:
            interrupt('No gb files found in ' + p.annotations)

    if annotations:
        if not isdir(config.annotations_dir):
            mkdir(config.annotations_dir)
        for annotation in annotations:
            copy(annotation, config.annotations_dir)
        return make_proteomes(config.annotations_dir, config.proteomes_dir)

    if not proteomes:
        # Nothing to work with; the original fell through here as well.
        return None

    if not isdir(config.proteomes_dir):
        mkdir(config.proteomes_dir)

    if p.download_anno:
        if not test_entrez_conn():
            # Deliberately not fatal: the run continues without annotations.
            log.error(
                ' Warning: no internet connection, cannot fetch annotations. '
                'A reduced version of orthogroups.txt with no annotations will be produced.'
            )
        else:
            # Proteome file names (without extension) are used as the ids.
            gb_ids = [
                splitext(basename(prot_file))[0] for prot_file in proteomes
            ]
            log.debug('ids_list: ' + str(gb_ids))
            res = fetch_annotations_for_ids(
                config.annotations_dir, gb_ids, p.proxy)
            if res > 0:
                return res
            if res == -1:
                # -1 is not treated as an error; it only switches the
                # annotation download flag off for the rest of the run.
                p.download_anno = False

    return adjust_proteomes(proteomes, config.proteomes_dir, p.prot_id_field)
def run(start_from_here=False):
    """Filter the input assemblies, predict proteins, and stage new proteomes.

    Steps:

    1. List the non-hidden entries of ``p.assemblies``; if
       ``config.proteomes_dir`` already exists, pass them through
       ``filter_dublicated_proteomes``.
    2. Filter each assembly into ``assemblies_dir`` as ``<name>.fna``.
    3. Run ``prodigal`` on each successfully filtered assembly, writing the
       proteins to ``config.proteomes_dir`` as ``<name>.fasta``.
    4. Run ``adjust_proteomes`` on the new proteomes and copy them into a
       freshly recreated ``new_proteomes_dir``.

    Args:
        start_from_here: Not used by this function; kept so the signature
            stays as callers expect.

    Returns:
        0 on success; 1 when no assembly passes filtering; otherwise the
        first non-zero result of ``prodigal`` or ``adjust_proteomes``.

    Raises:
        SystemExit: with code 1 when ``filter_dublicated_proteomes`` leaves
            no assemblies to process.
    """
    # Function-local import: sys.exit replaces the bare exit() helper, which
    # the site module only provides for interactive use.
    import sys

    assemblies = [
        join(p.assemblies, f)
        for f in listdir(p.assemblies)
        if f and f[0] != '.'
    ]

    if isdir(config.proteomes_dir):
        assemblies = filter_dublicated_proteomes(
            config.proteomes_dir, assemblies)
        if not assemblies:
            # NOTE(review): Logger.warn is a deprecated alias of
            # Logger.warning if `log` is a stdlib logger -- confirm.
            log.warn(all_considered_warning % config.proteomes_dir)
            sys.exit(1)

    if not isdir(assemblies_dir):
        mkdir(assemblies_dir)
        log.debug(' Created assemblies_dir ' + assemblies_dir)

    # Keep only the assemblies whose filtering succeeded. The original ran
    # prodigal on every assembly, so a single failed filter could abort the
    # step even though only "zero successes" is treated as fatal here.
    # NOTE(review): assumes a non-zero filter_assembly result means the
    # filtered file is unusable -- confirm.
    filtered = []
    for assembly in assemblies:
        asm_name = splitext(basename(assembly))[0]
        filtered_asm = join(assemblies_dir, asm_name + '.fna')
        status = filter_assembly(
            assembly,
            filtered_asm,
            skip=(4, 7, 10, 23, 32, 38),
            skip_after=51)
        if status == 0:
            filtered.append((asm_name, filtered_asm))

    if not filtered:
        log.error('No correct assemblies.')
        return 1

    new_proteomes = []
    for asm_name, filtered_asm in filtered:
        prot = join(config.proteomes_dir, asm_name + '.fasta')
        res = cmdline(
            'prodigal',
            parameters=[
                '-i', filtered_asm,
                '-o', join(config.intermediate_dir, asm_name),
                '-a', prot,
            ])()
        if res != 0:
            return res
        new_proteomes.append(prot)
        log.info('')

    res = adjust_proteomes(
        new_proteomes, config.proteomes_dir, prot_id_field=0)
    if res != 0:
        return res

    # Recreate new_proteomes_dir so it holds only this run's proteomes.
    if exists(new_proteomes_dir):
        rmtree(new_proteomes_dir)
    if not isdir(new_proteomes_dir):
        mkdir(new_proteomes_dir)
    for prot in new_proteomes:
        copy(prot, join(new_proteomes_dir, basename(prot)))

    return 0