def match_fasta(self):
    """Interactively pair a FASTA file with a GTF/GFF3 annotation and match them.

    Scans ``self.par['dir_out']`` recursively for candidate files, asks the
    user to pick one FASTA and one GTF/GFF3 file, then dispatches to the
    matcher for the configured database (``self.par['web_site']``).
    """
    files = myIO.dir_os(self.par['dir_out']).incrusive_files()
    # select a fasta file (materialize as list so select_key can display it)
    fa_files = [f for f in files if f.endswith(('.fa', '.fasta'))]
    self.par['match_fa'] = mySystem.system().select_key(fa_files)
    # select a gtf or gff file
    gtf_files = [f for f in files if f.endswith(('.gtf', '.gff3'))]
    self.par['match_gtf'] = mySystem.system().select_key(gtf_files)
    # match — bug fix: use self.par consistently; the original read the
    # module-level global `par` here, ignoring the instance parameters.
    # NOTE(review): 'ENSEML' spelling is kept as-is; it is the key used
    # throughout the rest of this file.
    if self.par['web_site'] == 'ENSEML':
        myGenome.genome(self.par['match_fa']).match_ensembl_fa(
            self.par['match_gtf'])
    elif self.par['web_site'] == 'NCBI':
        myGenome.genome(self.par['match_fa']).match_ncbi_fa(
            self.par['match_gtf'])
def phipseq_alignment(self, sample_name):
    """Run the PhIP-seq pipeline for one sample: align reads, count them,
    and record per-sample timing in the sample log.

    Parameters
    ----------
    sample_name : str
        Key into ``self.par['sample_dirs']`` and ``self.par['sample_to_raw']``.
    """
    # typo fix in the user-facing message (was "Anslysis ... trigerred")
    print('\n######Analysis of {} will be triggered!#####'.format(sample_name))
    # per-sample shallow copy of shared parameters so updates stay local
    sample_var = dict(self.par)
    sample_var['start_time'] = time.time()
    sample_var['sample_name'] = sample_name
    # sample directory (created if missing)
    sample_dir = self.par['sample_dirs'][sample_name]
    sample_var['sample_dir'] = myIO.dir_os(sample_dir).create_dir()
    print('\tSample directory: ', sample_var['sample_dir'])
    # raw data files for this sample
    sample_var['sample_raw_files'] = ','.join(
        sample_var['sample_to_raw'][sample_name])
    print('\tRaw files: ', sample_var['sample_raw_files'])
    # all output files share this prefix
    sample_var['file_head'] = sample_var['sample_dir'] + sample_name
    # default sam file produced by the aligner
    sample_var['sample_sam_file'] = sample_var['file_head'] + '.sam'
    # file of read counts
    sample_var['sample_RC_file'] = sample_var['file_head'] + '_RC.txt'
    sample_var['sample_pro_sumRC_file'] = sample_var[
        'file_head'] + '_pro_sumRC.txt'
    sample_var['sample_pro_maxRC_file'] = sample_var[
        'file_head'] + '_pro_maxRC.txt'
    # file for saturation analysis
    sample_var['sample_saturation_file'] = sample_var[
        'file_head'] + '_saturation.txt'
    # sample log
    sample_var['sample_log'] = sample_var['file_head'] + '.log'
    # sequence alignment (output is a sam file)
    if sample_var['phip_alignment'] == 'yes':
        print("\n###sequence alignment", sample_var['tool_aligner'])
        if sample_var['tool_aligner'] == 'bowtie1':
            myAlign.alignment(sample_var).bowtie1_alignment()
    # count reads
    if sample_var['phip_counting'] == 'yes':
        # RC matrix by peptides
        myAlign.alignment(sample_var).count_reads()
        # RC matrix by proteins — only when an annotation file is configured
        # (idiom: membership test on the dict, not on .keys())
        if 'file_annotation' in self.par:
            self.combine_peptides(sample_var)
    # update sample log with the elapsed time for this sample
    sample_times = mySystem.system().get_time(sample_var['start_time'])
    sample_times['sample_name'] = sample_name
    myIO.file_os(sample_var['sample_log'], '=').line_replace(sample_times)
def genome_annot(self):
    """Interactively choose a data type and download genome-annotation
    files from the configured public database (ENSEMBL or NCBI).

    Bug fix: the original body read the module-level global ``par``;
    it now consistently uses ``self.par``.
    """
    if self.par['web_site'] == 'ENSEML':
        # 1: select data type
        data_types = ['dna_fa', 'genome_CDS', 'gtf', 'gff', 'protein']
        self.par['data_type'] = mySystem.system().select_key(data_types)
        # 2: download
        if self.par['data_type'] == 'dna_fa':
            myDownload.ensembl(self.par['specie'],
                               self.par['dir_out']).download_dna()
        else:
            myDownload.ensembl(self.par['specie'],
                               self.par['dir_out']).download_annot(
                self.par['data_type'])
    elif self.par['web_site'] == 'NCBI':
        # 1: select data type
        data_types = ['dna_fa', 'RNA', 'gff', 'protein']
        self.par['data_type'] = mySystem.system().select_key(data_types)
        # 2: download
        if self.par['data_type'] == 'dna_fa':
            myDownload.NCBI(self.par['specie'],
                            self.par['dir_out']).download_dna()
        else:
            myDownload.NCBI(self.par['specie'],
                            self.par['dir_out']).download_annot(
                self.par['data_type'])
def download_idmapping(self):
    """Download and decompress a UniProt idmapping ``.dat`` file chosen
    interactively from the per-organism directory listing.

    Returns
    -------
    str
        Path of the decompressed idmapping file.
    """
    # get web file list
    url_idmapping = self.url + 'knowledgebase/idmapping/by_organism/'
    web_dir, web_files = web(url_idmapping).ls_html()
    # select file — bug fix: in Python 3, filter() returns an iterator
    # which has no .sort(); build a sorted list instead.
    file_names = sorted(x for x in web_files.values() if '.dat.' in x)
    file_name = mySystem.system().select_key(file_names, 'Select web file')
    # download idmapping dat file
    url_file = url_idmapping + file_name
    local_file = self.out_dir + file_name
    web(url_file).download_file(local_file)
    # decompress file
    ungz_file = myIO.file_os(local_file).decompress_gz()
    print('Save ', url_file, ' as ', ungz_file)
    return ungz_file
# check python version: the pipeline requires Python >= 3
# (use sys.version_info rather than parsing the version string)
if sys.version_info[0] <= 2:
    print(sys.version)
    print(
        '\nError: The version of python required for the pipeline running is at least v3.4.~\n'
    )
    # bug fix: sys.exits() does not exist — sys.exit() terminates the script
    sys.exit(2)

########################################
# read variables.txt passed as the first command-line argument
var_file = os.path.abspath(sys.argv[1])
par = myIO.file_os(var_file, '=').to_dict()
par['file_var'] = var_file
# initiate parameters, directories and files
par = launch_phip(par).init_par()

######################################
# main loop
bioPHIPfunc.phip(par).main_loop()

######################################
# end: record total run time in the log file
# NOTE(review): start_time is assumed to be defined earlier in this script
times_dict = mySystem.system().get_time(start_time)
myIO.file_os(par['file_total_log'], '=').line_replace(times_dict)
print('\n\nDuration: ', times_dict['duration'])
print('\n\nGreat! It is done!!!\n\n\n')
# initiate dictionary saving parameters
par = {'in_out': 'Continue'}
annot = download_annot(par)

########################################
# 2: download dir
par['dir_home'] = myIO.dir_os('/home/yuan/data_preparation/').stdin_dir(
    'Enter the directory path storing downloads files')
# bug fix: Python 2 print statement -> print() function (SyntaxError on Py3)
print(par['dir_home'])

while par['in_out'] == 'Continue':
    # 2: select ftp or web site
    web_sites = ['NCBI', 'ENSEML', 'UniProt']
    par['web_site'] = mySystem.system().select_key(
        web_sites, 'Select public database')
    par['dir_out'] = par['dir_home'] + par['web_site'] + '/'
    # 1: select file types
    if par['web_site'] in ['NCBI', 'ENSEML']:
        operations = ['Genome annotation', 'match fasta and gtf']
        par['operations'] = mySystem.system().select_key(
            operations, 'What is your operations')
    elif par['web_site'] == 'UniProt':
        par['operations'] = 'UniProt idmapping'
    # download genome annotations
    if par['operations'] == 'Genome annotation':
        # 2: select specie
        species = ['human', 'mouse', 'rat', 'maize']
        par['specie'] = mySystem.system().select_key(
            species, 'Select specie of genome')