def main(args):
    """Run one autocorrelation calculation and write the result to a file.

    The input file is opened, ``acc`` is called on the open handle and the
    value it returns is written to ``args.outputfile`` with the writer that
    matches ``args.f`` ('tab', 'svm' or 'csv').

    :param args: an object of the arguments.  ``args.alphabet`` is replaced
        in place by the matching ``index_list`` entry when it is 'DNA',
        'RNA' or 'Protein'.
    """
    with open(args.inputfile) as seq_file:
        k = read_k(args.alphabet, args.method, 0)

        # Indices chosen by the user; empty when no index file was given.
        if args.i is None:
            ind_list = []
        else:
            from pse import read_index
            ind_list = read_index(args.i)

        # Default indices for the alphabet.  They stay empty for an
        # unrecognised alphabet, or for DNA when k is neither 2 nor 3.
        default_e = []
        alphabet_name = args.alphabet
        if alphabet_name == 'DNA':
            args.alphabet = index_list.DNA
            if k == 2:
                default_e = const.DI_INDS_6_DNA
            elif k == 3:
                default_e = const.TRI_INDS_DNA
        elif alphabet_name == 'RNA':
            args.alphabet = index_list.RNA
            default_e = const.DI_INDS_RNA
        elif alphabet_name == 'Protein':
            args.alphabet = index_list.PROTEIN
            default_e = const.INDS_3_PROTEIN

        # Map the method onto the theta type passed to acc().  An unknown
        # method is reported but the run continues with theta type 1.
        method = args.method
        if method in const.METHODS_AC:
            theta_type = 1
        elif method in const.METHODS_CC:
            theta_type = 2
        elif method in const.METHODS_ACC:
            theta_type = 3
        else:
            theta_type = 1
            print("Method error!")

        # The default indices are used only when the user supplied no index
        # file, no explicit indices and did not ask for all properties.
        use_defaults = (args.e is None and len(ind_list) == 0
                        and args.a is False)
        chosen_indices = default_e if use_defaults else ind_list
        res = acc(seq_file, k, args.lag, chosen_indices, args.alphabet,
                  extra_index_file=args.e, all_prop=args.a,
                  theta_type=theta_type)

    # Write the result in the requested format.
    out_format = args.f
    if out_format == 'tab':
        from util import write_tab
        write_tab(res, args.outputfile)
    elif out_format == 'svm':
        from util import write_libsvm
        labels = [args.l] * len(res)
        write_libsvm(res, labels, args.outputfile)
    elif out_format == 'csv':
        from util import write_csv
        write_csv(res, args.outputfile)
"Protein: Hydrophobicity, Hydrophilicity, Mass.") parse.add_argument('-e', help="The user-defined indices file.\n") parse.add_argument('-all_index', dest='a', action='store_true', help="Choose all physicochemical indices") parse.add_argument('-no_all_index', dest='a', action='store_false', help="Do not choose all physicochemical indices, default.") parse.set_defaults(a=False) parse.add_argument('-f', default='tab', choices=['tab', 'svm', 'csv'], help="The output format (default = tab).\n" "tab -- Simple format, delimited by TAB.\n" "svm -- The libSVM training data format.\n" "csv -- The format that can be loaded into a spreadsheet program.") parse.add_argument('-l', default='+1', choices=['+1', '-1'], help="The libSVM output file label.") args = parse.parse_args() args.k = read_k(args.alphabet, args.method, args.k) # print(args) if check_args(args, 'pse.py'): print("Calculating...") start_time = time.time() main(args) print("Done.") print("Used time: %ss" % (time.time() - start_time)) # Test dna type1. # print("Test di_dna, type1.") # alphabet = index_list.DNA # res = pseknc(input_data=['GACTGAACTGCACTTTGGTTTCATATTATTTGCTC'], k=2, w=0.5, lamada=1, # phyche_list=['Tilt', 'Roll', 'Rise', 'Slide', 'Shift'], # extra_index_file="data/test_ext_dna.txt", alphabet=alphabet)
def main(args):
    """The main process of autocorrelation methods.

    For every input file a feature matrix is computed with ``acc`` (default
    branch), ``autocorrelation`` (MAC / GAC / NMBAC) or ``pdt`` (PDT) and
    handed to ``write_to_file``.  Finally the paths of the output files that
    exist on disk are printed.

    :param args: an object of the arguments.  For the MAC / GAC / NMBAC
        methods ``args.alphabet`` is replaced in place by the matching
        ``index_list`` entry and a ``None`` value of ``args.a`` is
        normalised to ``False``.
    :return: ``False`` when the arguments are rejected, otherwise ``None``.
    """
    file_list = args.inputfiles
    # argparse leaves an omitted ``nargs='*'`` option as None; normalise it
    # so the friendly message below is shown instead of a TypeError.
    label_list = args.labels or []
    output_format = args.f

    if not file_list:
        print('Input files not found.')
        return False

    if output_format == 'svm':
        if len(label_list) == 0:
            print('The labels of the input files should be set.')
            return False
        if len(file_list) != len(label_list):
            print(
                'The number of labels should be the same as that of the input files.'
            )
            return False

    if args.out is not None:
        outputfile_list = args.out
        if len(outputfile_list) != len(file_list):
            print(
                'The number of output files should be the same as that of input files.'
            )
            return False
    else:
        outputfile_list = _default_output_names(file_list, output_format)

    # Labels are only meaningful for the svm format; use placeholders so the
    # zip() calls below still iterate over every file.
    if output_format != 'svm':
        label_list = [0] * len(file_list)

    method_upper = args.method.upper()

    if method_upper not in ['MAC', 'GAC', 'NMBAC', 'PDT']:
        # Everything up to the acc() call does not depend on the input file,
        # so it is resolved once instead of once per file.
        k = read_k(args.alphabet, args.method, 0)

        # Get index_list.
        if args.i is not None:
            from pse import read_index
            ind_list = read_index(args.i)
        else:
            ind_list = []

        # Set Pse default index_list.
        default_e = []
        if args.alphabet == 'DNA':
            alphabet_list = index_list.DNA
            if k == 2:
                default_e = const.DI_INDS_6_DNA
            elif k == 3:
                default_e = const.TRI_INDS_DNA
        elif args.alphabet == 'RNA':
            alphabet_list = index_list.RNA
            default_e = const.DI_INDS_RNA
        elif args.alphabet == 'Protein':
            alphabet_list = index_list.PROTEIN
            default_e = const.INDS_3_PROTEIN
        else:
            print('The alphabet should be DNA, RNA or Protein.')
            return False

        theta_type = 1
        if args.method in const.METHODS_AC:
            theta_type = 1
        elif args.method in const.METHODS_CC:
            theta_type = 2
        elif args.method in const.METHODS_ACC:
            theta_type = 3
        else:
            # NOTE(review): the run continues with theta_type 1 after this
            # message, as before; confirm whether it should abort instead.
            print("Method error!")

        # Default Pse indices are used only when the user gave no index
        # file, no explicit indices and did not request all properties.
        if args.e is None and len(ind_list) == 0 and args.a is False:
            indices = default_e
        else:
            indices = ind_list

        for input_file, output_file, label in zip(file_list, outputfile_list,
                                                  label_list):
            with open(input_file) as f:
                res = acc(f, k, args.lag, indices, alphabet_list,
                          extra_index_file=args.e, all_prop=args.a,
                          theta_type=theta_type)
            write_to_file(res, output_format, label, output_file)

    if method_upper in ['MAC', 'GAC', 'NMBAC']:
        # NOTE(review): the check accepts 0 and 10 although the message says
        # "larger than 0 and smaller than 10"; confirm which is intended.
        if args.lamada < 0 or args.lamada > 10:
            print(
                'The value of lamada should be larger than 0 and smaller than 10.'
            )
            return False

        # The original compared (``args.a == False``) instead of assigning,
        # and chained the alphabet test with ``elif`` so that nothing was
        # calculated when ``args.a`` was None.
        if args.a is None:
            args.a = False

        props = None
        k = None
        if args.alphabet == 'DNA':
            args.alphabet = index_list.DNA
            if args.oli == 0:
                k = 2
                props = (const.ALL_DI_DNA_IND if args.a
                         else const.DEFAULT_DI_DNA_IND)
            elif args.oli == 1:
                k = 3
                props = (const.ALL_TRI_DNA_IND if args.a
                         else const.DEFAULT_TRI_DNA_IND)
        elif args.alphabet == 'RNA':
            args.alphabet = index_list.RNA
            k = 2
            props = const.ALL_RNA_IND if args.a else const.DEFAULT_RNA_IND

        # Other alphabets / -oli values produce no output, as before.
        if props is not None:
            for input_file, output_file, label in zip(
                    file_list, outputfile_list, label_list):
                res = autocorrelation(autoc=args.method,
                                      inputfile=input_file,
                                      props=props,
                                      k=k,
                                      l=args.lamada,
                                      alphabet=args.alphabet)
                write_to_file(res, output_format, label, output_file)

    if method_upper == 'PDT':
        if args.alphabet != 'Protein':
            print('PDT method is only available for Protein sequences.')
            return False
        if args.lamada < 1 or args.lamada > 15:
            print(
                'The value of -lamada should be larger than 0 and smaller than 16.'
            )
            return False
        for input_file, output_file, label in zip(file_list, outputfile_list,
                                                  label_list):
            res = pdt(input_file, args.lamada)
            write_to_file(res, output_format, label, output_file)

    # Report the output files that were actually created.  The header is
    # printed before the first existing file, even if that is not the first
    # entry of the list.
    header_printed = False
    for output_file in outputfile_list:
        out_with_full_path = os.path.abspath(output_file)
        if os.path.isfile(out_with_full_path):
            if not header_printed:
                print('The output file(s) can be found here:')
                header_printed = True
            print(out_with_full_path)


def _default_output_names(file_list, output_format):
    """Derive '<stem>_<format><ext>' output names for the known formats."""
    if output_format not in ('svm', 'tab', 'csv'):
        return []
    names = []
    for in_file_name in file_list:
        stem, ext = os.path.splitext(in_file_name)
        names.append(stem + '_' + output_format + ext)
    return names
def main(args):
    """Compute autocorrelation features for one file and save them.

    ``acc`` is called on the opened ``args.inputfile`` and its return value
    is written to ``args.outputfile`` by the writer selected through
    ``args.f`` ("tab", "svm" or "csv").

    :param args: an object of the arguments.  When ``args.alphabet`` is
        "DNA", "RNA" or "Protein" it is overwritten with the corresponding
        ``index_list`` entry.
    """
    with open(args.inputfile) as handle:
        k = read_k(args.alphabet, args.method, 0)

        # Start from "no user indices" and load them only if -i was given.
        ind_list = []
        if args.i is not None:
            from pse import read_index

            ind_list = read_index(args.i)

        # Pick the default indices for the requested alphabet; they remain
        # empty when nothing below matches.
        default_e = []
        requested = args.alphabet
        if requested == "DNA":
            args.alphabet = index_list.DNA
            if k == 2:
                default_e = const.DI_INDS_6_DNA
            elif k == 3:
                default_e = const.TRI_INDS_DNA
        elif requested == "RNA":
            args.alphabet, default_e = index_list.RNA, const.DI_INDS_RNA
        elif requested == "Protein":
            args.alphabet, default_e = index_list.PROTEIN, const.INDS_3_PROTEIN

        # Theta type per method family; unknown methods are reported and
        # fall back to theta type 1.
        if args.method in const.METHODS_AC:
            theta_type = 1
        elif args.method in const.METHODS_CC:
            theta_type = 2
        elif args.method in const.METHODS_ACC:
            theta_type = 3
        else:
            theta_type = 1
            print("Method error!")

        # Fall back to the default indices only when the user supplied
        # neither an extra index file, nor indices, nor the all-index flag.
        if args.e is None and len(ind_list) == 0 and args.a is False:
            selected = default_e
        else:
            selected = ind_list
        res = acc(
            handle,
            k,
            args.lag,
            selected,
            args.alphabet,
            extra_index_file=args.e,
            all_prop=args.a,
            theta_type=theta_type,
        )

    # Write the result file in the requested format.
    fmt = args.f
    if fmt == "tab":
        from util import write_tab

        write_tab(res, args.outputfile)
    elif fmt == "svm":
        from util import write_libsvm

        write_libsvm(res, [args.l] * len(res), args.outputfile)
    elif fmt == "csv":
        from util import write_csv

        write_csv(res, args.outputfile)
action='store_false', help="Do not choose all physicochemical indices, default.") parse.set_defaults(a=False) parse.add_argument( '-f', default='tab', choices=['tab', 'svm', 'csv'], help="The output format (default = tab).\n" "tab -- Simple format, delimited by TAB.\n" "svm -- The libSVM training data format.\n" "csv -- The format that can be loaded into a spreadsheet program.") parse.add_argument( '-labels', nargs='*', help="The labels of the input files.\n" "For binary classification problem, the labels can only be '+1' or '-1'.\n" "For multiclass classification problem, the labels can be set as a list of integers." ) args = parse.parse_args() args.k = read_k(args.alphabet, args.method, args.k) # print(args) if check_args(args, 'pse.py'): print("Calculating...") start_time = time.time() main(args) print("Done.") print("Used time: %.2fs" % (time.time() - start_time))
def main(args):
    """Compute autocorrelation features for one input file and write them.

    Methods other than MAC / GAC / NMBAC are computed with ``acc`` on the
    opened input file; MAC / GAC / NMBAC are computed with
    ``autocorrelation``.  The result is written to ``args.outputfile`` in
    the format selected by ``args.f`` ('tab', 'svm' or 'csv').

    :param args: an object of the arguments.  ``args.alphabet`` is replaced
        in place by the matching ``index_list`` entry, a ``None`` value of
        ``args.a`` is normalised to ``False`` for MAC / GAC / NMBAC, and
        ``args.l`` receives a default label for the svm format.
    :return: ``False`` when the arguments are rejected or nothing could be
        calculated, otherwise ``None``.
    """
    method_upper = args.method.upper()
    res = None

    if method_upper not in ['MAC', 'GAC', 'NMBAC']:
        with open(args.inputfile) as f:
            k = read_k(args.alphabet, args.method, 0)

            # Get index_list.
            if args.i is not None:
                from .pse import read_index
                ind_list = read_index(args.i)
            else:
                ind_list = []

            # Set Pse default index_list.
            default_e = []
            if args.alphabet == 'DNA':
                args.alphabet = index_list.DNA
                if k == 2:
                    default_e = const.DI_INDS_6_DNA
                elif k == 3:
                    default_e = const.TRI_INDS_DNA
            elif args.alphabet == 'RNA':
                args.alphabet = index_list.RNA
                default_e = const.DI_INDS_RNA
            elif args.alphabet == 'Protein':
                args.alphabet = index_list.PROTEIN
                default_e = const.INDS_3_PROTEIN

            theta_type = 1
            if args.method in const.METHODS_AC:
                theta_type = 1
            elif args.method in const.METHODS_CC:
                theta_type = 2
            elif args.method in const.METHODS_ACC:
                theta_type = 3
            else:
                # NOTE(review): the run continues with theta_type 1 after
                # this message, as before; confirm whether it should abort.
                print("Method error!")

            # Default Pse indices are used only when the user gave no index
            # file, no explicit indices and did not request all properties.
            if args.e is None and len(ind_list) == 0 and args.a is False:
                indices = default_e
            else:
                indices = ind_list
            res = acc(f, k, args.lag, indices, args.alphabet,
                      extra_index_file=args.e, all_prop=args.a,
                      theta_type=theta_type)
    else:
        # NOTE(review): the check accepts 0 and 10 although the message says
        # "larger than 0 and smaller than 10"; confirm which is intended.
        if args.lamada < 0 or args.lamada > 10:
            print(
                'The value of lamada should be larger than 0 and smaller than 10.'
            )
            return False

        # The original compared (``args.a == False``) instead of assigning,
        # and chained the alphabet test with ``elif`` so that nothing was
        # calculated when ``args.a`` was None.
        if args.a is None:
            args.a = False

        props = None
        k = None
        if args.alphabet == 'DNA':
            args.alphabet = index_list.DNA
            if args.oli == 0:
                k = 2
                props = (const.ALL_DI_DNA_IND if args.a
                         else const.DEFAULT_DI_DNA_IND)
            elif args.oli == 1:
                k = 3
                props = (const.ALL_TRI_DNA_IND if args.a
                         else const.DEFAULT_TRI_DNA_IND)
        elif args.alphabet == 'RNA':
            args.alphabet = index_list.RNA
            k = 2
            props = const.ALL_RNA_IND if args.a else const.DEFAULT_RNA_IND

        if props is not None:
            res = autocorrelation(autoc=args.method,
                                  inputfile=args.inputfile,
                                  props=props,
                                  k=k,
                                  l=args.lamada,
                                  alphabet=args.alphabet)

    # Previously this situation surfaced as a NameError on ``res`` below.
    if res is None:
        print('No features were calculated; please check the method, '
              'alphabet and -oli arguments.')
        return False

    # Write correspond res file.
    if args.f == 'tab':
        from util import write_tab
        write_tab(res, args.outputfile)
    elif args.f == 'svm':
        if args.multi == 0 and args.l is None:
            args.l = '+1'
        elif args.multi == 0 and (args.l != '+1' and args.l != '-1'):
            print(
                "For binary classification, the label should be either '+1' or '-1'."
            )
            return False
        elif args.multi == 1 and args.l is None:
            args.l = '0'
        elif args.multi == 1 and args.l is not None:
            # Only validate that the label parses; the original string is
            # what gets written.
            try:
                int(args.l)
            except ValueError:
                print('The labels should be integer.')
                return False
        from util import write_libsvm
        write_libsvm(res, [args.l] * len(res), args.outputfile)
    elif args.f == 'csv':
        from util import write_csv
        write_csv(res, args.outputfile)
def main(args):
    """Predict a label for every sequence of a FASTA file with svm-predict.

    Steps, in order:

    1. read the sequence names from ``args.inputfile``;
    2. build one feature row per sequence by joining the
       ``make_kmer_vector`` row with the ``acc`` row;
    3. write the rows in libSVM format and run the external ``svm-predict``
       program with the model selected by ``args.s``;
    4. translate the prediction file into ``args.outputfile``
       (columns: ID, Label, Prob);
    5. delete the two temporary files created in step 3.

    :param args: an object of the arguments; ``inputfile``, ``outputfile``
        and ``s`` are read.
    """
    # Imported locally so the block does not depend on the file header.
    import subprocess

    with open(args.inputfile) as af:
        names = [entry.name for entry in read_fasta(af)]

    # kmer.py -f tab -l +1 -r 1 -k <k> TAIR10_DHSs.fas ... DNA
    res_kmer = make_kmer_vector(k=2, alphabet=index_list.DNA,
                                filename=args.inputfile, revcomp=True)

    # acc.py -e user_indices.txt -f svm -l +1 -lag <lag> ... DNA DAC
    if args.s == 0:
        model_file = 'pDHSdata_TAIR_model.txt'
        lag = 3
    else:
        model_file = 'pDHSdata_TIGR_model.txt'
        lag = 8

    with open(args.inputfile) as f:
        k = read_k('DNA', 'DAC', 0)
        res_acc = acc(f, k, lag, [], index_list.DNA,
                      extra_index_file='user_indices.txt',
                      all_prop=False, theta_type=1)

    # features = revckmer + dac, formed by joining each pair of rows.
    res = [kmer_row + acc_row for kmer_row, acc_row in zip(res_kmer, res_acc)]

    featuresfile = args.inputfile + '_tmp_features.txt'
    tmp_predict_result_file = args.inputfile + '_tmp_result.txt'

    try:
        # Write correspond res file.
        from util import write_libsvm
        write_libsvm(res, ['+1'] * len(res), featuresfile)

        # Predict the result.  An argument list (no shell) keeps paths with
        # spaces or shell metacharacters from breaking the command.
        if sys.platform == 'win32':
            predictor = 'svm-predict'
        else:
            predictor = './svm-predict'
        subprocess.call([predictor, '-b', '1', '-q', featuresfile,
                         model_file, tmp_predict_result_file])

        with open(args.outputfile, 'w') as pf, \
                open(tmp_predict_result_file) as nf:
            for count, raw_line in enumerate(nf):
                line = raw_line.strip()
                # Stop at the first blank line or once every name is used.
                if not line or count > len(names):
                    break
                if count == 0:
                    # The first line of the prediction file is replaced by
                    # our own column header.
                    pf.write('ID\t\tLabel\t\tProb\n')
                    continue
                fields = line.split()
                label = int(fields[0])
                # NOTE(review): assumes column 1 is the probability of the
                # positive class and column 2 that of the negative class --
                # confirm against the label order stored in the model file.
                true_prob, false_prob = fields[1], fields[2]
                if label == -1:
                    pf.write(names[count - 1] + '\t\t' + 'Non DHS' + '\t\t'
                             + str(false_prob) + '\n')
                else:
                    pf.write(names[count - 1] + '\t\t' + 'DHS' + '\t\t'
                             + str(true_prob) + '\n')
    finally:
        # Remove only the temporary files created above.  The original
        # deleted every file in the working directory whose name contained
        # 'tmp', which could destroy unrelated files and missed its own
        # files when the input lived in another directory.
        for tmp_file in (featuresfile, tmp_predict_result_file):
            if os.path.isfile(tmp_file):
                os.remove(tmp_file)
def GetVariousClassFeatures(samples_file, path):
    """Compute several feature encodings for the samples and save them.

    Each encoding is saved with ``np.savetxt`` under ``path`` and its
    running time is printed.  Afterwards the per-parameter files of each
    family are merged line by line into 'SpectrumProfile.txt',
    'MismatchProfile.txt', 'RevcKmer.txt' and 'Pse.txt'.

    :param samples_file: path of the file containing the sample sequences.
    :param path: output directory prefix; it is created when missing and is
        concatenated directly with the file names, so it should end with a
        path separator.
    """
    if not os.path.exists(path):
        os.makedirs(path)

    # Close the sample file as soon as the sequences have been read.
    with open(samples_file, 'r') as fp:
        sample = GetSequences(fp, 'ACGT')
    instances = array(sample)
    print('The number of samples: %d' % (len(sample)))

    # NOTE(review): ``alphabet`` is not defined in this function; it is
    # presumably a module-level name -- confirm.

    # 1 Spectrum Profile for k=1,2,3,4,5
    for k in range(1, 6):
        _timed_save(path + str(k) + '-SpectrumProfile.txt',
                    str(k) + '-Spectrum Profile',
                    GetSpectrumProfile, k, samples_file)

    # 2 Mismatch Profile for (k,m)=(3,1),(4,1),(5,1)
    for (k, m) in [(3, 1), (4, 1), (5, 1)]:
        _timed_save(path + str((k, m)) + '-MismatchProfile.txt',
                    str((k, m)) + '-Mismatch Profile',
                    GetMismatchProfile, instances, alphabet, k, m)

    # 3 Reverse Compliment Kmer for k=1,2,3,4,5
    for k in range(1, 6):
        _timed_save(path + str(k) + '-RevcKmer.txt', str(k) + '-RevcKmer',
                    GetRevcKmer, k)

    # 4 Parallel Correlation Pseudo Dinucleotide Composition
    _timed_save(path + 'PCPseDNC.txt', 'PCPseDNC', GetPCPseDNC, 3, 0.9)

    # 5 Parallel Correlation Pseudo Trinucleotide Composition
    _timed_save(path + 'PCPseTNC.txt', 'PCPseTNC', GetPCPseTNC, 3, 0.5)

    # 6 Series Correlation Pseudo Dinucleotide Composition
    _timed_save(path + 'SCPseDNC.txt', 'SCPseDNC', GetSCPseDNC, 5, 0.1)

    # 7 Series Correlation Pseudo Trinucleotide Composition
    _timed_save(path + 'SCPseTNC.txt', 'SCPseTNC', GetSCPseTNC, 10, 0.1)

    # 8 Dinucleotide-based auto covariance (read_k is part of the timing).
    _timed_save(path + 'DAC.txt', 'DAC',
                lambda: GetDAC(instances, read_k('DNA', 'DAC', 0), 8,
                               alphabet))

    # Merge the per-parameter files of every family into one file each.
    _merge_feature_files(
        path, [str(k) + '-SpectrumProfile.txt' for k in range(1, 6)],
        'SpectrumProfile.txt')
    _merge_feature_files(
        path,
        [str(km) + '-MismatchProfile.txt' for km in [(3, 1), (4, 1), (5, 1)]],
        'MismatchProfile.txt')
    _merge_feature_files(
        path, [str(k) + '-RevcKmer.txt' for k in range(1, 6)],
        'RevcKmer.txt')
    _merge_feature_files(
        path,
        ['PCPseDNC.txt', 'PCPseTNC.txt', 'SCPseDNC.txt', 'SCPseTNC.txt'],
        'Pse.txt')


def _timed_save(out_file, label, compute, *call_args):
    """Save ``compute(*call_args)`` with np.savetxt and print the time used."""
    # time.clock() was removed in Python 3.8; prefer perf_counter and fall
    # back to clock only on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock
    tic = timer()
    np.savetxt(out_file, compute(*call_args))
    toc = timer()
    print('Coding time for ' + label + ':%.3f minutes' % ((toc - tic) / 60))


def _merge_feature_files(path, part_names, merged_name):
    """Join the matching lines of several files into one merged file.

    Every output line is the corresponding line of each part followed by a
    single space.  Merging stops when the last part is exhausted.
    """
    handles = []
    try:
        merged = open(path + merged_name, 'w')
        handles.append(merged)
        parts = []
        for part_name in part_names:
            part = open(path + part_name, 'r')
            handles.append(part)
            parts.append(part)
        while True:
            lines = [part.readline() for part in parts]
            if not lines[-1]:
                break
            merged.write(
                ''.join(line.strip('\n') + ' ' for line in lines) + '\n')
    finally:
        # Close everything that was opened, even if a part file is missing.
        for handle in handles:
            handle.close()