def compare(header_lst, seq_lst, substitution_mat, fname):
    """
    Command line wrapper comparing every sequence with the first one.

    This is the implementation of the --compare command line option. All
    the sequences are compared to the first one, and each comparison is
    written as a sequence of digits. A digit represents the distance
    between the PB in the target and the PB in the reference at the same
    position; digits are normalized in the [0; 9] range.

    The function runs the comparison, writes the result to a fasta file,
    and prints information about the process on screen.

    Parameters
    ----------
    header_lst: list of strings
        The sequence headers, in the same order as the sequences
    seq_lst: list of strings
        The sequences, in the same order as the headers
    substitution_mat: numpy.array
        A substitution matrix expressed as similarity scores
    fname: str
        The output file name
    """
    reference_header = header_lst[0]
    digit_matrix = PB.matrix_to_single_digit(substitution_mat)

    print("Normalized substitution matrix (between 0 and 9)")
    print(digit_matrix)
    print("Compare first sequence ({0}) with others".format(reference_header))

    with open(fname, 'w') as outfile:
        # The comparison is started only once the output file is open, so
        # that failures happen in the same order as they always did.
        comparisons = PB.compare_to_first_sequence(header_lst, seq_lst,
                                                   digit_matrix)
        for header, score_lst in comparisons:
            digits = "".join(map(str, score_lst))
            PB.write_fasta_entry(outfile, digits, header)
    print("wrote {0}".format(fname))
def compare(header_lst, seq_lst, substitution_mat, fname):
    """
    Compare all the sequences with the first one (--compare option).

    Every sequence is compared to the first sequence, taken as the
    reference. The result of each comparison is a sequence of digits in
    the [0; 9] range, where each digit stands for the normalized distance
    between the target PB and the reference PB at that position.

    The comparisons are written to a fasta file and progress messages are
    printed on screen.

    Parameters
    ----------
    header_lst: list of strings
        Headers of the sequences, ordered as the sequences
    seq_lst: list of strings
        Sequences, ordered as the headers
    substitution_mat: numpy.array
        Substitution matrix expressed as similarity scores
    fname: str
        Name of the output file
    """
    ref_header = header_lst[0]
    single_digit_mat = PB.matrix_to_single_digit(substitution_mat)
    print("Normalized substitution matrix (between 0 and 9)")
    print(single_digit_mat)
    print("Compare first sequence ({0}) with others".format(ref_header))
    with open(fname, 'w') as fasta_file:
        for name, scores in PB.compare_to_first_sequence(
                header_lst, seq_lst, single_digit_mat):
            # One digit per position, concatenated without separator.
            encoded = "".join(str(score) for score in scores)
            PB.write_fasta_entry(fasta_file, encoded, name)
    print("wrote {0}".format(fname))
def test_read_fasta(self):
    """Check headers and sequences read from the 1BTA reference fasta."""
    expected_headers = ["test_data/1BTA.pdb | chain A"]
    # A single sequence, split over adjacent literals for readability.
    expected_sequences = [
        "ZZdddfklonbfklmmmmmmmmnopafklnoiakl"
        "mmmmmnoopacddddddehkllmmmmngoilmmmm"
        "mmmmmmmmnopacdcddZZ"
    ]

    headers, sequences = PB.read_fasta("test_data/1BTA.pdb.PB.fasta")

    self.assertEqual(headers, expected_headers)
    self.assertEqual(sequences, expected_sequences)
def pbclust_cli():
    """
    Run the PBclust command line.

    Reads the fasta files given by the user and loads the substitution
    matrix. With the --compare option, the first sequence is compared to
    all the others and the program exits with status 0. Otherwise the
    distance matrix is computed and written, the sequences are clustered,
    and the clusters are reported and written.

    Any failure to load the matrix, an unexpected PB in the input, or an
    error reported by R ends the program through ``sys.exit`` with an
    explanatory message.
    """
    # Read user inputs
    options = user_input()
    headers, sequences = PB.read_several_fasta(options.f)

    # Load the substitution matrix
    try:
        matrix = PB.load_substitution_matrix(PB.SUBSTITUTION_MATRIX_NAME)
    except ValueError:
        sys.exit("Substitution matrix is not symetric.")
    except IOError:
        sys.exit("Error reading the substitution matrix.")

    # --compare option: first sequence (in the fasta file) versus all others
    if options.compare:
        compare(headers, sequences, matrix, options.o + ".PB.compare.fasta")
        sys.exit(0)

    # Distance matrix used by the clustering
    try:
        distances = PB.distance_matrix(sequences, matrix)
    except PB.InvalidBlockError as error:
        sys.exit('Unexpected PB in the input ({})'.format(error.block))
    distance_path = options.o + ".PB.dist"
    write_distance_matrix(distances, distance_path)
    print("wrote {0}".format(distance_path))

    # Clustering itself
    try:
        clusters, medoids = PB.hclust(distances, nclusters=options.clusters)
    except PB.RError as error:
        sys.exit('Error with R:\n' + str(error))
    display_clust_report(clusters)
    cluster_path = options.o + ".PB.clust"
    write_clusters(cluster_path, clusters, medoids, headers)
    print("wrote {0}".format(cluster_path))
def pbclust_cli():
    """
    Entry point of the PBclust command line.

    Steps, in order:

    1. read the user options and the input fasta files;
    2. load the substitution matrix;
    3. if --compare was requested, compare the first sequence with all
       the others, then exit with status 0;
    4. otherwise compute and write the distance matrix;
    5. cluster the sequences, display the report and write the clusters.

    Errors while loading the matrix, invalid PBs in the input, and errors
    coming from R stop the program via ``sys.exit`` with a message.
    """
    options = user_input()
    header_lst, seq_lst = PB.read_several_fasta(options.f)

    try:
        sub_mat = PB.load_substitution_matrix(PB.SUBSTITUTION_MATRIX_NAME)
    except ValueError:
        sys.exit("Substitution matrix is not symetric.")
    except IOError:
        sys.exit("Error reading the substitution matrix.")

    if options.compare:
        # Compare the first sequence of the fasta input against the rest;
        # nothing else is done in this mode.
        compare_fname = options.o + ".PB.compare.fasta"
        compare(header_lst, seq_lst, sub_mat, compare_fname)
        sys.exit(0)

    try:
        dist_mat = PB.distance_matrix(seq_lst, sub_mat)
    except PB.InvalidBlockError as exc:
        sys.exit('Unexpected PB in the input ({})'.format(exc.block))

    dist_fname = options.o + ".PB.dist"
    write_distance_matrix(dist_mat, dist_fname)
    print("wrote {0}".format(dist_fname))

    try:
        cluster_ids, medoid_ids = PB.hclust(dist_mat,
                                            nclusters=options.clusters)
    except PB.RError as exc:
        sys.exit('Error with R:\n' + str(exc))

    display_clust_report(cluster_ids)
    clust_fname = options.o + ".PB.clust"
    write_clusters(clust_fname, cluster_ids, medoid_ids, header_lst)
    print("wrote {0}".format(clust_fname))
def pbassign_cli():
    """
    PBassign command line.

    Chains are taken either from PDB files (when ``options.p`` is set) or
    from a trajectory (``options.x`` and ``options.g``). For each chain
    the phi/psi dihedral angles are computed and a PB sequence is
    assigned. The sequences are always written to ``<o>.PB.fasta``; the
    flat and phi/psi outputs are written only when the matching option is
    set. The names of the written files are printed once every file has
    been written.

    Returns early, after a message, when PDB input is requested but no
    PDB file is given.
    """
    options, pdb_name_lst = user_inputs()

    if not options.p:
        # PB assignment of a Gromacs trajectory
        chains = PDB.chains_from_trajectory(options.x, options.g)
    elif pdb_name_lst:
        # PB assignment of PDB structures
        print("{} PDB file(s) to process".format(len(pdb_name_lst)))
        chains = PDB.chains_from_files(pdb_name_lst)
    else:
        print('Nothing to do. Good bye.')
        return

    comments, sequences, dihedral_sets = [], [], []
    for comment, chain in chains:
        angles = chain.get_phi_psi_angles()
        assigned = PB.assign(angles)
        comments.append(comment)
        dihedral_sets.append(angles)
        sequences.append(assigned)

    # Names of the files written, in writing order, reported at the end.
    written = []

    fasta_name = options.o + ".PB.fasta"
    with open(fasta_name, 'w') as outfile:
        PB.write_fasta(outfile, sequences, comments)
    written.append(fasta_name)

    if options.flat:
        flat_name = options.o + ".PB.flat"
        with open(flat_name, 'w') as outfile:
            PB.write_flat(outfile, sequences)
        written.append(flat_name)

    if options.phipsi:
        phipsi_name = options.o + ".PB.phipsi"
        with open(phipsi_name, 'w') as outfile:
            PB.write_phipsi(outfile, dihedral_sets, comments)
        written.append(phipsi_name)

    for name in written:
        print("wrote {0}".format(name))
def test_count_to_transfac(self):
    """
    Check that count_to_transfac produces the reference transfac text.

    The output is compared with the reference line by line, after
    checking that both have the same number of lines.
    """
    # NOTE(review): the column spacing inside these fixture strings is
    # reproduced as seen; confirm it against the repository copy.
    count_lines = [
        " a b c d\n",  # header
        "1 0 0 0 0\n",
        "2 2 789 99999 89\n",  # one value is written on 5 characters
        "3 99999 8888 2 2\n",  # the first value is written on 5 characters
        "4 99 0 999999 0\n",  # one value is written on more than
                              # 5 characters
        "5 0 0 0 0\n",
    ]
    expected = (
        "ID identifier\n"
        "BF unknown\n"
        "P0 a b c d\n"
        "00001 0 0 0 0 X\n"
        "00002 2 789 99999 89 X\n"
        "00003 99999 8888 2 2 X\n"
        "00004 99 0 999999 0 X\n"
        "00005 0 0 0 0 X\n"
        "XX\n"
        "//"
    )

    produced = PB.count_to_transfac("identifier", count_lines)

    expected_lines = expected.split("\n")
    produced_lines = produced.split("\n")
    self.assertEqual(len(expected_lines), len(produced_lines),
                     "Not the right number of lines")
    for expected_line, produced_line in zip(expected_lines, produced_lines):
        # Both lines are printed so that a mismatch is easy to locate.
        print("ref:", expected_line)
        print("out:", produced_line)
        self.assertEqual(expected_line, produced_line)
# G. E. Crooks, G. Hon, J.-M. Chandonia, and S. E. Brenner. # 'WebLogo: A Sequence Logo Generator.' # Genome Research 14:1188–90 (2004) # doi:10.1101/gr.849004. # http://weblogo.threeplusone.com/ #------------------------------------------------------------------------------- if options.logo: # read count file #------------------------------------------------------------------------------- f_in = open(options.f, 'r') count_content = f_in.readlines() f_in.close() # convert a table of PB frequencies into transfac format as required by weblogo # http://meme.sdsc.edu/meme/doc/transfac-format.html transfac_content = PB.count_to_transfac(options.f, count_content) # write transfac file (debug only) #------------------------------------------------------------------------------- debug = False if debug: transfac_name = options.o + ".PB.transfac" f_out = open(transfac_name, 'w') f_out.write(transfac_content) f_out.close() print("wrote {0}".format(transfac_name)) # define output file name #------------------------------------------------------------------------------- logo_file_name = options.o + ".PB.logo.pdf" if options.residue_min or options.residue_max:
else: index_first_frame = options.first_frame-1 #------------------------------------------------------------------------------- # check input files #------------------------------------------------------------------------------- for name in options.f: if not os.path.isfile(name): sys.exit("%s does not appear to be a valid file.\nBye" % name) #------------------------------------------------------------------------------- # read PBs files #------------------------------------------------------------------------------- pb_seq = [] for name in options.f: pb_seq += PB.read_fasta(name)[1][index_first_frame:] #------------------------------------------------------------------------------- # check all sequences have the same size #------------------------------------------------------------------------------- pb_seq_size = len(pb_seq[0]) for seq in pb_seq: if len(seq) != pb_seq_size: sys.exit("cannot compute PB frequencies / different sequence lengths") #------------------------------------------------------------------------------- # count PBs at each position of the sequence #------------------------------------------------------------------------------- pb_count = numpy.zeros((pb_seq_size, len(PB.NAMES))) for seq in pb_seq:
# G. E. Crooks, G. Hon, J.-M. Chandonia, and S. E. Brenner. # 'WebLogo: A Sequence Logo Generator.' # Genome Research 14:1188–90 (2004) # doi:10.1101/gr.849004. # http://weblogo.threeplusone.com/ # ------------------------------------------------------------------------------- if options.logo: # read count file # ------------------------------------------------------------------------------- f_in = open(options.f, "r") count_content = f_in.readlines() f_in.close() # convert a table of PB frequencies into transfac format as required by weblogo # http://meme.sdsc.edu/meme/doc/transfac-format.html transfac_content = PB.count_to_transfac(options.f, count_content) # write transfac file (debug only) # ------------------------------------------------------------------------------- debug = False if debug: transfac_name = options.o + ".PB.transfac" f_out = open(transfac_name, "w") f_out.write(transfac_content) f_out.close() print("wrote {0}".format(transfac_name)) # define output file name # ------------------------------------------------------------------------------- logo_file_name = options.o + ".PB.logo.pdf" if options.residue_min or options.residue_max: