Example #1
0
def compare(header_lst, seq_lst, substitution_mat, fname):
    """
    Compare every sequence with the first one and save the result as fasta.

    This is the command line wrapper behind the --compare option. Each
    comparison is written as a sequence of digits, one digit per position,
    giving the distance between the PB of the target and the PB of the
    reference at that position. Digits are normalized in the [0; 9] range.

    The normalized substitution matrix, the name of the reference sequence
    and the name of the written file are printed on screen along the way.

    Parameters
    ----------
    header_lst: list of strings
        Sequence headers, in the same order as `seq_lst`. The first header
        is reported as the reference.
    seq_lst: list of strings
        Sequences, in the same order as `header_lst`.
    substitution_mat: numpy.array
        A substitution matrix expressed as similarity scores
    fname: str
        Name of the fasta file to write
    """
    reference_header = header_lst[0]
    # Rescale the matrix so every score can be written as a single digit.
    digit_matrix = PB.matrix_to_single_digit(substitution_mat)
    print("Normalized substitution matrix (between 0 and 9)")
    print(digit_matrix)
    print("Compare first sequence ({0}) with others".format(reference_header))
    with open(fname, 'w') as fasta_out:
        comparisons = PB.compare_to_first_sequence(header_lst, seq_lst,
                                                   digit_matrix)
        for target_header, scores in comparisons:
            # One character per score, concatenated into a fasta sequence.
            digits = "".join(map(str, scores))
            PB.write_fasta_entry(fasta_out, digits, target_header)
    print("wrote {0}".format(fname))
Example #2
0
def compare(header_lst, seq_lst, substitution_mat, fname):
    """
    Run the --compare mode: score all sequences against the first one.

    All the sequences are compared to the first one, and each comparison is
    written in a fasta file as a sequence of digits. A digit represents the
    distance between the PB in the target and the PB in the reference at the
    same position; digits are normalized in the [0; 9] range.

    Besides writing the file, the function prints the normalized matrix,
    the header of the reference sequence, and the output file name.

    Parameters
    ----------
    header_lst: list of strings
        The sequence headers, ordered as the sequences
    seq_lst: list of strings
        The sequences, ordered as the headers
    substitution_mat: numpy.array
        A substitution matrix expressed as similarity scores
    fname: str
        The output file name
    """
    first_header = header_lst[0]
    single_digit_mat = PB.matrix_to_single_digit(substitution_mat)

    # On-screen report before the comparison starts.
    print("Normalized substitution matrix (between 0 and 9)")
    print(single_digit_mat)
    print("Compare first sequence ({0}) with others".format(first_header))

    with open(fname, 'w') as out_handle:
        scored = PB.compare_to_first_sequence(header_lst, seq_lst,
                                              single_digit_mat)
        for entry_header, entry_scores in scored:
            as_text = "".join(map(str, entry_scores))
            PB.write_fasta_entry(out_handle, as_text, entry_header)

    print("wrote {0}".format(fname))
Example #3
0
 def test_read_fasta(self):
     """
     Check the headers and sequences PB.read_fasta returns for 1BTA.
     """
     expected_headers = ["test_data/1BTA.pdb | chain A"]
     # Single PB sequence, split over several literals for readability.
     expected_sequences = [
         "ZZdddfklonbfklmmmmmmmmnopafklnoiakl"
         "mmmmmnoopacddddddehkllmmmmngoilmmmm"
         "mmmmmmmmnopacdcddZZ"
     ]
     headers, sequences = PB.read_fasta("test_data/1BTA.pdb.PB.fasta")
     self.assertEqual(headers, expected_headers)
     self.assertEqual(sequences, expected_sequences)
Example #4
0
def pbclust_cli():
    """
    Entry point of the PBclust command line.

    The function reads the user options and the input fasta files, loads
    the substitution matrix, and then either:

    * runs the comparison of all sequences with the first one when the
      compare option is set, then exits with status 0; or
    * computes the distance matrix, writes it to "<output>.PB.dist",
      clusters the sequences, and writes the clusters to
      "<output>.PB.clust".

    The program exits with an error message if the substitution matrix
    cannot be loaded, if an unexpected PB is met while computing the
    distances, or if the clustering reports an error with R.
    """
    args = user_input()
    headers, sequences = PB.read_several_fasta(args.f)

    # The substitution matrix is needed by both modes.
    try:
        similarity_mat = PB.load_substitution_matrix(
            PB.SUBSTITUTION_MATRIX_NAME
        )
    except ValueError:
        sys.exit("Substitution matrix is not symetric.")
    except IOError:
        sys.exit("Error reading the substitution matrix.")

    # Compare mode: first sequence versus all the others, then stop.
    if args.compare:
        compare(headers, sequences, similarity_mat,
                args.o + ".PB.compare.fasta")
        sys.exit(0)

    # Clustering mode, step 1: pairwise distances.
    try:
        distances = PB.distance_matrix(sequences, similarity_mat)
    except PB.InvalidBlockError as err:
        sys.exit('Unexpected PB in the input ({})'.format(err.block))
    dist_path = args.o + ".PB.dist"
    write_distance_matrix(distances, dist_path)
    print("wrote {0}".format(dist_path))

    # Clustering mode, step 2: the clustering itself.
    try:
        cluster_id, medoid_id = PB.hclust(distances, nclusters=args.clusters)
    except PB.RError as err:
        sys.exit('Error with R:\n' + str(err))
    display_clust_report(cluster_id)
    clust_path = args.o + ".PB.clust"
    write_clusters(clust_path, cluster_id, medoid_id, headers)
    print("wrote {0}".format(clust_path))
Example #5
0
def pbclust_cli():
    """
    Run the PBclust command line.

    Steps, in order:

    1. read the user inputs and the fasta files they designate;
    2. load the substitution matrix (exit with a message on failure);
    3. if the compare option is set, compare all sequences with the first
       one, write "<output>.PB.compare.fasta", and exit with status 0;
    4. otherwise compute and write the distance matrix in
       "<output>.PB.dist" (exit with a message on an unexpected PB);
    5. cluster the sequences, display a report, and write the clusters in
       "<output>.PB.clust" (exit with a message on an error with R).
    """
    # 1. user inputs
    cli_options = user_input()
    header_lst, seq_lst = PB.read_several_fasta(cli_options.f)
    out_prefix = cli_options.o

    # 2. substitution matrix
    try:
        matrix = PB.load_substitution_matrix(PB.SUBSTITUTION_MATRIX_NAME)
    except ValueError:
        sys.exit("Substitution matrix is not symetric.")
    except IOError:
        sys.exit("Error reading the substitution matrix.")

    # 3. --compare option: first sequence of the input versus all others
    if cli_options.compare:
        compare(header_lst, seq_lst, matrix, out_prefix + ".PB.compare.fasta")
        sys.exit(0)

    # 4. distance matrix used by the clustering
    try:
        dist_mat = PB.distance_matrix(seq_lst, matrix)
    except PB.InvalidBlockError as error:
        sys.exit('Unexpected PB in the input ({})'.format(error.block))
    dist_file = out_prefix + ".PB.dist"
    write_distance_matrix(dist_mat, dist_file)
    print("wrote {0}".format(dist_file))

    # 5. clustering
    try:
        cluster_id, medoid_id = PB.hclust(
            dist_mat, nclusters=cli_options.clusters
        )
    except PB.RError as error:
        sys.exit('Error with R:\n' + str(error))
    display_clust_report(cluster_id)
    clust_file = out_prefix + ".PB.clust"
    write_clusters(clust_file, cluster_id, medoid_id, header_lst)
    print("wrote {0}".format(clust_file))
Example #6
0
def pbassign_cli():
    """
    Entry point of the PBassign command line.

    Chains are read either from the PDB files given by the user (option
    ``p``) or from a trajectory (options ``x`` and ``g``). For each chain
    the phi/psi dihedral angles are extracted and a PB sequence is
    assigned. The sequences are always written in "<output>.PB.fasta";
    "<output>.PB.flat" and "<output>.PB.phipsi" are written as well when
    the ``flat`` and ``phipsi`` options are set. The name of each written
    file is printed once all files are written.

    Returns early, writing nothing, when the PDB mode is selected but no
    PDB file is given.
    """
    options, pdb_name_lst = user_inputs()

    if options.p:
        if not pdb_name_lst:
            print('Nothing to do. Good bye.')
            return
        print("{} PDB file(s) to process".format(len(pdb_name_lst)))
        # PB assignment of PDB structures
        chains = PDB.chains_from_files(pdb_name_lst)
    else:
        # PB assignment of a Gromacs trajectory
        chains = PDB.chains_from_trajectory(options.x, options.g)

    # Three parallel lists, one entry per chain.
    comments, sequences, dihedral_sets = [], [], []
    for comment, chain in chains:
        angles = chain.get_phi_psi_angles()
        pb_sequence = PB.assign(angles)
        comments.append(comment)
        dihedral_sets.append(angles)
        sequences.append(pb_sequence)

    # File names are collected so that they are reported only after every
    # requested file has been written.
    written = []

    fasta_name = options.o + ".PB.fasta"
    with open(fasta_name, 'w') as handle:
        PB.write_fasta(handle, sequences, comments)
    written.append(fasta_name)

    if options.flat:
        flat_name = options.o + ".PB.flat"
        with open(flat_name, 'w') as handle:
            PB.write_flat(handle, sequences)
        written.append(flat_name)

    if options.phipsi:
        phipsi_name = options.o + ".PB.phipsi"
        with open(phipsi_name, 'w') as handle:
            PB.write_phipsi(handle, dihedral_sets, comments)
        written.append(phipsi_name)

    for name in written:
        print("wrote {0}".format(name))
Example #7
0
def pbassign_cli():
    """
    Run the PBassign command line.

    The structures come from PDB files when the ``p`` option is set, and
    from a trajectory (``x`` and ``g`` options) otherwise. Every chain gets
    its phi/psi angles computed and a PB sequence assigned from them.

    Outputs, all prefixed by the ``o`` option:

    * ".PB.fasta": the PB sequences, always written;
    * ".PB.flat": written only with the ``flat`` option;
    * ".PB.phipsi": the dihedral angles, written only with the ``phipsi``
      option.

    Nothing is written when the PDB mode is selected with an empty list of
    PDB files; the function prints a message and returns.
    """
    options, pdb_name_lst = user_inputs()

    # Select the source of the chains.
    if not options.p:
        # PB assignment of a Gromacs trajectory
        chains = PDB.chains_from_trajectory(options.x, options.g)
    elif pdb_name_lst:
        print("{} PDB file(s) to process".format(len(pdb_name_lst)))
        # PB assignment of PDB structures
        chains = PDB.chains_from_files(pdb_name_lst)
    else:
        print('Nothing to do. Good bye.')
        return

    # Assign the PBs chain by chain, keeping comments, angles and
    # sequences aligned by index.
    all_comments = []
    all_dihedrals = []
    all_sequences = []
    for chain_comment, chain in chains:
        phi_psi = chain.get_phi_psi_angles()
        all_comments.append(chain_comment)
        all_dihedrals.append(phi_psi)
        all_sequences.append(PB.assign(phi_psi))

    # Write the requested files; report them once everything is written.
    report = []

    fasta_name = options.o + ".PB.fasta"
    with open(fasta_name, 'w') as fasta_file:
        PB.write_fasta(fasta_file, all_sequences, all_comments)
    report.append(fasta_name)

    if options.flat:
        flat_name = options.o + ".PB.flat"
        with open(flat_name, 'w') as flat_file:
            PB.write_flat(flat_file, all_sequences)
        report.append(flat_name)

    if options.phipsi:
        phipsi_name = options.o + ".PB.phipsi"
        with open(phipsi_name, 'w') as phipsi_file:
            PB.write_phipsi(phipsi_file, all_dihedrals, all_comments)
        report.append(phipsi_name)

    for written_name in report:
        print("wrote {0}".format(written_name))
Example #8
0
 def test_count_to_transfac(self):
     """
     Check PB.count_to_transfac against a reference output, line by line.
     """
     count_lines = [
         # header
         "         a     b     c     d\n",
         "1        0     0     0     0\n",
         # one value is written on 5 characters
         "2        2   789 99999    89\n",
         # the first value is written on 5 characters
         "3    99999  8888     2     2\n",
         # one value is written on more than 5 characters
         "4       99     0 999999     0\n",
         "5        0     0     0     0\n",
     ]
     # Reference output, one item per line; the last line carries no
     # trailing newline.
     expected_lines = [
         "ID identifier",
         "BF unknown",
         "P0       a     b     c     d",
         "00001     0     0     0     0    X",
         "00002     2   789 99999    89    X",
         "00003 99999  8888     2     2    X",
         "00004    99     0 999999     0    X",
         "00005     0     0     0     0    X",
         "XX",
         "//",
     ]
     produced = PB.count_to_transfac("identifier", count_lines)
     produced_lines = produced.split("\n")
     self.assertEqual(
         len(expected_lines),
         len(produced_lines),
         "Not the right number of lines",
     )
     # Both lines are printed before each assertion, as an aid when
     # reading the test output.
     for expected, actual in zip(expected_lines, produced_lines):
         print("ref:", expected)
         print("out:", actual)
         self.assertEqual(expected, actual)
Example #9
0
# G. E. Crooks, G. Hon, J.-M. Chandonia, and S. E. Brenner.
# 'WebLogo: A Sequence Logo Generator.'
# Genome Research 14:1188–90 (2004)
# doi:10.1101/gr.849004.
# http://weblogo.threeplusone.com/
#-------------------------------------------------------------------------------
if options.logo:
    # read count file
    #-------------------------------------------------------------------------------
    f_in = open(options.f, 'r')
    count_content = f_in.readlines()
    f_in.close()

    # convert a table of PB frequencies into transfac format as required by weblogo
    # http://meme.sdsc.edu/meme/doc/transfac-format.html
    transfac_content = PB.count_to_transfac(options.f, count_content)

    # write transfac file (debug only)
    #-------------------------------------------------------------------------------
    debug = False
    if debug:
        transfac_name = options.o + ".PB.transfac"
        f_out = open(transfac_name, 'w')
        f_out.write(transfac_content)
        f_out.close()
        print("wrote {0}".format(transfac_name))

    # define output file name
    #-------------------------------------------------------------------------------
    logo_file_name = options.o + ".PB.logo.pdf"
    if options.residue_min or options.residue_max:
Example #10
0
else:
    index_first_frame = options.first_frame-1

#-------------------------------------------------------------------------------
# check input files
#-------------------------------------------------------------------------------
# Stop at the first input name that is not an existing regular file.
for name in options.f:
    if not os.path.isfile(name):
        sys.exit("%s does not appear to be a valid file.\nBye" % name)

#-------------------------------------------------------------------------------
# read PBs files
#-------------------------------------------------------------------------------
# Only element [1] of what PB.read_fasta returns (the sequences) is kept, and
# for each file the sequences located before index_first_frame are dropped.
pb_seq = []
for name in options.f:
    pb_seq.extend(PB.read_fasta(name)[1][index_first_frame:])

# Without any sequence there is no reference length to compare with, and
# pb_seq[0] below would fail with an IndexError traceback.
if not pb_seq:
    sys.exit("cannot compute PB frequencies / no sequence to process")

#-------------------------------------------------------------------------------
# check all sequences have the same size
#-------------------------------------------------------------------------------
# The first sequence sets the reference length.
pb_seq_size = len(pb_seq[0])
if any(len(seq) != pb_seq_size for seq in pb_seq):
    sys.exit("cannot compute PB frequencies / different sequence lengths")

#-------------------------------------------------------------------------------
# count PBs at each position of the sequence
#-------------------------------------------------------------------------------
pb_count = numpy.zeros((pb_seq_size, len(PB.NAMES)))

for seq in pb_seq:
Example #11
0
# G. E. Crooks, G. Hon, J.-M. Chandonia, and S. E. Brenner.
# 'WebLogo: A Sequence Logo Generator.'
# Genome Research 14:1188–90 (2004)
# doi:10.1101/gr.849004.
# http://weblogo.threeplusone.com/
# -------------------------------------------------------------------------------
if options.logo:
    # read count file
    # -------------------------------------------------------------------------------
    f_in = open(options.f, "r")
    count_content = f_in.readlines()
    f_in.close()

    # convert a table of PB frequencies into transfac format as required by weblogo
    # http://meme.sdsc.edu/meme/doc/transfac-format.html
    transfac_content = PB.count_to_transfac(options.f, count_content)

    # write transfac file (debug only)
    # -------------------------------------------------------------------------------
    debug = False
    if debug:
        transfac_name = options.o + ".PB.transfac"
        f_out = open(transfac_name, "w")
        f_out.write(transfac_content)
        f_out.close()
        print("wrote {0}".format(transfac_name))

    # define output file name
    # -------------------------------------------------------------------------------
    logo_file_name = options.o + ".PB.logo.pdf"
    if options.residue_min or options.residue_max: