def build(seq_path, output):
    """
    Run paths.FASTA_2_MARKOV_EXEC in ``-protein`` mode on a sequence file.

    The sequence file is fed to the executable on stdin and its stdout is
    redirected to ``output``. A pre-existing ``output`` file is deleted
    before the run.

    :param seq_path: path of the input sequence file.
    :param output: path the executable's stdout is written to.
    :raises subprocess.CalledProcessError: if the shell command exits with
        a non-zero status.
    """
    generic.quit_if_missing(seq_path)
    generic.warn_if_exist(output)
    if os.path.isfile(output):
        os.remove(output)
    command = f"{paths.FASTA_2_MARKOV_EXEC} -protein < {seq_path} > {output}"
    # The shell creates `output` while setting up the redirection, even when
    # the executable itself fails, so the existence check below cannot detect
    # a failed run on its own. check=True makes such a failure visible.
    # NOTE(review): the paths are interpolated into the shell command
    # unquoted, so paths containing spaces or shell metacharacters will break.
    subprocess.run(command, shell=True, check=True)
    generic.quit_if_missing(output)

    # with open(seq_path, 'r') as file:
    #     lines = file.readlines()
    # counter_obj = Counter()
    # for line in lines:
    #     if line.startswith(">"):
    #         pass
    #     counter_obj.update(line.strip())
    # total_count = sum(counter_obj.values())
    # alphabets = sorted(generic.AA3_to_AA1.values())
    # print(counter_obj)
    # with open(output, 'w') as file:
    #     for letter in alphabets:
    #         percentage = "%.5f" % (counter_obj[letter] / total_count)
    #         file.write(percentage + " ")


# from tests.src import paths_test
# build(paths_test.UNIPROT_SEQ, "./composition.txt")
Esempio n. 2
0
def main(input_path, output_path):
    """
    Compute descriptors from a pickled motif-position map and pickle them.

    Input: motif_pos_pkl
    output: descr_file.pkl

    The directory <paths.ROOT>/data/store is deleted (if present) and
    re-created empty before the calculation starts.

    :param input_path: path of the pickle whose content is passed to
        descr_main.calculate().
    :param output_path: path the pickled result of descr_main.calculate()
        is written to.
    """

    # todo: if pdb_file is empty (0 bytes), for some reason load_pdb_data
    #  does not throw exception
    logs.set_logging_level()

    # Always start from an empty store directory.
    store_dir = os.path.join(paths.ROOT, 'data', 'store')
    if os.path.isdir(store_dir):
        logging.warning("Store dir exists, deleting.")
        shutil.rmtree(store_dir)
    os.mkdir(store_dir)

    # NOTE: unpickling can execute arbitrary code; only pass trusted files.
    with open(input_path, 'rb') as file:
        motif_pos_map = pickle.load(file)

    timecheck = time()
    descrs = descr_main.calculate(motif_pos_map)
    # Measure once so the printed and the logged duration are the same value.
    elapsed = time() - timecheck
    print(f"Time taken: {elapsed}")
    logging.debug(f"Time taken: {elapsed}")

    # Warn about the file that is actually about to be overwritten.
    generic.warn_if_exist(output_path)
    # Switching back to pkl to avoid false float comparison failures.
    with open(output_path, "wb") as file:
        # Protocol -1 selects the highest pickle protocol available.
        pickle.dump(descrs, file, -1)
Esempio n. 3
0
def find_motifs_mast(pname_cid_path,
                     seq_file,
                     ref_meme_txt,
                     motif_len,
                     output,
                     meme_folder=paths.MEME_MAST_FOLDER):
    """
    Load the pickled pname/cid map, hand it to motif_finder.find_mast()
    and pickle whatever that call returns.

    :param pname_cid_path: paths.PNAME_CID
    :param seq_file: paths.FULL_SEQS
    :param ref_meme_txt: paths.REF_MEME_TXT
    :param motif_len: 13
    :param output: paths.MOTIF_POS
    :param meme_folder: forwarded to motif_finder.find_mast() as-is
    """
    assert motif_len >= 1
    assert isinstance(motif_len, int)

    generic.quit_if_missing(pname_cid_path)
    with open(pname_cid_path, 'rb') as pkl_in:
        cid_by_pname = pickle.load(pkl_in)
    _test_seq_cid_map(cid_by_pname)

    found_positions = motif_finder.find_mast(
        cid_by_pname, seq_file, ref_meme_txt, motif_len,
        meme_folder=meme_folder)

    generic.warn_if_exist(output)
    with open(output, 'wb') as pkl_out:
        pickle.dump(found_positions, pkl_out, pickle.HIGHEST_PROTOCOL)
Esempio n. 4
0
    def setUp(self):
        """Bind the reference paths and two temp-file paths to the fixture."""
        # Reference files, all taken from paths_test.
        (self.meme_full,
         self.meme_minimal,
         self.ref_matrix,
         self.ref_composition) = (paths_test.ORIG_MEME_FOR_CONVERT,
                                  paths_test.REF_MEME_FROM_CONV,
                                  paths_test.REF_CONV_MATRIX,
                                  paths_test.REF_CONV_COMPOSITION)

        # Two temp paths built from the shared template (indices 1 and 2).
        self.tmp_1, self.tmp_2 = (
            paths_test.TMP_FILE_TEMPLATE.format(idx) for idx in (1, 2))

        for tmp_path in (self.tmp_1, self.tmp_2):
            generic.warn_if_exist(tmp_path)
Esempio n. 5
0
def _write_matrix_file(matrix_ordered, output):
    """
    For meme_suite matrix2meme

    Write matrix_ordered as plain text: one row per line, entries separated
    by a single space, no newline after the last row.
    """
    rows_as_text = [" ".join(map(str, row)) for row in matrix_ordered]
    file_body = "\n".join(rows_as_text)
    generic.warn_if_exist(output)
    with open(output, 'w') as handle:
        handle.write(file_body)
Esempio n. 6
0
def parse_extract_ioncom(input_file, pname_cid_path):
    """
    Parse input_file with extract_parser.parse_ioncom(), check the result
    with _test_seq_cid_map() and pickle it to pname_cid_path.

    :param input_file: paths.IONCOM_EXTRACT
    :param pname_cid_path: paths.PNAME_CID
    """
    generic.quit_if_missing(input_file)

    parsed_map = extract_parser.parse_ioncom(input_file)
    _test_seq_cid_map(parsed_map)

    generic.warn_if_exist(pname_cid_path)
    with open(pname_cid_path, 'wb') as pkl_out:
        pickle.dump(parsed_map, pkl_out, pickle.HIGHEST_PROTOCOL)
Esempio n. 7
0
def run_mast(meme_txt, fasta_filename, mast_output, mast_exec=paths.MAST_EXEC):
    """
    Run the mast executable on a sequence file.

    The command line is
    ``<mast_exec> -oc <mast_output> -mt 0.0001 <meme_txt> <fasta_filename>``
    and is executed through the shell.

    :param meme_txt: path of the meme file given to mast; must exist.
    :param fasta_filename: path of the sequence file; must exist.
    :param mast_output: folder passed to mast via ``-oc``.
    :param mast_exec: mast executable to invoke.
    :raises Exception: if the command exits with a non-zero status; the
        message carries the exit code and the full command.
    """
    generic.quit_if_missing(meme_txt)
    generic.quit_if_missing(fasta_filename)
    generic.warn_if_exist(mast_output, filetype='folder')
    # NOTE(review): arguments are interpolated into a shell string unquoted,
    # so paths containing whitespace or shell metacharacters will break.
    command = f"{mast_exec} -oc {mast_output} -mt 0.0001 {meme_txt} " \
        f"{fasta_filename}"
    return_code = subprocess.run(command, shell=True).returncode
    if return_code != 0:
        logging.error("run_mast() failed.")
        logging.error(f"Command: <{command}>")
        # Same exception type as before, but now self-describing for callers
        # that do not have the log at hand.
        raise Exception(
            f"run_mast() failed with exit code {return_code}: <{command}>")
def download_no_convert(acc_list, output):
    """
    Fetch ``https://www.uniprot.org/uniprot/<acc>.fasta`` for every entry of
    acc_list and append the decoded text to a single output file.

    Downloading is best-effort: an accession whose download or UTF-8
    decoding fails is logged and skipped. Errors while writing the output
    file are not suppressed.

    :param acc_list: iterable of accession ids, interpolated into the URL.
    :param output: path of the file to write (opened in text mode, truncated).
    """
    # Local import so this block does not rely on a module-level import.
    import logging

    generic.warn_if_exist(output)
    with open(output, 'w') as file:
        for acc in acc_list:
            url = f"https://www.uniprot.org/uniprot/{acc}.fasta"
            try:
                with contextlib.closing(request.urlopen(url)) as contents:
                    fasta_text = contents.read().decode("utf-8")
            except Exception as err:
                # `Exception` (not a bare except) keeps the skip-on-failure
                # behaviour while letting KeyboardInterrupt/SystemExit through.
                logging.warning("Skipping %s, download failed: %s", acc, err)
                continue
            file.write(fasta_text)
Esempio n. 9
0
def parse_extract_prosite(input_file, pname_cid_path):
    """
    Parse input_file with extract_parser.parse_prosite() (together with
    prosite_pdb_list.pdb_list), check the result with _test_seq_cid_map()
    and pickle it to pname_cid_path.

    :param input_file: paths.PROSITE_EXTRACT
    :param pname_cid_path: paths.PNAME_CID
    """
    generic.quit_if_missing(input_file)

    known_pdbs = prosite_pdb_list.pdb_list
    parsed_map = extract_parser.parse_prosite(input_file, known_pdbs)
    _test_seq_cid_map(parsed_map)

    generic.warn_if_exist(pname_cid_path)
    with open(pname_cid_path, 'wb') as pkl_out:
        pickle.dump(parsed_map, pkl_out, pickle.HIGHEST_PROTOCOL)
def download(pdb_list, output):
    """
    Convert PDB ids to accessions and download each accession's FASTA text.

    pdb_list is converted with
    ``uniprot_id_converter.convert("PDB_ID", "ACC", pdb_list)``; for every
    distinct value of the resulting map,
    ``https://www.uniprot.org/uniprot/<acc>.fasta`` is fetched and the
    decoded text appended to a single output file.

    Downloading is best-effort: an accession whose download or UTF-8
    decoding fails is logged and skipped. Errors while writing the output
    file are not suppressed.

    :param pdb_list: PDB ids handed to the id converter.
    :param output: path of the file to write (opened in text mode, truncated).
    """
    # Local import so this block does not rely on a module-level import.
    import logging

    pdb_acc_map = uniprot_id_converter.convert("PDB_ID", "ACC", pdb_list)
    generic.warn_if_exist(output)
    with open(output, 'w') as file:
        # De-duplicate: several PDB ids may share one accession.
        acc_unique = set(pdb_acc_map.values())
        for acc in acc_unique:
            url = f"https://www.uniprot.org/uniprot/{acc}.fasta"
            try:
                with contextlib.closing(request.urlopen(url)) as contents:
                    fasta_text = contents.read().decode("utf-8")
            except Exception as err:
                # `Exception` (not a bare except) keeps the skip-on-failure
                # behaviour while letting KeyboardInterrupt/SystemExit through.
                logging.warning("Skipping %s, download failed: %s", acc, err)
                continue
            file.write(fasta_text)
Esempio n. 11
0
def run_meme_single(fasta_filename,
                    motif_len,
                    meme_output,
                    num_p=1,
                    meme_exec=paths.MEME_EXEC):
    """
    Run the meme executable to find a single protein motif.

    The command line is
    ``<meme_exec> -w <motif_len> -p <num_p> -protein -nmotifs 1 -mod oops
    -oc <meme_output> <fasta_filename>`` and is executed through the shell.

    :param fasta_filename: path of the sequence file; must exist.
    :param motif_len: value for ``-w``; must be an int >= 1.
    :param meme_output: folder passed to meme via ``-oc``.
    :param num_p: value for ``-p``.
    :param meme_exec: meme executable to invoke.
    :raises Exception: if the command exits with a non-zero status; the
        message carries the exit code and the full command.
    """
    assert motif_len >= 1
    assert isinstance(motif_len, int)
    generic.quit_if_missing(fasta_filename)
    generic.warn_if_exist(meme_output, filetype='folder')
    # NOTE(review): arguments are interpolated into a shell string unquoted,
    # so paths containing whitespace or shell metacharacters will break.
    command = f"{meme_exec} -w {motif_len} -p {num_p} -protein -nmotifs 1 " \
              f"-mod oops -oc {meme_output} {fasta_filename}"
    return_code = subprocess.run(command, shell=True).returncode
    if return_code != 0:
        logging.error("run_meme_single() failed.")
        logging.error(f"Command: <{command}>")
        # Same exception type as before, but now self-describing for callers
        # that do not have the log at hand.
        raise Exception(
            f"run_meme_single() failed with exit code {return_code}: "
            f"<{command}>")
Esempio n. 12
0
def run_prosite_mast(extract_path,
                     motif_len,
                     ref_meme_txt,
                     output,
                     pdb_folder=paths.PDB_FOLDER,
                     storage_path=None):
    """
    Run the extract -> pdb download -> sequence -> mast pipeline.

    Intermediate files live under storage_path when it is given; otherwise
    the default locations from ``paths`` are used and the intermediates are
    moved to paths.TRASH once the run is done.

    :param extract_path: paths.IONCOM_EXTRACT (NOTE(review): it is parsed
        with parse_extract_prosite, so this may be meant to be the prosite
        extract - confirm)
    :param motif_len: 13
    :param ref_meme_txt: paths.REF_MEME_TXT
    :param output: paths.PID_PDB_MAP
    :param pdb_folder: folder handed to the pdb download/trim/sequence steps
    :param storage_path: optional existing folder for intermediate files
    :return: None
    """
    generic.quit_if_missing(extract_path)
    generic.quit_if_missing(ref_meme_txt)
    generic.warn_if_exist(output)

    keep_intermediates = storage_path is not None
    if keep_intermediates:
        generic.quit_if_missing(storage_path, filetype='folder')
        pname_cid_path, seq_path, meme_folder = (
            os.path.join(storage_path, leaf)
            for leaf in ('pname_cid_map.pkl', 'seqs.fasta', 'meme_folder'))
    else:
        pname_cid_path = paths.PNAME_CID
        seq_path = paths.FULL_SEQS
        meme_folder = paths.MEME_MAST_FOLDER

    # Each step consumes the files produced by the previous ones.
    parse_extract_prosite(extract_path, pname_cid_path)
    download_pdb(pname_cid_path, pdb_folder)
    trim_pnames_based_on_pdb(pname_cid_path, pdb_folder)
    create_seq(pname_cid_path, seq_path, pdb_folder)
    filter_seq_file(seq_path, threshold=31)
    find_motifs_mast(pname_cid_path, seq_path, ref_meme_txt, motif_len,
                     output, meme_folder)

    if not keep_intermediates:
        for leftover in (pname_cid_path, seq_path, meme_folder):
            shutil.move(leftover, paths.TRASH)
Esempio n. 13
0
def run_prosite_meme(extract_path,
                     motif_len,
                     output,
                     num_p=7,
                     pdb_folder=paths.PDB_FOLDER,
                     storage_path=None):
    """
    Run the extract -> pdb download -> sequence -> meme pipeline.

    Intermediate files live under storage_path when it is given; otherwise
    the default locations from ``paths`` are used and the intermediates are
    moved to paths.TRASH once the run is done.

    :param extract_path: paths.PROSITE_EXTRACT
    :param motif_len: 13
    :param output: paths.PID_PDB_MAP
    :param num_p: forwarded to find_motifs_meme; must be an int >= 1
    :param pdb_folder: folder handed to the pdb download/trim/sequence steps
    :param storage_path: optional existing folder for intermediate files
    """
    generic.quit_if_missing(extract_path)
    generic.warn_if_exist(output)
    assert isinstance(num_p, int)
    assert num_p >= 1

    keep_intermediates = storage_path is not None
    if keep_intermediates:
        generic.quit_if_missing(storage_path, filetype='folder')
        pname_cid_path, seq_path, meme_folder = (
            os.path.join(storage_path, leaf)
            for leaf in ('pname_cid_map.pkl', 'seqs.fasta', 'meme_folder'))
    else:
        pname_cid_path = paths.PNAME_CID
        seq_path = paths.FULL_SEQS
        meme_folder = paths.MEME_MAST_FOLDER

    # Each step consumes the files produced by the previous ones.
    parse_extract_prosite(extract_path, pname_cid_path)
    download_pdb(pname_cid_path, pdb_folder)
    trim_pnames_based_on_pdb(pname_cid_path, pdb_folder)
    create_seq(pname_cid_path, seq_path, pdb_folder)
    filter_seq_file(seq_path, threshold=31)
    find_motifs_meme(pname_cid_path, seq_path, motif_len, output,
                     meme_folder, num_p)

    if not keep_intermediates:
        for leftover in (pname_cid_path, seq_path, meme_folder):
            shutil.move(leftover, paths.TRASH)
Esempio n. 14
0
def build_descr(input_path, output_path):
    """
    Compute descriptors from a pickled motif-position map and pickle them.

    The directory <paths.ROOT>/data/store is deleted (if present) and
    re-created empty before the calculation starts.

    :param input_path: path of the pickle whose content is passed to
        descr_main.calculate().
    :param output_path: path the pickled result of descr_main.calculate()
        is written to.
    """
    logs.set_logging_level()

    # Always start from an empty store directory.
    store_dir = os.path.join(paths.ROOT, 'data', 'store')
    if os.path.isdir(store_dir):
        logging.warning("Store dir exists, deleting.")
        shutil.rmtree(store_dir)
    os.mkdir(store_dir)

    # NOTE: unpickling can execute arbitrary code; only pass trusted files.
    with open(input_path, 'rb') as file:
        motif_pos_map = pickle.load(file)

    timecheck = time()
    descrs = descr_main.calculate(motif_pos_map)
    # Measure once so the printed and the logged duration are the same value.
    elapsed = time() - timecheck
    print(f"Time taken: {elapsed}")
    logging.debug(f"Time taken: {elapsed}")

    # Warn about the file that is actually about to be overwritten.
    generic.warn_if_exist(output_path)
    # Switching back to pkl to avoid false float comparison failures.
    with open(output_path, "wb") as file:
        # Protocol -1 selects the highest pickle protocol available.
        pickle.dump(descrs, file, -1)
Esempio n. 15
0
 def setUp(self):
     """Bind the reference paths and one temp-file path to the fixture."""
     # Reference files, all taken from paths_test.
     (self.matrix,
      self.composition,
      self.ref_meme) = (paths_test.REF_CONV_MATRIX,
                        paths_test.REF_CONV_COMPOSITION,
                        paths_test.REF_MEME_FROM_CONV)

     # Temp path built from the shared template (index 1).
     tmp_path = paths_test.TMP_FILE_TEMPLATE.format(1)
     self.tmp_1 = tmp_path
     generic.warn_if_exist(tmp_path)
Esempio n. 16
0
def run_prosite_aligned(seq_file, aligned_seq_file, output, storage_path=None):
    """
    Build a meme file from aligned sequences, run mast with it on the
    sequences that have a pdb+cid, and pickle the resulting motif positions
    keyed by pdb id.

    We start off with a seq_file, aligned_seq_file.

    We derive matrix from aligned_seq_file
    We derive composition from seq_file
    We put both together into a meme.txt

    We take seq_file, and extract from it the relevant .pdb files and the
    corresponding cid. So acc+seq => pdb+cid

    We therefore select the seq files for which a corresponding .pdb+cid
    exists, and output it into a cropped_seqfile

    We run mast on this cropped_seqfile, to obtain motif_pos.

    Finally we output this motif_pos using the pdb+cid from before.

    Desired intermediate files:
    meme.txt from aligned_seq
    acc=>pdb+cid map
    cropped_seqfile.fasta
    acc=>motif_pos
    pdb+cid=>motif_pos

    :param seq_file: path of the sequence file; must exist.
    :param aligned_seq_file: path of the aligned sequence file; must exist.
    :param output: path the pickled result is written to. The pickled object
        is a defaultdict mapping pdb id to a dict with the keys
        'sno_markers' and 'cid'.
    :param storage_path: optional existing folder for the intermediate
        files. When None, temporary locations from ``paths`` are used and
        the intermediates are moved to paths.TRASH at the end.
    """
    generic.quit_if_missing(seq_file)
    generic.quit_if_missing(aligned_seq_file)
    generic.warn_if_exist(output)
    # Decide where the intermediate files live.
    if storage_path is None:
        composition_file = paths.TMP_FILE_TEMPLATE.format('composition.txt')
        meme_txt = paths.TMP_FILE_TEMPLATE.format('meme_from_aligned.txt')
        meme_folder = paths.MEME_MAST_FOLDER
        cropped_seq_file = paths.TMP_FILE_TEMPLATE.format('cropped_seqs.fasta')
    else:
        generic.quit_if_missing(storage_path, filetype='folder')
        composition_file = os.path.join(storage_path, 'composition.txt')
        meme_txt = os.path.join(storage_path, 'meme_from_aligned.txt')
        meme_folder = os.path.join(storage_path, 'meme_mast_folder')
        cropped_seq_file = os.path.join(storage_path, 'cropped_seqs.fasta')
    # Drop a stale composition file before rebuilding it.
    if os.path.isfile(composition_file):
        os.remove(composition_file)
    # composition from seq_file, then meme.txt from aligned seqs + composition
    build_composition.build(seq_file, composition_file)
    build_meme_from_aligned.build(aligned_seq_file, meme_txt, composition_file)

    # presumably acc id -> sequence, going by how the map is used below
    acc_seq_map = get_pname_seq.parse_raw(seq_file)
    acc_ids = list(acc_seq_map.keys())
    acc_pdb_map = uniprot_id_converter.convert("ACC", "PDB_ID", acc_ids)

    pdb_seq_map = dict()
    # because pdb => acc mapping may not be 1-1, we retain the original maps
    mapped_pdb_acc = dict()
    # Only accessions that got a pdb id from the converter are kept.
    # NOTE(review): if several accessions map to the same pdb id, the last
    # one iterated overwrites the earlier entries in both dicts.
    for acc_id, seq in acc_seq_map.items():
        if acc_id in acc_pdb_map:
            pdb_id = acc_pdb_map[acc_id]
            mapped_pdb_acc[pdb_id] = acc_id
            pdb_seq_map[pdb_id] = seq

    pdb_cid_map = find_cid_from_pname.find(pdb_seq_map)
    # Re-key by accession: acc -> (pdb, cid), for pdb ids that got a cid.
    acc_pdb_cid_map = {
        mapped_pdb_acc[pdb]: (pdb, cid)
        for pdb, cid in pdb_cid_map.items()
    }
    cropped_acc_list = list(acc_pdb_cid_map.keys())

    # Write the cropped sequence file and run mast on it.
    keep_only_acc(cropped_acc_list, seq_file, cropped_seq_file)
    meme_interface.run_mast(meme_txt, cropped_seq_file, meme_folder)
    mast_txt_path = os.path.join(meme_folder, 'mast.txt')
    # NOTE(review): 14 is hard-coded here; presumably the motif length -
    # confirm against extract_motifs_mast_uniprot.
    acc_motif_map = meme_interface.extract_motifs_mast_uniprot(
        mast_txt_path, 14)
    acc_motif_map = motif_finder._delete_gapped_motifs_uniprot(
        acc_motif_map, cropped_seq_file)
    # Translate the accession-keyed motif positions back to pdb ids.
    pdb_motif_pos = defaultdict(dict)
    for acc, motif_pos in acc_motif_map.items():
        # Raises KeyError if mast reports an accession that is not in
        # acc_pdb_cid_map.
        pdb_id, cid = acc_pdb_cid_map[acc]
        pdb_motif_pos[pdb_id]['sno_markers'] = motif_pos
        pdb_motif_pos[pdb_id]['cid'] = cid
    with open(output, 'wb') as file:
        # Protocol -1 selects the highest pickle protocol available.
        pickle.dump(pdb_motif_pos, file, -1)
    # Without a storage folder the intermediates are moved to the trash.
    if storage_path is None:
        shutil.move(composition_file, paths.TRASH)
        shutil.move(meme_txt, paths.TRASH)
        shutil.move(meme_folder, paths.TRASH)
        shutil.move(cropped_seq_file, paths.TRASH)
    return