def parse(seq_path):
    """
    Parse a sequence file, choosing the parser from its first header line.

    The first line starting with ">" decides the format: when the text
    right after ">" begins with "UniRef" the file is handed to
    parse_uniref(), otherwise to parse_uniprot(). No other formats are
    distinguished.

    :param seq_path: path to the sequence file
    :return: whatever the selected parser returns
    :raises Exception: if the file contains no line starting with ">"
    """
    generic.quit_if_missing(seq_path)
    with open(seq_path) as handle:
        # Only the first header matters; stop reading as soon as it is found.
        first_header = next(
            (row for row in handle if row.startswith(">")), None)
    if first_header is None:
        logging.error(f"Input Seq-file in {seq_path} is invalid, no header "
                      f"lines with > found.")
        raise Exception
    if first_header.startswith(">UniRef"):
        return parse_uniref(seq_path)
    return parse_uniprot(seq_path)
Esempio n. 2
0
def find_motifs_mast(pname_cid_path,
                     seq_file,
                     ref_meme_txt,
                     motif_len,
                     output,
                     meme_folder=paths.MEME_MAST_FOLDER):
    """
    Load the pickled map at pname_cid_path, pass it to
    motif_finder.find_mast() and pickle the result into output.

    :param pname_cid_path: pickle file to read (e.g. paths.PNAME_CID)
    :param seq_file: sequence file forwarded to find_mast (e.g.
        paths.FULL_SEQS)
    :param ref_meme_txt: reference meme.txt forwarded to find_mast (e.g.
        paths.REF_MEME_TXT)
    :param motif_len: int >= 1 (e.g. 13)
    :param output: pickle file to write (e.g. paths.MOTIF_POS)
    :param meme_folder: forwarded to find_mast as keyword argument
    """
    assert motif_len >= 1
    assert isinstance(motif_len, int)
    generic.quit_if_missing(pname_cid_path)
    with open(pname_cid_path, 'rb') as pkl_in:
        cid_by_pname = pickle.load(pkl_in)
    _test_seq_cid_map(cid_by_pname)
    found_positions = motif_finder.find_mast(
        cid_by_pname, seq_file, ref_meme_txt, motif_len,
        meme_folder=meme_folder)
    generic.warn_if_exist(output)
    with open(output, 'wb') as pkl_out:
        pickle.dump(found_positions, pkl_out,
                    protocol=pickle.HIGHEST_PROTOCOL)
def build(seq_path, output):
    """
    Run the executable at paths.FASTA_2_MARKOV_EXEC (with -protein) on
    seq_path and write what it prints to output.

    :param seq_path: sequence file, fed to the executable through stdin
    :param output: destination file; an existing file is deleted first
    :raises Exception: if the external command exits with a non-zero code
    """
    generic.quit_if_missing(seq_path)
    generic.warn_if_exist(output)
    if os.path.isfile(output):
        os.remove(output)
    command = f"{paths.FASTA_2_MARKOV_EXEC} -protein < {seq_path} > {output}"
    return_code = subprocess.run(command, shell=True).returncode
    if return_code != 0:
        # The shell redirection creates `output` even when the tool fails,
        # so the existence check below cannot detect a failed run by itself.
        raise Exception(f"build() failed. Command: <{command}>")
    generic.quit_if_missing(output)

    # with open(seq_path, 'r') as file:
    #     lines = file.readlines()
    # counter_obj = Counter()
    # for line in lines:
    #     if line.startswith(">"):
    #         pass
    #     counter_obj.update(line.strip())
    # total_count = sum(counter_obj.values())
    # alphabets = sorted(generic.AA3_to_AA1.values())
    # print(counter_obj)
    # with open(output, 'w') as file:
    #     for letter in alphabets:
    #         percentage = "%.5f" % (counter_obj[letter] / total_count)
    #         file.write(percentage + " ")


# from tests.src import paths_test
# build(paths_test.UNIPROT_SEQ, "./composition.txt")
Esempio n. 4
0
def parse_extract_ioncom(input_file, pname_cid_path):
    """
    Parse input_file with extract_parser.parse_ioncom() and pickle the
    resulting map to pname_cid_path.

    :param input_file: file to parse (e.g. paths.IONCOM_EXTRACT)
    :param pname_cid_path: pickle file to write (e.g. paths.PNAME_CID)
    """
    generic.quit_if_missing(input_file)
    cid_by_pname = extract_parser.parse_ioncom(input_file)
    _test_seq_cid_map(cid_by_pname)
    generic.warn_if_exist(pname_cid_path)
    with open(pname_cid_path, 'wb') as pkl_out:
        pickle.dump(cid_by_pname, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)
Esempio n. 5
0
def run_mast(meme_txt, fasta_filename, mast_output, mast_exec=paths.MAST_EXEC):
    """
    Invoke the mast executable through the shell.

    :param meme_txt: motif file passed to mast
    :param fasta_filename: sequence file passed to mast
    :param mast_output: folder given to mast's -oc option
    :param mast_exec: executable to call
    :raises Exception: if the command exits with a non-zero return code
    """
    for required in (meme_txt, fasta_filename):
        generic.quit_if_missing(required)
    generic.warn_if_exist(mast_output, filetype='folder')
    command = (f"{mast_exec} -oc {mast_output} -mt 0.0001 {meme_txt} "
               f"{fasta_filename}")
    completed = subprocess.run(command, shell=True)
    if completed.returncode == 0:
        return
    logging.error("run_mast() failed.")
    logging.error(f"Command: <{command}>")
    raise Exception
Esempio n. 6
0
def trim_pnames_based_on_pdb(pname_cid_path, pdb_folder=paths.PDB_FOLDER):
    """
    Load the pickled map, run download_pdb_files.trim_pname_cid() on it
    and write the map back to the same file.

    :param pname_cid_path: pickle file read and then overwritten (e.g.
        paths.PNAME_CID)
    :param pdb_folder: folder forwarded to trim_pname_cid
    """
    with open(pname_cid_path, 'rb') as pkl_in:
        cid_by_pname = pickle.load(pkl_in)
    _test_seq_cid_map(cid_by_pname)
    # The return value is discarded, so the helper presumably edits the map
    # in place -- verify against download_pdb_files.
    download_pdb_files.trim_pname_cid(cid_by_pname, pdb_folder)
    _test_seq_cid_map(cid_by_pname)
    with open(pname_cid_path, 'wb') as pkl_out:
        pickle.dump(cid_by_pname, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)
    generic.quit_if_missing(pname_cid_path)
Esempio n. 7
0
def parse_extract_prosite(input_file, pname_cid_path):
    """
    Parse input_file with extract_parser.parse_prosite() (together with
    prosite_pdb_list.pdb_list) and pickle the resulting map.

    :param input_file: file to parse (e.g. paths.PROSITE_EXTRACT)
    :param pname_cid_path: pickle file to write (e.g. paths.PNAME_CID)
    """
    generic.quit_if_missing(input_file)
    known_pdbs = prosite_pdb_list.pdb_list
    cid_by_pname = extract_parser.parse_prosite(input_file, known_pdbs)
    _test_seq_cid_map(cid_by_pname)
    generic.warn_if_exist(pname_cid_path)
    with open(pname_cid_path, 'wb') as pkl_out:
        pickle.dump(cid_by_pname, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)
Esempio n. 8
0
def download_pdb(pname_cid_path, pdb_folder=paths.PDB_FOLDER):
    """
    Load the pickled map and pass it to download_pdb_files.download(),
    creating pdb_folder first when it does not exist yet.

    :param pname_cid_path: pickle file to read (e.g. paths.PNAME_CID)
    :param pdb_folder: folder handed to the downloader
    """
    generic.quit_if_missing(pname_cid_path)
    with open(pname_cid_path, 'rb') as pkl_in:
        cid_by_pname = pickle.load(pkl_in)
    _test_seq_cid_map(cid_by_pname)
    folder_present = os.path.isdir(pdb_folder)
    if not folder_present:
        logging.warning(f"PDB_folder in <{pdb_folder}> not found. Downloading "
                        f"from scratch takes a while.")
        os.mkdir(pdb_folder)
    download_pdb_files.download(cid_by_pname, pdb_folder)
Esempio n. 9
0
def create_seq(pname_cid_path, seq_path, pdb_folder=paths.PDB_FOLDER):
    """
    Write to seq_path the lines produced by
    create_seq_file.extract_sequence(), then run
    create_seq_file.test_fasta_match_pdb() on the written file.

    :param pname_cid_path: pickle file to read (e.g. paths.PNAME_CID)
    :param seq_path: sequence file to write (e.g. paths.FULL_SEQS)
    :param pdb_folder: folder forwarded to both create_seq_file helpers
    """
    generic.quit_if_missing(pname_cid_path)
    with open(pname_cid_path, 'rb') as pkl_in:
        cid_by_pname = pickle.load(pkl_in)
    _test_seq_cid_map(cid_by_pname)
    aa_table = generic.AA3_to_AA1
    fasta_lines = create_seq_file.extract_sequence(
        pdb_folder, cid_by_pname, AA3_to_AA1=aa_table)
    with open(seq_path, 'w') as fasta_out:
        fasta_out.writelines(fasta_lines)
    create_seq_file.test_fasta_match_pdb(
        seq_path, pdb_folder, cid_by_pname, aa_table)
Esempio n. 10
0
def run_meme_single(fasta_filename,
                    motif_len,
                    meme_output,
                    num_p=1,
                    meme_exec=paths.MEME_EXEC):
    """
    Invoke the meme executable through the shell, asking for one motif.

    :param fasta_filename: sequence file passed to meme
    :param motif_len: int >= 1, passed to meme's -w option
    :param meme_output: folder given to meme's -oc option
    :param num_p: value passed to meme's -p option
    :param meme_exec: executable to call
    :raises Exception: if the command exits with a non-zero return code
    """
    assert motif_len >= 1
    assert isinstance(motif_len, int)
    generic.quit_if_missing(fasta_filename)
    generic.warn_if_exist(meme_output, filetype='folder')
    command = (f"{meme_exec} -w {motif_len} -p {num_p} -protein -nmotifs 1 "
               f"-mod oops -oc {meme_output} {fasta_filename}")
    completed = subprocess.run(command, shell=True)
    if completed.returncode == 0:
        return
    logging.error("run_meme_single() failed.")
    logging.error(f"Command: <{command}>")
    raise Exception
def _get_id_seq_map(seq_path, line_extractor):
    """
    Build an id => sequence map from a file made of ">" header lines, each
    followed by one or more sequence lines.

    :param seq_path: path to the sequence file
    :param line_extractor: callable that receives a raw header line (the
        line starting with ">", newline included) and returns the key under
        which the record is stored
    :return: dict mapping each extracted id to its sequence, i.e. the
        record's lines stripped and concatenated. Records without any
        sequence line are omitted; lines before the first header are
        ignored.
    """
    generic.quit_if_missing(seq_path)
    id_seq_map = dict()
    current_id = None
    current_seq = []
    seen_header = False
    with open(seq_path, 'r') as file:
        for line in file:
            if line.startswith(">"):
                # Flush the finished record under ITS OWN id before the id
                # of the record that starts here replaces it.
                if seen_header and current_seq:
                    id_seq_map[current_id] = "".join(current_seq)
                current_seq = []
                current_id = line_extractor(line)
                seen_header = True
            else:
                current_seq.append(line.strip())
    # The last record has no following header to trigger a flush.
    if seen_header and current_seq:
        id_seq_map[current_id] = "".join(current_seq)
    return id_seq_map
def parse_raw(seq_path):
    """
    Build an id => sequence map from a uniprot-style sequence file.

    The id of a record is the second "|"-separated field of its stripped
    header line (the line starting with ">"). Only uniprot headers are
    supported.

    :param seq_path: path to the sequence file
    :return: dict mapping each id to its sequence, i.e. the record's lines
        stripped and concatenated. Records without any sequence line are
        omitted; lines before the first header are ignored.
    :raises IndexError: if a header line contains no "|"
    """
    generic.quit_if_missing(seq_path)
    id_seq_map = dict()
    current_id = None
    current_seq = []
    seen_header = False
    with open(seq_path, 'r') as file:
        for line in file:
            if line.startswith(">"):
                # Flush the finished record before switching to the new id.
                if seen_header and current_seq:
                    id_seq_map[current_id] = "".join(current_seq)
                current_seq = []
                current_id = line.strip().split("|")[1]
                seen_header = True
            else:
                current_seq.append(line.strip())
    # The last record has no following header to trigger a flush; join its
    # own lines rather than reusing the previous record's sequence.
    if seen_header and current_seq:
        id_seq_map[current_id] = "".join(current_seq)
    return id_seq_map
Esempio n. 13
0
def run_prosite_meme(extract_path,
                     motif_len,
                     output,
                     num_p=7,
                     pdb_folder=paths.PDB_FOLDER,
                     storage_path=None):
    """
    Run the prosite pipeline end to end: parse the extract, download and
    trim against the pdb files, create and filter the sequence file, then
    call find_motifs_meme().

    :param extract_path: file to parse (e.g. paths.PROSITE_EXTRACT)
    :param motif_len: forwarded to find_motifs_meme (e.g. 13)
    :param output: forwarded to find_motifs_meme (e.g. paths.PID_PDB_MAP)
    :param num_p: int >= 1, forwarded to find_motifs_meme
    :param pdb_folder: folder used by the download/trim/create_seq steps
    :param storage_path: existing folder in which intermediate files are
        kept. When None, the default locations from paths are used and the
        intermediates are moved to paths.TRASH at the end.
    """
    generic.quit_if_missing(extract_path)
    generic.warn_if_exist(output)
    assert isinstance(num_p, int)
    assert num_p >= 1

    use_default_paths = storage_path is None
    if use_default_paths:
        pname_cid_path, seq_path, meme_folder = (
            paths.PNAME_CID, paths.FULL_SEQS, paths.MEME_MAST_FOLDER)
    else:
        generic.quit_if_missing(storage_path, filetype='folder')
        pname_cid_path = os.path.join(storage_path, 'pname_cid_map.pkl')
        seq_path = os.path.join(storage_path, 'seqs.fasta')
        meme_folder = os.path.join(storage_path, 'meme_folder')

    parse_extract_prosite(extract_path, pname_cid_path)
    download_pdb(pname_cid_path, pdb_folder)
    trim_pnames_based_on_pdb(pname_cid_path, pdb_folder)
    create_seq(pname_cid_path, seq_path, pdb_folder)
    filter_seq_file(seq_path, threshold=31)
    find_motifs_meme(pname_cid_path, seq_path, motif_len, output, meme_folder,
                     num_p)

    if use_default_paths:
        # Intermediates in the default locations are not kept around.
        for leftover in (pname_cid_path, seq_path, meme_folder):
            shutil.move(leftover, paths.TRASH)
Esempio n. 14
0
def run_prosite_mast(extract_path,
                     motif_len,
                     ref_meme_txt,
                     output,
                     pdb_folder=paths.PDB_FOLDER,
                     storage_path=None):
    """
    Run the prosite pipeline end to end: parse the extract, download and
    trim against the pdb files, create and filter the sequence file, then
    call find_motifs_mast() with a reference meme.txt.

    :param extract_path: file handed to parse_extract_prosite()
    :param motif_len: forwarded to find_motifs_mast (e.g. 13)
    :param ref_meme_txt: reference meme.txt (e.g. paths.REF_MEME_TXT)
    :param output: forwarded to find_motifs_mast (e.g. paths.PID_PDB_MAP)
    :param pdb_folder: folder used by the download/trim/create_seq steps
    :param storage_path: existing folder in which intermediate files are
        kept. When None, the default locations from paths are used and the
        intermediates are moved to paths.TRASH at the end.
    """
    for required in (extract_path, ref_meme_txt):
        generic.quit_if_missing(required)
    generic.warn_if_exist(output)

    use_default_paths = storage_path is None
    if use_default_paths:
        pname_cid_path, seq_path, meme_folder = (
            paths.PNAME_CID, paths.FULL_SEQS, paths.MEME_MAST_FOLDER)
    else:
        generic.quit_if_missing(storage_path, filetype='folder')
        pname_cid_path = os.path.join(storage_path, 'pname_cid_map.pkl')
        seq_path = os.path.join(storage_path, 'seqs.fasta')
        meme_folder = os.path.join(storage_path, 'meme_folder')

    parse_extract_prosite(extract_path, pname_cid_path)
    download_pdb(pname_cid_path, pdb_folder)
    trim_pnames_based_on_pdb(pname_cid_path, pdb_folder)
    create_seq(pname_cid_path, seq_path, pdb_folder)
    filter_seq_file(seq_path, threshold=31)
    find_motifs_mast(pname_cid_path, seq_path, ref_meme_txt, motif_len, output,
                     meme_folder)

    if use_default_paths:
        # Intermediates in the default locations are not kept around.
        for leftover in (pname_cid_path, seq_path, meme_folder):
            shutil.move(leftover, paths.TRASH)
Esempio n. 15
0
def run_prosite_aligned(seq_file, aligned_seq_file, output, storage_path=None):
    """
    Derive motif positions from a sequence file plus an aligned sequence
    file, and pickle them keyed by pdb id.

    Steps:
    1. Build a composition file from seq_file and a meme.txt from
       aligned_seq_file (the latter also receives the composition file).
    2. Parse seq_file into acc => seq, convert the acc ids to pdb ids, and
       look up a cid for each pdb id. So acc+seq => pdb+cid.
    3. Write a cropped sequence file that keeps only the acc ids for which
       a pdb+cid was found.
    4. Run mast on the cropped sequence file and extract acc => motif_pos.
    5. Re-key the result by pdb id and pickle it to output.

    Intermediate files:
    composition.txt, meme_from_aligned.txt, cropped_seqs.fasta, and the
    mast output folder.

    :param seq_file: sequence file; must exist
    :param aligned_seq_file: aligned sequence file; must exist
    :param output: pickle file to write. The pickled object is a
        defaultdict(dict) of the form
        {pdb_id: {'sno_markers': motif_pos, 'cid': cid}}
    :param storage_path: existing folder in which intermediate files are
        kept. When None, temporary/default locations from paths are used
        and the intermediates are moved to paths.TRASH at the end.
    """
    generic.quit_if_missing(seq_file)
    generic.quit_if_missing(aligned_seq_file)
    generic.warn_if_exist(output)
    # Choose where the intermediate files live.
    if storage_path is None:
        composition_file = paths.TMP_FILE_TEMPLATE.format('composition.txt')
        meme_txt = paths.TMP_FILE_TEMPLATE.format('meme_from_aligned.txt')
        meme_folder = paths.MEME_MAST_FOLDER
        cropped_seq_file = paths.TMP_FILE_TEMPLATE.format('cropped_seqs.fasta')
    else:
        generic.quit_if_missing(storage_path, filetype='folder')
        composition_file = os.path.join(storage_path, 'composition.txt')
        meme_txt = os.path.join(storage_path, 'meme_from_aligned.txt')
        meme_folder = os.path.join(storage_path, 'meme_mast_folder')
        cropped_seq_file = os.path.join(storage_path, 'cropped_seqs.fasta')
    # Remove a stale composition file before it is rebuilt.
    if os.path.isfile(composition_file):
        os.remove(composition_file)
    build_composition.build(seq_file, composition_file)
    build_meme_from_aligned.build(aligned_seq_file, meme_txt, composition_file)

    acc_seq_map = get_pname_seq.parse_raw(seq_file)
    acc_ids = list(acc_seq_map.keys())
    # NOTE(review): the result is used below as acc => single pdb id (the
    # value becomes a dict key) -- confirm against uniprot_id_converter.
    acc_pdb_map = uniprot_id_converter.convert("ACC", "PDB_ID", acc_ids)

    pdb_seq_map = dict()
    # because pdb => acc mapping may not be 1-1, we retain the original maps
    # When several acc ids share a pdb id, the one iterated last wins in
    # both dicts below.
    mapped_pdb_acc = dict()
    for acc_id, seq in acc_seq_map.items():
        if acc_id in acc_pdb_map:
            pdb_id = acc_pdb_map[acc_id]
            mapped_pdb_acc[pdb_id] = acc_id
            pdb_seq_map[pdb_id] = seq

    pdb_cid_map = find_cid_from_pname.find(pdb_seq_map)
    # Re-key by acc id: acc => (pdb, cid).
    acc_pdb_cid_map = {
        mapped_pdb_acc[pdb]: (pdb, cid)
        for pdb, cid in pdb_cid_map.items()
    }
    cropped_acc_list = list(acc_pdb_cid_map.keys())

    keep_only_acc(cropped_acc_list, seq_file, cropped_seq_file)
    meme_interface.run_mast(meme_txt, cropped_seq_file, meme_folder)
    mast_txt_path = os.path.join(meme_folder, 'mast.txt')
    # NOTE(review): 14 is hard-coded here; presumably the motif length
    # expected by extract_motifs_mast_uniprot -- TODO confirm.
    acc_motif_map = meme_interface.extract_motifs_mast_uniprot(
        mast_txt_path, 14)
    acc_motif_map = motif_finder._delete_gapped_motifs_uniprot(
        acc_motif_map, cropped_seq_file)
    # Translate acc-keyed motif positions back to pdb ids for the output.
    pdb_motif_pos = defaultdict(dict)
    for acc, motif_pos in acc_motif_map.items():
        pdb_id, cid = acc_pdb_cid_map[acc]
        pdb_motif_pos[pdb_id]['sno_markers'] = motif_pos
        pdb_motif_pos[pdb_id]['cid'] = cid
    with open(output, 'wb') as file:
        pickle.dump(pdb_motif_pos, file, -1)
    # Intermediates in the default locations are not kept around.
    if storage_path is None:
        shutil.move(composition_file, paths.TRASH)
        shutil.move(meme_txt, paths.TRASH)
        shutil.move(meme_folder, paths.TRASH)
        shutil.move(cropped_seq_file, paths.TRASH)
    return
Esempio n. 16
0
def filter_seq_file(seq_path, threshold=50):
    """
    Run filter_seqs.delete_short_seqs() on an existing sequence file.

    :param seq_path: sequence file to filter (e.g. paths.FULL_SEQS)
    :param threshold: forwarded unchanged to delete_short_seqs; presumably
        a sequence-length cutoff -- confirm against filter_seqs
    """
    generic.quit_if_missing(seq_path)
    filter_seqs.delete_short_seqs(seq_path, threshold)