def read_and_write(inp_mol2_path, out_mol2_path, verbose):
    if verbose:
        sys.stdout.write('Processing %s' % os.path.basename(inp_mol2_path))
        sys.stdout.flush()
        start = time.time()

    # Gzipped output must be opened with gzip and written in binary mode.
    if inp_mol2_path.endswith('.gz'):
        write_mode = 'wb'
        open_file = gzip.open
    else:
        write_mode = 'w'
        open_file = open

    with open_file(out_mol2_path, write_mode) as outfile:
        prev_molecule = ''
        if inp_mol2_path.endswith('.gz'):
            # Gzipped input yields bytes IDs and lines.
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
                    cnt += 1
                mol_idx = b'%s_%d' % (id_, cnt)
                cont[1] = mol_idx + b'\n'
                outfile.write(b''.join(cont))
                prev_molecule = id_
        else:
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
                    cnt += 1
                mol_idx = '%s_%d' % (id_, cnt)
                cont[1] = mol_idx + '\n'
                outfile.write(''.join(cont))
                prev_molecule = id_

    if verbose:
        elapsed = time.time() - start
        n_molecules = i + 1
        sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                         (n_molecules, n_molecules / elapsed))
        sys.stdout.flush()
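# A minimal sketch of the split_multimol2 contract these functions rely on,
# using a hypothetical input file 'molecules.mol2': the generator lazily yields
# [molecule_id, lines] pairs, and for '.gz' input both elements are bytes
# rather than str, which is why read_and_write branches on the file suffix.
from biopandas.mol2 import split_multimol2

for mol2_id, mol2_lines in split_multimol2('molecules.mol2'):
    # mol2_id is the identifier line that follows '@<TRIPOS>MOLECULE';
    # mol2_lines holds the raw lines of that record, newlines included.
    print(mol2_id, len(mol2_lines))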
def read_and_write(mol2_files, id_file_path, verbose, n_cpus):
    if verbose:
        sys.stdout.write('Using selection: %s\n' % SELECTION)
        sys.stdout.flush()

    with open(id_file_path, 'w') as f:
        for mol2_file in mol2_files:
            if verbose:
                start = time.time()
                sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
                sys.stdout.flush()

            cnt = 0
            # Gzipped multi-mol2 files yield bytes, so they need a separate processor.
            if mol2_file.endswith('.gz'):
                data_processor_fn = data_processor_gz
            else:
                data_processor_fn = data_processor

            for chunk in lazy_imap(data_processor=data_processor_fn,
                                   data_generator=split_multimol2(mol2_file),
                                   n_cpus=n_cpus):
                for mol2_id in chunk:
                    if mol2_id:
                        f.write('%s\n' % mol2_id)
                cnt += len(chunk)

            if verbose:
                elapsed = time.time() - start
                sys.stdout.write(' | %d mol/sec\n' % (cnt / elapsed))
                sys.stdout.flush()
def process_ligands(target):
    """Get information for all ligands associated with the target."""
    ligand_list = []
    num_atoms_target = len(protein_dict[target]["node_features"])
    for fname in ["actives_final.mol2", "decoys_final.mol2"]:
        # Label: 1 for actives, 0 for decoys.
        response = int(fname.startswith("a"))
        # Split the mol2 file with multiple ligands by ligand. This yields a
        # list of pair sub-lists, the first element of which is the ligand
        # code, and the second of which is the associated coordinate and
        # bond text.
        curr_info = list(split_multimol2(f"raw/{target}/{fname}"))
        curr_info = [[f"{target}_{entry[0]}"]
                     + process_ligand_text(entry[1], num_atoms_target)
                     + [[[response]]]
                     for entry in curr_info]
        curr_info = [
            dict(zip(("id", "node_features", "graph", "targets"), curr_info[i]))
            for i in range(len(curr_info))
        ]
        ligand_list += curr_info
    return ligand_list
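# Sketch of how the dictionaries returned by process_ligands might be consumed,
# assuming a hypothetical DUD-E style target name "abl1" and that
# process_ligand_text returns [node_features, graph] as the key order above
# implies; the value stored under "targets" is [[1]] for actives and [[0]]
# for decoys.
for ligand in process_ligands("abl1"):
    label = ligand["targets"][0][0]
    print(ligand["id"], "active" if label else "decoy")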
def ligands_reader():
    '''
    Parses the selected MOL2 file with structures of previously docked
    ligands using the BioPandas module. Lists all atoms from all ligands
    with their coordinates.

    :return: atom symbols, numbers, and coordinates, plus the ligand model number
    :rtype: list of lists
    '''
    window = Tk()
    path = os.path.normpath(os.getcwd() + os.sep + os.pardir)
    path = os.path.join(path, 'files')
    ligands_path_string = filedialog.askopenfilename(
        initialdir=path,
        title="SELECT LIGANDS STRUCTURE:",
        filetypes=(("MOL2 files", "*.mol2"), ("all files", "*.*")))
    ligands_name = os.path.basename(ligands_path_string)
    window.destroy()

    ligands_data = []
    model_number = 1
    # split_multimol2 opens the file itself, so no separate open() is needed.
    for ligand in split_multimol2(ligands_path_string):
        pmol = PandasMol2().read_mol2_from_list(mol2_lines=ligand[1],
                                                mol2_code=ligand[0])
        atom_coord = pmol.df[['atom_name', 'atom_id', 'x', 'y', 'z']]
        atom_coord = atom_coord.assign(column=model_number)
        model_number += 1
        model_data = atom_coord.values.tolist()
        ligands_data = ligands_data + model_data
    return ligands_data
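# Optional follow-up sketch: the list of lists returned by ligands_reader() can
# be wrapped in a DataFrame for inspection. The column names mirror the slice
# taken above plus the appended model number (the field added via assign()).
import pandas as pd

ligand_atoms = ligands_reader()
atoms_df = pd.DataFrame(
    ligand_atoms,
    columns=['atom_name', 'atom_id', 'x', 'y', 'z', 'model_number'])
print(atoms_df.groupby('model_number').size())  # atoms per docked ligand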
def read_and_write(q_path, d_path, verbose, cache, output_file, n_cpus):
    dct_results = {'dbase': [], 'query': [], 'atoms': [], 'charges': []}

    d_base = os.path.basename(d_path)
    q_base = os.path.basename(q_path)

    if verbose:
        start = time.time()
        sys.stdout.write('Processing %s/%s' % (d_base, q_base))
        sys.stdout.flush()

    cnt = 0
    if q_path.endswith('.gz'):
        data_processor_fn = data_processor_gz
    else:
        data_processor_fn = data_processor

    for chunk in lazy_imap(data_processor=data_processor_fn,
                           data_generator=zip(split_multimol2(d_path),
                                              split_multimol2(q_path)),
                           n_cpus=n_cpus):
        for dbase_id, query_id, atoms, charges in chunk:
            dct_results['dbase'].append(dbase_id)
            dct_results['query'].append(query_id)
            dct_results['atoms'].append(atoms)
            dct_results['charges'].append(charges)
        cnt += len(chunk)

    # Serial (single-process) variant of the loop above, left as a reference:
    """
    q_pdmol = PandasMol2()
    d_pdmol = PandasMol2()

    for q_mol2, d_mol2 in zip(split_multimol2(q_path),
                              split_multimol2(d_path)):
        cnt += 1
        d_pdmol.read_mol2_from_list(mol2_code=d_mol2[0], mol2_lines=d_mol2[1])
        d_pdmol._df = d_pdmol.df[(d_pdmol.df['atom_type'] != 'H')]

        if q_mol2[0] in cache:
            q_pdmol = cache[q_mol2[0]]
        else:
            q_pdmol.read_mol2_from_list(mol2_code=q_mol2[0], mol2_lines=q_mol2[1])
            q_pdmol._df = q_pdmol.df[(q_pdmol.df['atom_type'] != 'H')]
            cache[q_mol2[0]] = q_pdmol

        atoms, charges = get_atom_matches(q_pdmol, d_pdmol)
        dct_results['query'].append(q_mol2[0])
        dct_results['dbase'].append(d_mol2[0])
        dct_results['atoms'].append(atoms)
        dct_results['charges'].append(charges)
    """

    with open(output_file + '_charge.tsv', 'w') as f1,\
            open(output_file + '_atomtype.tsv', 'w') as f2:

        columns = PandasMol2().read_mol2(q_path).df['atom_name'].values
        f1.write('dbase\tquery\t%s\n' % '\t'.join(columns))
        f2.write('dbase\tquery\t%s\n' % '\t'.join(columns))

        for i in range(len(dct_results['dbase'])):
            s1 = '%s\t%s\t%s\n' % (dct_results['dbase'][i],
                                   dct_results['query'][i],
                                   '\t'.join(format(x, "1.2f")
                                             for x in dct_results['charges'][i]))
            f1.write(s1)
            s2 = '%s\t%s\t%s\n' % (dct_results['dbase'][i],
                                   dct_results['query'][i],
                                   '\t'.join(dct_results['atoms'][i]))
            f2.write(s2)

    if verbose:
        elapsed = time.time() - start
        n_molecules = cnt + 1
        sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                         (n_molecules, n_molecules / elapsed))
        sys.stdout.flush()
def read_and_write(inp_mol2_path, report_path, output_dir, query_path,
                   sortby, separator, verbose, id_suffix, selection):
    if verbose:
        sys.stdout.write('Processing %s' % os.path.basename(inp_mol2_path))
        sys.stdout.flush()

    df = pd.read_table(report_path,
                       usecols=['Name', 'ShapeQuery'] + sortby,
                       sep=separator)
    if sortby:
        df.sort_values(sortby, inplace=True, ascending=False)

    if selection:
        selection_str = parse_selection_string(selection, df_name='df')
        mask = pd.eval(selection_str)
        df = df[mask]

    dbase_query_pairs = [(d, q) for d, q in
                         zip(df['Name'].values, df['ShapeQuery'].values)]
    query_names = {q for q in df['ShapeQuery'].values}
    query_mol2s = {}

    # The query file is treated as multi-conformer if it holds more than one record.
    multiconf_query = False
    for idx, cont in enumerate(split_multimol2(query_path)):
        if idx >= 1:
            multiconf_query = True
            break

    cnt = -1
    if query_path.endswith('.gz'):
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            # Gzipped input yields bytes; decode to text and split into lines
            # (the trailing newlines are removed by the split).
            cont = b''.join(cont).decode('utf-8').split('\n')
            if multiconf_query:
                mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
            else:
                mol_idx = id_.decode('utf-8')
            if mol_idx in query_names:
                if id_suffix:
                    cont[1] = mol_idx
                query_mol2s[mol_idx] = '\n'.join(cont)
    else:
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            if multiconf_query:
                mol_idx = '%s_%d' % (id_, cnt)
            else:
                mol_idx = id_
            if mol_idx in query_names:
                if id_suffix:
                    cont[1] = mol_idx + '\n'
                query_mol2s[mol_idx] = ''.join(cont)

    out_path_base = os.path.join(
        output_dir, os.path.basename(inp_mol2_path).split('.mol2')[0])
    out_path_q = '%s_%s' % (out_path_base, 'query.mol2')
    out_path_d = '%s_%s' % (out_path_base, 'dbase.mol2')

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Cache each database molecule on disk so it can be written out in the
        # order given by the (sorted and filtered) report.
        for id_, cont in split_multimol2(inp_mol2_path):
            if id_:
                tmp_path = os.path.join(tmpdirname, id_)
                with open(tmp_path, 'wb') as f:
                    pickle.dump(''.join(cont), f)

        with open(out_path_d, 'w') as dof,\
                open(out_path_q, 'w') as qof:

            if verbose:
                start = time.time()

            cnt = 0
            for d, q in dbase_query_pairs:
                cnt += 1
                qof.write(query_mol2s[q])
                with open(os.path.join(tmpdirname, d), 'rb') as pkl:
                    pkl_cont = pickle.load(pkl)
                dof.write(pkl_cont)

    if verbose:
        elapsed = time.time() - start
        n_molecules = cnt + 1
        sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                         (n_molecules, n_molecules / elapsed))
        sys.stdout.flush()
def main(input_dir, output_dir, atomtype_selection, charge_selection,
         input_mol2, verbose):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    all_tsv_base = [f for f in os.listdir(input_dir) if f.endswith('.tsv')]
    all_tsv_full = [os.path.join(input_dir, f) for f in all_tsv_base]

    a_inlist, c_inlist = get_tsv_pairs(all_tsv_full)
    a_outlist, c_outlist = get_tsv_pairs(all_tsv_base)
    a_outlist = [os.path.join(output_dir, f) for f in a_outlist]
    c_outlist = [os.path.join(output_dir, f) for f in c_outlist]

    for a_in, a_out, c_in, c_out in zip(a_inlist, a_outlist,
                                        c_inlist, c_outlist):

        if verbose:
            start = time.time()
            sys.stdout.write('Processing %s/%s' % (os.path.basename(a_in),
                                                   os.path.basename(c_in)))
            sys.stdout.flush()

        df_charge = pd.read_table(c_in, sep='\t')
        for c in df_charge.columns[2:]:
            df_charge[c] = pd.to_numeric(df_charge[c])
        df_atom = pd.read_table(a_in, sep='\t')
        mol2_cnt = df_atom.shape[0]

        if atomtype_selection:
            atom_sele = parse_selection_string(s=atomtype_selection,
                                               columns=df_atom.columns,
                                               df_name='df_atom')
            for sele in atom_sele:
                df_atom = pd.eval(sele)
        if charge_selection:
            charge_sele = parse_selection_string(s=charge_selection,
                                                 columns=df_charge.columns,
                                                 df_name='df_charge')
            for sele in charge_sele:
                df_charge = pd.eval(sele)

        # Keep only rows that satisfy both the atom-type and the charge selection.
        selection_indices = set(df_charge.index).intersection(
            set(df_atom.index))
        selection_indices = sorted(list(selection_indices))

        df_atom.loc[selection_indices].to_csv(a_out, sep='\t')
        df_charge.loc[selection_indices].to_csv(c_out, sep='\t')

        if input_mol2:
            input_mol2_path_query = os.path.join(
                input_mol2,
                os.path.basename(c_out).replace('_charge.tsv', '_query.mol2'))
            input_mol2_path_dbase = input_mol2_path_query.replace(
                '_query.mol2', '_dbase.mol2')

            if not os.path.exists(input_mol2_path_query)\
                    and os.path.exists(input_mol2_path_query + '.gz'):
                input_mol2_path_query += '.gz'
            if not os.path.exists(input_mol2_path_dbase)\
                    and os.path.exists(input_mol2_path_dbase + '.gz'):
                input_mol2_path_dbase += '.gz'

            output_mol2_path_query = os.path.join(
                output_dir,
                os.path.basename(c_out).replace('_charge.tsv', '_query.mol2'))
            output_mol2_path_dbase = output_mol2_path_query.replace(
                '_query.mol2', '_dbase.mol2')

            if input_mol2_path_query.endswith('.gz'):
                output_mol2_path_query += '.gz'
                query_write_mode = 'wb'
                query_open_file = gzip.open
            else:
                query_write_mode = 'w'
                query_open_file = open

            if input_mol2_path_dbase.endswith('.gz'):
                output_mol2_path_dbase += '.gz'
                dbase_write_mode = 'wb'
                dbase_open_file = gzip.open
            else:
                dbase_write_mode = 'w'
                dbase_open_file = open

            with query_open_file(output_mol2_path_query,
                                 query_write_mode) as opq,\
                    dbase_open_file(output_mol2_path_dbase,
                                    dbase_write_mode) as opd:

                for i in selection_indices:
                    mol2_q_cont = ('DID NOT FIND %s\n'
                                   % (df_atom.loc[i]['query']))
                    mol2_d_cont = ('DID NOT FIND %s\n'
                                   % (df_atom.loc[i]['dbase']))

                    # Scan the paired mol2 files for the record at position i.
                    for idx, mol2 in enumerate(
                            split_multimol2(input_mol2_path_query)):
                        if idx == i:
                            mol2_q_cont = mol2[1]
                            break
                    for idx, mol2 in enumerate(
                            split_multimol2(input_mol2_path_dbase)):
                        if idx == i:
                            mol2_d_cont = mol2[1]
                            break

                    if query_write_mode == 'wb':
                        opq.write(b''.join(mol2_q_cont))
                    else:
                        opq.write(''.join(mol2_q_cont))
                    if dbase_write_mode == 'wb':
                        opd.write(b''.join(mol2_d_cont))
                    else:
                        opd.write(''.join(mol2_d_cont))

        if verbose:
            elapsed = time.time() - start
            n_molecules = mol2_cnt
            sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                             (n_molecules, n_molecules / elapsed))
            sys.stdout.flush()
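# Usage sketch for main() above with hypothetical directory names; both
# selection arguments are left empty here because the selection-string grammar
# is defined by parse_selection_string elsewhere in the project.
main(input_dir='tsv_results',
     output_dir='tsv_filtered',
     atomtype_selection='',
     charge_selection='',
     input_mol2='mol2_pairs',
     verbose=1)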
def main(config):
    start_time = time.time()
    env_path = config
    load_dotenv(dotenv_path=env_path)

    converted_mols = {}
    pdbs = {}

    input_list = pd.read_csv(os.getenv('INPUT_LIST'), sep=';')
    mols2 = split_multimol2(os.getenv('MOL2_FILE'))
    pdb_path = os.getenv('PDBS_FILE_FOLDER')

    # Load every ensemble PDB, keyed by the numeric suffix of its file name.
    for pdb in os.listdir(pdb_path):
        name = pdb.split('_')[-1].replace('.pdb', '')
        pdbs[name] = PandasPdb().read_pdb('{}/{}'.format(pdb_path, pdb)).df

    # Index the docked poses by the second '|'-separated field of the mol2 code.
    for mol2 in mols2:
        pmol = PandasMol2().read_mol2_from_list(mol2_lines=mol2[1],
                                                mol2_code=mol2[0])
        converted_mols[pmol.code.split("|")[1]] = pmol.df

    def create_atom_dimensional_position(x, y, z):
        return np.array((x, y, z))

    def atom_is_close_to_atom(residue_atom, atom):
        return np.linalg.norm(residue_atom - atom) <= float(
            os.getenv('MAX_DISTANCE'))

    def remove_hydrogen_atoms(df):
        # Drops every ATOM record whose atom name contains an 'H'.
        return df['ATOM'][~df['ATOM']['atom_name'].str.contains('H')]

    def get_interactions_molecule_for_residues(molecule, residues):
        matched_atoms = []
        mol_points = []
        for midx, mol_row in molecule.iterrows():
            mol_points.append(
                create_atom_dimensional_position(
                    x=mol_row['x'],
                    y=mol_row['y'],
                    z=mol_row['z'],
                ))
        for pidx, protein_row in remove_hydrogen_atoms(residues).iterrows():
            protein_tag = '{}_{}'.format(protein_row['residue_name'],
                                         protein_row['residue_number'])
            protein_ad = create_atom_dimensional_position(
                x=protein_row['x_coord'],
                y=protein_row['y_coord'],
                z=protein_row['z_coord'])
            for point in mol_points:
                if atom_is_close_to_atom(
                        protein_ad, point) and protein_tag not in matched_atoms:
                    matched_atoms.append(protein_tag)
        return matched_atoms

    def interact_with_expected_residues(item, expected_residues):
        print('> {}'.format(item['NAME']))
        m = converted_mols[item['NAME']]
        r = pdbs[str(int(item['Gold.Ensemble.ID']))]
        interactions = get_interactions_molecule_for_residues(m, r)
        return [key for key in interactions if key in expected_residues]

    pre_dataframe = pd.DataFrame(columns=[
        'molecule_name', 'pdb', 'score', 'residues', 'residues_quantity'
    ])

    print('Start Molecule Analyze')
    for i, row in input_list.iterrows():
        reactions = interact_with_expected_residues(
            row, list(os.getenv('RESIDUES').split(",")))
        # Note: DataFrame.append requires pandas < 2.0; newer pandas needs pd.concat.
        pre_dataframe = pre_dataframe.append(
            {
                'molecule_name': row['NAME'],
                'pdb': row['Gold.Ensemble.ID'],
                'score': float(row['Gold.Chemscore.Fitness']),
                'residues': ", ".join(reactions),
                'residues_quantity': len(reactions),
            },
            ignore_index=True)

    dataframe = pd.DataFrame(columns=[
        'molecule_name', 'v', 'pdb', 'score', 'residues', 'residues_quantity'
    ])

    # Merge variants of the same molecule (names differing only by an
    # uppercase letter tag), keeping the best-scoring pose.
    cached_mol_names = []
    for i, row in pre_dataframe.iterrows():
        new_name = re.sub(r'[A-Z]', '', row['molecule_name'])
        _var = re.search(r'[A-Z]', row['molecule_name'])
        _var = _var.group(0) if _var else ''
        if new_name in cached_mol_names:
            old_score = float(dataframe.loc[dataframe['molecule_name'] ==
                                            new_name]['score'].values[0])
            idx = dataframe.index[dataframe['molecule_name'] == new_name][0]
            if old_score < row['score']:
                dataframe.loc[idx, 'v'] = _var
                dataframe.loc[idx, 'pdb'] = row['pdb']
                dataframe.loc[idx, 'score'] = row['score']
                dataframe.loc[idx, 'residues'] = row['residues']
                dataframe.loc[idx, 'residues_quantity'] = row['residues_quantity']
        else:
            cached_mol_names.append(new_name)
            dataframe = dataframe.append(
                {
                    'molecule_name': new_name,
                    'v': _var,
                    'pdb': row['pdb'],
                    'score': row['score'],
                    'residues': row['residues'],
                    'residues_quantity': row['residues_quantity'],
                },
                ignore_index=True)

    # Write the ranked results as a ';'-separated CSV.
    body = dataframe.sort_values(by=['residues_quantity'],
                                 ascending=False).to_csv(index=False, sep=';')
    with open(os.getenv('OUTPUT_LIST'), "w") as file:
        file.write(body)

    print("Process took: %s seconds" % (time.time() - start_time))