def get_smiles_from_csd():
    """Read each CSD identifier and collect the SMILES of its two components.

    Reads ``datasets/train_data/cocrystals2020.csv`` (latin1 encoded). For
    every value in its ``csd_id`` column the molecule is fetched from the CSD,
    its SMILES string is split on ``'.'`` and the pieces are passed through
    ``Remove``; the first two items of the result are recorded.

    Returns:
        tuple: ``(co_crystals, smiles1, smiles2)`` -- the DataFrame read from
        the CSV plus two lists holding, row by row, the first and the second
        SMILES component of each entry.

    Raises:
        IndexError: if ``Remove`` hands back fewer than two components for an
            entry (the result is indexed at 0 and 1).
    """
    co_crystals = pd.read_csv('datasets/train_data/cocrystals2020.csv',
                              encoding='latin1')

    # Open the database readers once; they do not depend on the loop variable,
    # so re-creating them for every identifier was wasted work.
    csd = MoleculeReader('CSD')
    csd_reader = io.EntryReader('CSD')

    smiles1 = []
    smiles2 = []
    # Publication years are only consumed by the disabled export below.
    year = []
    for refcode in co_crystals.csd_id.values:
        year.append(csd_reader.entry(refcode).publication.year)
        components = Remove(csd.molecule(refcode).smiles.split('.'))
        smiles1.append(components[0])
        smiles2.append(components[1])

    # Disabled export of the combined table:
    #cocrystal_data = pd.concat([co_crystals , pd.DataFrame(smiles1, columns=['smiles1']), pd.DataFrame(smiles2, columns=['smiles2']),
    #pd.DataFrame(year, columns=['year'])], axis=1)
    #cocrystal_data.to_csv('datasets/train_data/all_cocrystals_info.csv')
    return co_crystals, smiles1, smiles2
Exemple #2
0
def create_het_chembl_mapping_dict(filename=None):
    '''
    Read an sdf overlays file of PDB hetcodes and look up, through the PDBe
    compound-mappings web service, the ChEMBL id(s) recorded for each hetcode.

    The hetcode is taken from the third ``_``-separated field of every
    molecule identifier in the file.

    :param filename: sdf file for overlays
    :return: dictionary whose keys are ChEMBL ids and whose values are the
             hetcode each id was found for
    :raises IndexError: if an identifier has fewer than three ``_``-separated
                        fields
    '''
    het_list = [
        entry.molecule.identifier.split("_")[2]
        for entry in io.EntryReader(filename)
    ]

    het_chembl_mapping_dict = {}
    # The same hetcode usually appears in many overlays; remember each answer
    # so the web service is queried only once per distinct hetcode while the
    # mapping is still filled in the original file order.
    responses = {}
    for het in het_list:
        if het not in responses:
            compound_url = 'https://www.ebi.ac.uk/pdbe/api/pdb/compound/mappings/{}'.format(het)
            responses[het] = SearchAPI(search_url=compound_url).run_search()
        pdb_list_dict = responses[het]
        if not pdb_list_dict:
            continue
        for mappings in pdb_list_dict.values():
            for mapping in mappings:
                chembl_id = mapping.get('chembl_id')
                if chembl_id:
                    het_chembl_mapping_dict[chembl_id] = het
    return het_chembl_mapping_dict
    def __init__(self, cds_ids: List[str]) -> None:
        """Set up the lookup tables, patterns and reader used for parsing.

        Args:
            cds_ids (List[str]): list of CSD database identifiers

        Returns:
            None
        """
        # Lookup table and its inverse (keys and values swapped).
        forward_lookup = SymbolNameDict().get_symbol_name_dict()
        self.symbol_name_dict = forward_lookup
        self.name_symbol_dict = dict(
            (value, key) for key, value in forward_lookup.items())

        # One alternation over all values of the lookup table; every pattern
        # below is built around it.
        alternation = "|".join(forward_lookup.values())
        self.symbol_regex = re.compile(alternation)

        flags = re.IGNORECASE
        # <value> followed by a parenthesised run of "i", "v" or "0".
        self.regex = re.compile(
            "((?:{})\\([iv0]+\\))".format(alternation), flags)
        # <value> with no opening parenthesis anywhere after it.
        self.not_ox_regex = re.compile(
            "((?:{})[^\\(]*$)".format(alternation), flags)
        # <value> followed by a parenthesised negative integer.
        self.negative_regex = re.compile(
            "((?:{})\\(-[1234567890]+\\))".format(alternation), flags)

        self.csd_ids = cds_ids
        self.csd_reader = io.EntryReader("CSD")
Exemple #4
0
def generate_id_list(num_samples=1009141):
    """Sample some random entries from the CSD.

    Args:
        num_samples: number of distinct entries to draw.

    Returns:
        list: identifiers of the sampled entries, in the order they were drawn.

    Raises:
        ValueError: raised by ``random.sample`` when ``num_samples`` is larger
            than the number of entries in the reader.
    """
    csd_reader = io.EntryReader('CSD')
    # random.sample works on a range directly, so the full list of positions
    # never has to be built in memory.
    positions = random.sample(range(len(csd_reader)), num_samples)
    return [csd_reader[position].identifier for position in positions]
Exemple #5
0
def extractCrystalCif(filepath):
    """
    Read ``<filepath>.cif``, log and display the crystal of its first entry,
    and return that crystal.

    TODO: Can i store molecule information inside and that'll be in crystal?
    :param filepath: path of the CIF file without the ``.cif`` extension
    :return: the ``crystal`` attribute of the first entry in the file
    """
    cif_path = filepath + ".cif"
    first_entry = io.EntryReader(cif_path)[0]
    # The crystal is looked up anew for each use, exactly as before.
    logger.info(first_entry.crystal)
    displayCrystal(first_entry.crystal)
    return first_entry.crystal
Exemple #6
0
    def get_neighbor_atoms(self, query_atom, path_mol2_file_dir):
        """Get the ligating (coordinating) atoms of a metal.

        The first entry of every ``*.mol2`` file in ``path_mol2_file_dir`` is
        loaded and ``self.entry_reader`` is replaced by that list. For each
        entry the element symbols of all neighbours of atoms matching
        ``query_atom`` are gathered into a set, so an element is counted at
        most once per entry.

        :param query_atom: metal element, e.g. Sr, K, Na; type: string
        :param path_mol2_file_dir: absolute path of the directory holding the
            *.mol2 files; type: string
        :return: ligating atom symbols and the number of entries in which each
            one occurs; type: dict
        """
        # Redefine self.entry_reader: read the entries from the mol2 files.
        mol2_pattern = os.path.join(path_mol2_file_dir, '*.mol2')
        self.entry_reader = [
            io.EntryReader(mol2_path)[0] for mol2_path in glob.glob(mol2_pattern)
        ]

        # Build the atom query, here Na/Mg/Cr/.../Sr.
        s = search.QuerySubstructure()
        s.add_atom(QueryAtom(query_atom))

        # Ligating element symbol -> number of entries it was seen in.
        dict_ligating_atom_statistics = {}
        pbar = tqdm(self.entry_reader)
        for entry in pbar:
            # Element symbols ligating the metal in this entry.
            set_ligating_atom = set()
            for atom in entry.crystal.molecule.atoms:
                # Only atoms matching the metal query contribute neighbours;
                # an atom without neighbours simply adds nothing.
                if s.match_atom(atom):
                    for neighbour in atom.neighbours:
                        set_ligating_atom.add(neighbour.atomic_symbol)

            # dict.get supplies the zero for a first sighting; no exception
            # handler is needed, so interrupts are no longer swallowed.
            for element in set_ligating_atom:
                dict_ligating_atom_statistics[element] = (
                    dict_ligating_atom_statistics.get(element, 0) + 1)
            pbar.set_description('正在统计配位原子:')

        return dict_ligating_atom_statistics
Exemple #7
0
def get_smile_from_CSD(dataframe):
    """Fill a ``smile`` column with the CSD SMILES of each row's identifier.

    The identifier is read from the first column of every row, looked up in
    the CSD, and the molecule's SMILES is stored in the ``smile`` column of
    that same row. The identifier and the SMILES are printed as the rows are
    processed.

    Args:
        dataframe: pandas DataFrame whose first column holds CSD identifiers.
            It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    from ccdc import io
    csd_reader = io.EntryReader('CSD')
    for position, row_label in enumerate(dataframe.index):
        filename = dataframe.iloc[position, 0]
        print(filename)
        smile = csd_reader.molecule(filename).smiles
        print(smile)
        # Write through the row's index label: the row was read by position,
        # and using the integer position as a label only hits the same row
        # when the frame has a default 0..n-1 index.
        dataframe.at[row_label, "smile"] = smile
    return dataframe
Exemple #8
0
    def __get_crystal(self):
        """Return the CSD entries, optionally limited by ``self.crystal_number``.

        Returns:
            The CSD ``io.EntryReader`` itself when ``self.crystal_number`` is
            ``None``; otherwise a list holding at most
            ``self.crystal_number`` entries taken from the start of the
            reader.
        """
        entry_reader = io.EntryReader('CSD')
        if self.crystal_number is None:
            return entry_reader

        list_entry = []
        for entry in entry_reader:
            # Test the limit before appending so that exactly crystal_number
            # entries are returned (a check made after appending lets one
            # extra entry through).
            if len(list_entry) >= self.crystal_number:
                break
            list_entry.append(entry)
        return list_entry
Exemple #9
0
def main():
    """Look up the chemical formula of every reference entry and pickle them.

    The reference pickle is loaded, ``get_chemical_formula`` is called for
    each of its keys with a shared CSD reader, and the resulting
    ``{database_id: formula}`` dictionary is written to
    ``<timestamp>-get_chemical_formulas.pkl`` in the working directory.
    """
    # oxidation_parse_dict = load_pickle(
    #   "/home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/oxidation_state_book/data/20190820-173457-csd_ox_parse_output.pkl"
    #)
    reference_pickle = '/home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/mine_csd/20190921-142007-csd_ox_parse_output_reference.pkl'
    oxidation_reference_dict = load_pickle(reference_pickle)

    database_ids = list(oxidation_reference_dict.keys())
    csd_reader = io.EntryReader('CSD')
    formula_dicts = {
        database_id: get_chemical_formula(csd_reader, database_id)
        for database_id in database_ids
    }

    # The timestamp is taken after all look-ups have finished.
    pickle_path = '-'.join(
        (time.strftime('%Y%m%d-%H%M%S'), 'get_chemical_formulas')) + '.pkl'
    with open(pickle_path, 'wb') as filehandle:
        pickle.dump(formula_dicts, filehandle)
Exemple #10
0
    def __init__(self):
        """Start with an empty search: no hits, no query, no cell parameters
        and no filters; open the CSD and bind the search helper functions."""
        # Hit results from search
        self.searchHits = []

        # Config for search
        self.overallQuery = None
        # NOTE(review): there are no call parentheses, so this binds
        # crystal.Crystal itself rather than the result of calling it; the
        # lattice_centring assignment below therefore sets the attribute on
        # that shared object -- confirm this is intended.
        self.searchCrystal = crystal.Crystal

        self.entryReader = io.EntryReader('CSD')
        # TODO: tmp just set default centring
        default_centring = ChemistryLib.Spacegroup_centring_text().text(1)
        self.searchCrystal.lattice_centring = default_centring

        # Lattice params (crystal.CellAngles() / crystal.CellLengths() are
        # not created yet) and their tolerances
        self.cellAngles = self.cellLengths = None
        self.angTol = self.lengthTol = None

        # Filtering after search
        self.refcode = self.ccdcNumber = None
        self.atomicElements = []

        # Functional assignment
        self._unitCellQuery = unitCellQuery
        self._searchCellVals = searchCellVals
        #self.cifQueryPath = cellLib.extractCrystalCif
        self._strDetails = query

        # Filtering that can be performed on the hits
        self.filterOnSpacegroup = None
Exemple #11
0
from ccdc import io
import numpy as np
import pandas as pd


def activity(identifier):
    """Return 0 when ``identifier`` contains "ZINC" (case-sensitive), else 1."""
    return 0 if "ZINC" in identifier else 1


# Scored molecules; the reader is iterated once per list built below.
entries = io.EntryReader("hotspot_scored.sdf")

# Identifier and activity label (see activity()) of every entry.
idents = [scored.identifier for scored in entries]
act = [activity(scored.identifier) for scored in entries]

# Per-entry total of the "acceptor", "donor" and "apolar" attributes.
hs1 = [
    float(
        np.sum([
            float(scored.attributes[field])
            for field in ("acceptor", "donor", "apolar")
        ])) for scored in entries
]

# hs2 = [entry.attributes["hs2_total"] for entry in entries]
hs2 = [
    float(
        np.sum([
            float(entry.attributes["hs2_acceptor"]),
Exemple #12
0
def get_bibliometric_information(cif_path, ):  # pylint:disable=too-many-locals, too-many-statements, too-many-branches
    """
    Collect CSD metadata and bibliographic information for one structure.

    Assumes that the filename is the CSD-key.
    Assumes you are in the EPFL VPN.
    Uses my scopus API key.

    The CSD entry named after the upper-cased file stem is read first. If it
    carries a publication DOI, the DOI is resolved through ``Works().doi`` to
    fill title, citation count, URL, abstract, funders, pages, journal,
    authors and affiliations; when that record has no abstract, ``scopus`` is
    queried as a fallback. The abstract is finally scanned for UV/vis-,
    photo- and electronic-related keywords.

    Any exception raised along the way is logged and whatever was gathered up
    to that point is returned; fields that were not reached stay ``None``.

    Args:
        cif_path: path of the CIF file; only its stem is used.

    Returns:
        dict: one record with the keys ``deposition_number``, ``title``,
        ``csd_abbrv``, ``citations``, ``url``, ``abstract``, ``funding``,
        ``doi_ccsd``, ``doi_paper``, ``formula``, ``pages``, ``journal``,
        ``affilations``, ``year``, ``disorder_csd``, ``authors``, ``remarks``,
        ``csd_has_disorder``, ``chemical_name``, ``uv_regex_result``,
        ``photo_regex_result`` and ``electronic_regex_result``. The three
        ``*_regex_result`` values are ``None`` if the CSD entry could not be
        read, otherwise booleans.

    """

    stem = Path(cif_path).stem.upper()

    deposition_number = None
    title = None
    citations = None
    url = None
    abstract = None
    funding = None
    doi_ccsd = None
    doi_paper = None
    pages = None
    journal = None
    affilations = None
    # NOTE(review): ``year`` and ``csd_remarks`` are reported in the result
    # but nothing in this function ever fills them -- confirm the intended
    # source before populating them.
    year = None
    disorder_csd = None
    authors = None
    uv_regex_result = None
    photo_regex_result = None
    electronic_regex_result = None
    csd_remarks = None
    csd_has_disorder = None
    chemical_name = None
    formula = None

    # Keyword patterns are constants, so build them outside the guarded part.
    uv_regex = re.compile('|'.join(['uv', 'uv-vis', 'vis']), re.IGNORECASE)
    photo_regex = re.compile('|'.join(['photo', 'absorp', 'light', 'lumin']),
                             re.IGNORECASE)
    electronic_regex = re.compile('|'.join(['electronic', 'conduc']),
                                  re.IGNORECASE)

    try:  # pylint:disable=too-many-nested-blocks
        reader = io.EntryReader('CSD').entry(stem)
        disorder_csd = reader.disorder_details
        doi_paper = reader.publication.doi
        csd_has_disorder = reader.has_disorder
        doi_ccsd = reader.doi

        deposition_number = reader.ccdc_number
        chemical_name = reader.chemical_name
        formula = reader.formula

        # The CSD entry was readable: from here on "keyword not found" is
        # reported as False rather than None.
        uv_regex_result = False
        photo_regex_result = False
        electronic_regex_result = False

        if doi_paper:
            logger.info('found DOI %s', doi_paper)
            query_res = Works().doi(doi_paper)
            if query_res:

                if 'title' in query_res:
                    titles = query_res['title']
                    # An empty value is passed through unchanged.
                    title = titles[-1] if len(titles) > 0 else titles

                if 'is-referenced-by-count' in query_res:
                    citations = query_res['is-referenced-by-count']

                if 'link' in query_res:
                    links = query_res['link']
                    url = links[0]['URL'] if len(links) > 0 else links

                if 'abstract' in query_res:
                    abstract = query_res['abstract']

                if 'funder' in query_res:
                    funding = [f['name'] for f in query_res['funder']]
                else:
                    funding = None

                if 'page' in query_res:
                    pages = query_res['page']

                if 'container-title' in query_res:
                    containers = query_res['container-title']
                    journal = (containers[0]
                               if len(containers) > 0 else containers)

                if 'author' in query_res:
                    authors = [a['family'] for a in query_res['author']]
                    # The affiliations used to be collected and then thrown
                    # away, leaving this field permanently None.
                    affilations = [
                        a['affiliation'] for a in query_res['author']
                    ]

                if abstract is None:
                    search_result = scopus.search(doi_paper)
                    if len(search_result) > 0:
                        try:
                            abstract = scopus.retrieve_abstract(
                                search_result['scopus_id'].values[0]
                            )['abstract']
                        except Exception:
                            abstract = None

        if abstract:
            # bool(search) gives a real True/False; the earlier findall-based
            # check left an empty list behind when nothing matched.
            electronic_regex_result = bool(electronic_regex.search(abstract))
            uv_regex_result = bool(uv_regex.search(abstract))
            photo_regex_result = bool(photo_regex.search(abstract))

    except Exception:
        # Best effort: keep whatever was collected, but record the cause.
        logger.info('Could not retrieve CSD info', exc_info=True)

    # Built once for both the success and the failure path.
    result_dict = {
        'deposition_number': deposition_number,
        'title': title,
        'csd_abbrv': stem,
        'citations': citations,
        'url': url,
        'abstract': abstract,
        'funding': funding,
        'doi_ccsd': doi_ccsd,
        'doi_paper': doi_paper,
        'formula': formula,
        'pages': pages,
        'journal': journal,
        'affilations': affilations,
        'year': year,
        'disorder_csd': disorder_csd,
        'authors': authors,
        'remarks': csd_remarks,
        'csd_has_disorder': csd_has_disorder,
        'chemical_name': chemical_name,
        'uv_regex_result': uv_regex_result,
        'photo_regex_result': photo_regex_result,
        'electronic_regex_result': electronic_regex_result,
    }

    return result_dict
Exemple #13
0
def csd():
    """Return the module-wide reader stored in ``_csd``, opening it on first use."""
    global _csd
    if _csd is not None:
        return _csd
    # First call: create the reader and keep it for later calls.
    _csd = io.EntryReader('csd')
    return _csd
Exemple #14
0
from hotspots.pharmacophore_extension import LigandPharmacophoreModel
from ccdc import io

# Fetch the crystal of CSD entry 'IBPRAC'.
csd = io.EntryReader('CSD')
crystal = csd.crystal('IBPRAC')
# NOTE(review): the object returned by crystal.molecule is not kept; confirm
# that adding hydrogens to it is reflected in `crystal`, which is what gets
# passed to the pharmacophore model below.
crystal.molecule.add_hydrogens()
ligand_pharmacophore = LigandPharmacophoreModel()

# Restrict feature detection to these three feature definitions.
ligand_pharmacophore.feature_definitions = [
    "acceptor_projected", "donor_projected", "ring_planar_projected"
]

# Detect the features on the crystal, then run the model's PyMOL
# visualisation step (the method name is spelled this way in the call).
ligand_pharmacophore.detect_from_ligand(crystal)
ligand_pharmacophore.pymol_visulisation()
Exemple #15
0
import pandas as pd

# Show which pandas installation is being used.
print(pd.__file__)

from ccdc import io

# Look the same identifier up as an entry, a crystal and a molecule.
csd_reader = io.EntryReader('CSD')
entry_abebuf = csd_reader.entry('ABEBUF')
cryst_abebuf = csd_reader.crystal('ABEBUF')
mol_abebuf = csd_reader.molecule('ABEBUF')

print(round(mol_abebuf.molecular_weight, 3))

# sorted() works on any iterable of keys; calling .sort() on the result of
# .keys() only works where .keys() returns a list.
reader_formats = sorted(io.MoleculeReader.known_formats.keys())
for reader_format in reader_formats:
    print(reader_format)

# Readers can be indexed by position as well as by identifier.
first_molecule = csd_reader[0]
print(first_molecule.identifier)
ababub = csd_reader.entry('ABABUB')
mol = ababub.molecule
print(len(mol.atoms))
print(mol.formula)

# Identifiers of the first eleven items in the reader.
for i in range(11):
    mol = csd_reader[i]
    print(mol.identifier)

mol = csd_reader.molecule('ABEBUF')
size = len(mol.atoms)
Exemple #16
0
#CCDC imports
from ccdc import io, search, molecule
from ccdc import crystal
from ccdc.io import MoleculeReader
from ccdc.io import EntryReader
from ccdc._lib import ChemistryLib
from ccdc.search import ReducedCellSearch
from ccdc.search import SimilaritySearch
from ccdc.search import TextNumericSearch
from ccdc.diagram import DiagramGenerator

# Name of the running script without its extension; used as the logger name.
# splitext removes whatever extension is present (or none at all) instead of
# blindly cutting the last three characters.
fileName = os.path.splitext(os.path.basename(sys.argv[0]))[0]

#Configuration CSD
entryReader = io.EntryReader('CSD')

#--    LOGGER CONFIGURATION     --#
logger = logging.getLogger(fileName)
logger.setLevel(logging.DEBUG)
# create console handler (DEBUG level, same as the file handler)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh = logging.FileHandler('/tmp/spamcellsearcher.log')
fh.setLevel(logging.DEBUG)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
import glob,os,sys,time
sys.path.append("../../affinityDB/")
import database
from ccdc import io
import ccdc

# Location of the AffinityDB database file; pdb_folder is only defined here.
db_root = "/home/maksym/Projects/datasets/CSD1/"
pdb_folder = "pdbs"
afdb_file = "CSD1.db"
afdb = database.AffinityDB(os.path.join(db_root,afdb_file))
# Open table "some_table" with three string columns; the call hands back a
# queue and a stop event.
# NOTE(review): presumably rows put on out_q are written to the table --
# confirm against AffinityDB.open_table_with_queue.
out_q,stop_event = afdb.open_table_with_queue(table_name="some_table",
                                              col_names=["ccdc_id","filename","SMILES"],
                                              col_types=[str,str,str])

# Creating a CSD entry reader
csd_entry_reader = io.EntryReader('CSD')

# Create a CSD entry reader including any updates: every *.inf file found in
# the CSD directory is passed to the reader.
directory = ccdc.io.csd_directory()
csd_and_updates = glob.glob(os.path.join(directory, '*.inf'))
csd_and_updates_reader = io.EntryReader(csd_and_updates)


#for i in range(1000):
#    out_q.put(["srandom text"])

exceptions = []
i = 0
for mol in csd_entry_reader.molecules():
    start = time.time()
    try: