Exemple #1
0
def extract_kappa_descriptors(dataframe, column):
    """
    Compute PyChem kappa descriptors for every SMILES string in a dataframe
    column and cache them in ``data/df_kappa.csv``.

    When ``data/df_kappa.csv`` already exists and is readable nothing is
    computed. Otherwise each entry of ``dataframe[column]`` is parsed with
    ``Chem.MolFromSmiles``, handed to ``kappa.GetKappa`` and the collected
    results are written to the CSV file. The ``data`` directory is not
    created here.

    :param dataframe: The dataframe containing SMILES info for which Kappa
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptors are written to disk, not returned.
    """
    csv_path = 'data/df_kappa.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        # print() with one argument behaves the same on Python 2 and 3; the
        # print statement used previously is a SyntaxError on Python 3.
        print("File exists and is readable")
        return

    print("starting kappa calculation")
    columns = [
        'phi', 'kappa1', 'kappa3', 'kappa2', 'kappam1', 'kappam3',
        'kappam2'
    ]
    # NOTE(review): MolFromSmiles presumably yields None for an unparsable
    # SMILES; confirm how GetKappa copes with that.
    diction = [
        kappa.GetKappa(Chem.MolFromSmiles(smiles))
        for smiles in dataframe[column]
    ]
    df_kappa = pd.DataFrame(diction, columns=columns)
    df_kappa.to_csv(csv_path)
    print("done calculating kappa")
    return
Exemple #2
0
def get_chemopy_props_from_smilesfile(f):
    """ Generates properties from a SMILES file. The expected formatting for the
    SMILES file is as follows (without headers):

    'name_1','CCCCC'
    'name_2','Ar'

    :param f: path or file-like object handed to ``pd.read_csv``.
    :return: list with one dict per row; each dict holds ``'name'`` plus
     whatever ``generate_chemopy_props`` produced for that molecule (only
     ``'name'`` when property generation failed, which is logged).
    :raises Exception: when a KeyError escapes row processing, e.g. because
     the file does not have the two expected columns.
    """

    smilesf = pd.read_csv(f, sep=',', encoding='utf-8', header=None)

    properties = list()
    try:
        for _, row in smilesf.iterrows():
            # Column 0 is the name and column 1 the SMILES, as documented
            # above; the indices were previously swapped.
            name = str(row[0])
            mol = Chem.MolFromSmiles(row[1])

            props = {'name': name}
            try:
                props.update(generate_chemopy_props(mol))
            except Exception:
                # Best effort: keep the row with its name only, but do not
                # swallow KeyboardInterrupt/SystemExit like a bare except.
                logging.error("Properties could not be generated for: " + name)

            properties.append(props)
    except KeyError:
        raise Exception(
            "Please ensure that the input data is in the correct format. See docs for more info."
        )

    return properties
Exemple #3
0
def extract_constitution_descriptors(dataframe, column):
    """
    Compute PyChem constitution descriptors for every SMILES string in a
    dataframe column and cache them in ``data/df_constitution.csv``.

    When ``data/df_constitution.csv`` already exists and is readable nothing
    is computed. Otherwise each entry of ``dataframe[column]`` is parsed with
    ``Chem.MolFromSmiles``, handed to ``constitution.GetConstitutional`` and
    the collected results are written to the CSV file. The ``data``
    directory is not created here.

    :param dataframe: The dataframe containing SMILES info for which
     constitution descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptors are written to disk, not returned.
    """
    csv_path = 'data/df_constitution.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        # print() with one argument behaves the same on Python 2 and 3; the
        # print statement used previously is a SyntaxError on Python 3.
        print("File exists and is readable")
        return

    print("starting constitution calculation")
    columns = [
        "nphos", "ndb", "nsb", "ncoi", "ncarb", "nsulph", "ncof", "nnitro",
        "ncobr", "naro", "ndonr", "noxy", "nhet", "nhev", "nhal", "naccr",
        "nta", "ntb", "nring", "nrot", "Weight", "PC2", "PC3", "PC1",
        "PC6", "PC4", "PC5", "AWeight", "ncocl", "nhyd"
    ]
    # NOTE(review): MolFromSmiles presumably yields None for an unparsable
    # SMILES; confirm how GetConstitutional copes with that.
    diction = [
        constitution.GetConstitutional(Chem.MolFromSmiles(smiles))
        for smiles in dataframe[column]
    ]
    df_constitution = pd.DataFrame(diction, columns=columns)
    df_constitution.to_csv(csv_path)
    print("done calculating constitution")
    return
Exemple #4
0
def extract_property_descriptors(dataframe, column):
    """
    Compute PyChem molecular property descriptors for every SMILES string in
    a dataframe column and cache them in ``data/df_property.csv``.

    When ``data/df_property.csv`` already exists and is readable nothing is
    computed. Otherwise each entry of ``dataframe[column]`` is parsed with
    ``Chem.MolFromSmiles``, handed to ``mp.GetMolecularProperty`` and the
    collected results are written to the CSV file. The ``data`` directory is
    not created here.

    :param dataframe: The dataframe containing SMILES info for which property
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptors are written to disk, not returned.
    """
    csv_path = 'data/df_property.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        # print() with one argument behaves the same on Python 2 and 3; the
        # print statement used previously is a SyntaxError on Python 3.
        print("File exists and is readable")
        return

    print("starting property calculation")
    columns = ['TPSA', 'Hy', 'LogP', 'LogP2', 'UI', 'MR']
    # NOTE(review): MolFromSmiles presumably yields None for an unparsable
    # SMILES; confirm how GetMolecularProperty copes with that.
    diction = [
        mp.GetMolecularProperty(Chem.MolFromSmiles(smiles))
        for smiles in dataframe[column]
    ]
    df_property = pd.DataFrame(diction, columns=columns)
    df_property.to_csv(csv_path)
    print("done calculating property")
    return
Exemple #5
0
def extract_charge_descriptors(dataframe, column):
    """
    Compute PyChem charge descriptors for every SMILES string in a dataframe
    column and cache them in ``data/df_charge.csv``.

    When ``data/df_charge.csv`` already exists and is readable nothing is
    computed. Otherwise each entry of ``dataframe[column]`` is parsed with
    ``Chem.MolFromSmiles``, handed to ``charge.GetCharge`` and the collected
    results are written to the CSV file. The ``data`` directory is not
    created here.

    :param dataframe: The dataframe containing SMILES info for which charge
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptors are written to disk, not returned.
    """
    csv_path = 'data/df_charge.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        # print() with one argument behaves the same on Python 2 and 3; the
        # print statement used previously is a SyntaxError on Python 3.
        print("File exists and is readable")
        return

    print("starting charge calculation")
    columns = [
        'QNmin', 'QOss', 'Mpc', 'QHss', 'SPP', 'LDI', 'QCmin', 'Mac',
        'Qass', 'QNss', 'QCmax', 'QOmax', 'Tpc', 'Qmax', 'QOmin', 'Tnc',
        'QHmin', 'QCss', 'QHmax', 'QNmax', 'Rnc', 'Rpc', 'Qmin', 'Tac',
        'Mnc'
    ]
    # NOTE(review): MolFromSmiles presumably yields None for an unparsable
    # SMILES; confirm how GetCharge copes with that.
    diction = [
        charge.GetCharge(Chem.MolFromSmiles(smiles))
        for smiles in dataframe[column]
    ]
    df_charge = pd.DataFrame(diction, columns=columns)
    df_charge.to_csv(csv_path)
    print("done calculating charge")
    return
Exemple #6
0
def generate_chemopy_props_from_smiles(smiles):
    """ Generate properties from a SMILES string.

    :param smiles: SMILES string, parsed with ``Chem.MolFromSmiles``.
    :return: whatever ``generate_chemopy_props`` returns for the parsed
     molecule.
    :raises Exception: if the SMILES could not be turned into a molecule.
    """

    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        raise Exception("Please check if the SMILES is formatted correctly.")

    # RDKit signals an unparsable SMILES by returning None instead of
    # raising, so that case has to be checked explicitly.
    if mol is None:
        raise Exception("Please check if the SMILES is formatted correctly.")

    return generate_chemopy_props(mol)
def extract_moe_descriptors(dataframe, column, url):
    """
    Compute PyChem MOE-type descriptors for every SMILES string in a
    dataframe column and save them to ``../data/df_moe.csv``, unless the
    descriptor file can already be opened at ``url``.

    A URL that cannot be reached at all (no HTTP status available) is
    treated as "file does not exist", so the descriptors are computed.

    :param dataframe: The dataframe containing SMILES info for which MOE
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptors are written to disk, not returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as err:
        # The server answered with an error status; keep the "< 400" rule.
        file_exists = err.code < 400
    except urllib2.URLError:
        # No HTTP response (DNS failure, refused connection, unknown scheme).
        # Such errors carry no ``code`` attribute, so reading it used to fail
        # with AttributeError.
        file_exists = False
    else:
        status = response.code
        # Only the status is needed; release the connection right away.
        response.close()
        # Non-HTTP schemes report no status; a successful open still means
        # the file is there.
        file_exists = status is None or status < 400
    if file_exists:
        # File already exists in URL
        return

    # File does not exist in URL
    print("Starting moe calculation")
    diction = []
    columns = [
        'EstateVSA8', 'EstateVSA9', 'EstateVSA4', 'EstateVSA5',
        'EstateVSA6', 'EstateVSA7', 'EstateVSA0', 'EstateVSA1',
        'EstateVSA2', 'EstateVSA3', 'PEOEVSA13', 'PEOEVSA12', 'PEOEVSA11',
        'PEOEVSA10', 'VSAEstate0', 'VSAEstate1', 'VSAEstate2',
        'VSAEstate3', 'VSAEstate4', 'VSAEstate5', 'VSAEstate6',
        'VSAEstate7', 'VSAEstate8', 'LabuteASA', 'PEOEVSA3', 'PEOEVSA2',
        'PEOEVSA1', 'PEOEVSA0', 'PEOEVSA7', 'PEOEVSA6', 'PEOEVSA5',
        'PEOEVSA4', 'MRVSA5', 'MRVSA4', 'PEOEVSA9', 'PEOEVSA8', 'MRVSA1',
        'MRVSA0', 'MRVSA3', 'MRVSA2', 'MRVSA9', 'TPSA1', 'slogPVSA10',
        'slogPVSA11', 'MRVSA8', 'MRVSA7', 'MRVSA6', 'EstateVSA10',
        'slogPVSA2', 'slogPVSA3', 'slogPVSA0', 'slogPVSA1', 'slogPVSA6',
        'slogPVSA7', 'slogPVSA4', 'slogPVSA5', 'slogPVSA8', 'slogPVSA9',
        'VSAEstate9', 'VSAEstate10'
    ]
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: 1-based position of the current compound.
        print('moe ', i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(moe.GetMOE(mol))
    df_moe = pd.DataFrame(diction, columns=columns)
    df_moe.to_csv('../data/df_moe.csv')
    print("Done calculating moe")

    return
def extract_burden_descriptors(dataframe, column, url):
    """
    Compute PyChem burden descriptors for every SMILES string in a dataframe
    column and save them to ``../data/df_burden.csv``, unless the descriptor
    file can already be opened at ``url``.

    A URL that cannot be reached at all (no HTTP status available) is
    treated as "file does not exist", so the descriptors are computed.

    :param dataframe: The dataframe containing SMILES info for which Burden
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptors are written to disk, not returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as err:
        # The server answered with an error status; keep the "< 400" rule.
        file_exists = err.code < 400
    except urllib2.URLError:
        # No HTTP response (DNS failure, refused connection, unknown scheme).
        # Such errors carry no ``code`` attribute, so reading it used to fail
        # with AttributeError.
        file_exists = False
    else:
        status = response.code
        # Only the status is needed; release the connection right away.
        response.close()
        # Non-HTTP schemes report no status; a successful open still means
        # the file is there.
        file_exists = status is None or status < 400
    if file_exists:
        # File already exists in URL
        return

    # File does not exist in URL
    print("Starting burden calculation")
    diction = []
    columns = [
        'bcutp8', 'bcutm9', 'bcutp9', 'bcutp5', 'bcutp6', 'bcutm8',
        'bcutp1', 'bcutp2', 'bcutp3', 'bcutm7', 'bcute9', 'bcutv8',
        'bcutv9', 'bcutv6', 'bcutm6', 'bcutv4', 'bcutm4', 'bcutm3',
        'bcutm5', 'bcutm1', 'bcutv1', 'bcutv5', 'bcute8', 'bcutv2',
        'bcutm2', 'bcutp4', 'bcute3', 'bcutv14', 'bcutv15', 'bcutv16',
        'bcutv10', 'bcutv11', 'bcutv12', 'bcutv13', 'bcutp7', 'bcutp16',
        'bcutp14', 'bcutp15', 'bcutp12', 'bcutp13', 'bcutp10', 'bcutp11',
        'bcute16', 'bcute15', 'bcute14', 'bcute13', 'bcute12', 'bcute11',
        'bcute10', 'bcutv3', 'bcute7', 'bcute6', 'bcute5', 'bcute4',
        'bcutv7', 'bcute2', 'bcute1', 'bcutm16', 'bcutm15', 'bcutm14',
        'bcutm13', 'bcutm12', 'bcutm11', 'bcutm10'
    ]
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: 1-based position of the current compound.
        print("burden ", i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(bcut.GetBurden(mol))
    df_burden = pd.DataFrame(diction, columns=columns)
    df_burden.to_csv('../data/df_burden.csv')
    print("Done calculating burden")

    return
Exemple #9
0
def extract_burden_descriptors(dataframe, column):
    """
    Compute PyChem burden descriptors for every SMILES string in a dataframe
    column and cache them in ``data/df_burden.csv``.

    When ``data/df_burden.csv`` already exists and is readable nothing is
    computed. Otherwise each entry of ``dataframe[column]`` is parsed with
    ``Chem.MolFromSmiles``, handed to ``bcut.GetBurden`` and the collected
    results are written to the CSV file. The ``data`` directory is not
    created here.

    :param dataframe: The dataframe containing SMILES info for which Burden
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptors are written to disk, not returned.
    """
    csv_path = 'data/df_burden.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        # print() with one argument behaves the same on Python 2 and 3; the
        # print statement used previously is a SyntaxError on Python 3.
        print("File exists and is readable")
        return

    print("starting burden calculation")
    columns = [
        'bcutp8', 'bcutm9', 'bcutp9', 'bcutp5', 'bcutp6', 'bcutm8',
        'bcutp1', 'bcutp2', 'bcutp3', 'bcutm7', 'bcute9', 'bcutv8',
        'bcutv9', 'bcutv6', 'bcutm6', 'bcutv4', 'bcutm4', 'bcutm3',
        'bcutm5', 'bcutm1', 'bcutv1', 'bcutv5', 'bcute8', 'bcutv2',
        'bcutm2', 'bcutp4', 'bcute3', 'bcutv14', 'bcutv15', 'bcutv16',
        'bcutv10', 'bcutv11', 'bcutv12', 'bcutv13', 'bcutp7', 'bcutp16',
        'bcutp14', 'bcutp15', 'bcutp12', 'bcutp13', 'bcutp10', 'bcutp11',
        'bcute16', 'bcute15', 'bcute14', 'bcute13', 'bcute12', 'bcute11',
        'bcute10', 'bcutv3', 'bcute7', 'bcute6', 'bcute5', 'bcute4',
        'bcutv7', 'bcute2', 'bcute1', 'bcutm16', 'bcutm15', 'bcutm14',
        'bcutm13', 'bcutm12', 'bcutm11', 'bcutm10'
    ]
    diction = []
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: label and 1-based position on separate lines.
        print("burden")
        print(i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(bcut.GetBurden(mol))
    df_burden = pd.DataFrame(diction, columns=columns)
    df_burden.to_csv(csv_path)
    print("done calculating burden")
    return
def extract_con_descriptors(dataframe, column, url):
    """
    Compute PyChem connectivity descriptors for every SMILES string in a
    dataframe column and save them to ``../data/df_con.csv``, unless the
    descriptor file can already be opened at ``url``.

    A URL that cannot be reached at all (no HTTP status available) is
    treated as "file does not exist", so the descriptors are computed.

    :param dataframe: The dataframe containing SMILES info for which
     connectivity descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptors are written to disk, not returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as err:
        # The server answered with an error status; keep the "< 400" rule.
        file_exists = err.code < 400
    except urllib2.URLError:
        # No HTTP response (DNS failure, refused connection, unknown scheme).
        # Such errors carry no ``code`` attribute, so reading it used to fail
        # with AttributeError.
        file_exists = False
    else:
        status = response.code
        # Only the status is needed; release the connection right away.
        response.close()
        # Non-HTTP schemes report no status; a successful open still means
        # the file is there.
        file_exists = status is None or status < 400
    if file_exists:
        # File already exists in URL
        return

    # File does not exist in URL
    print("Starting con calculation")
    diction = []
    columns = [
        'Chi3ch', 'knotp', 'dchi3', 'dchi2', 'dchi1', 'dchi0', 'Chi5ch',
        'Chiv4', 'Chiv7', 'Chiv6', 'Chiv1', 'Chiv0', 'Chiv3', 'Chiv2',
        'Chi4c', 'dchi4', 'Chiv4pc', 'Chiv3c', 'Chiv8', 'Chi3c', 'Chi8',
        'Chi9', 'Chi2', 'Chi3', 'Chi0', 'Chi1', 'Chi6', 'Chi7', 'Chi4',
        'Chi5', 'Chiv5', 'Chiv4c', 'Chiv9', 'Chi4pc', 'knotpv', 'Chiv5ch',
        'Chiv3ch', 'Chiv10', 'Chiv6ch', 'Chi10', 'Chi4ch', 'Chiv4ch',
        'mChi1', 'Chi6ch'
    ]
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: 1-based position of the current compound.
        print("con ", i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(con.GetConnectivity(mol))
    df_con = pd.DataFrame(diction, columns=columns)
    df_con.to_csv('../data/df_con.csv')
    print("Done calculating con")

    return
def extract_geary_descriptors(dataframe, column, url):
    """
    Compute PyChem geary descriptors for every SMILES string in a dataframe
    column and save them to ``../data/df_geary.csv``, unless the descriptor
    file can already be opened at ``url``.

    A URL that cannot be reached at all (no HTTP status available) is
    treated as "file does not exist", so the descriptors are computed.

    :param dataframe: The dataframe containing SMILES info for which Geary
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptors are written to disk, not returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as err:
        # The server answered with an error status; keep the "< 400" rule.
        file_exists = err.code < 400
    except urllib2.URLError:
        # No HTTP response (DNS failure, refused connection, unknown scheme).
        # Such errors carry no ``code`` attribute, so reading it used to fail
        # with AttributeError.
        file_exists = False
    else:
        status = response.code
        # Only the status is needed; release the connection right away.
        response.close()
        # Non-HTTP schemes report no status; a successful open still means
        # the file is there.
        file_exists = status is None or status < 400
    if file_exists:
        # File already exists in URL
        return

    # File does not exist in URL
    print("Starting geary calculation")
    diction = []
    columns = [
        'GATSp8', 'GATSv3', 'GATSv2', 'GATSv1', 'GATSp6', 'GATSv7',
        'GATSv6', 'GATSv5', 'GATSv4', 'GATSe2', 'GATSe3', 'GATSv8',
        'GATSe6', 'GATSe7', 'GATSe4', 'GATSe5', 'GATSp5', 'GATSp4',
        'GATSp7', 'GATSe1', 'GATSp1', 'GATSp3', 'GATSp2', 'GATSe8',
        'GATSm2', 'GATSm3', 'GATSm1', 'GATSm6', 'GATSm7', 'GATSm4',
        'GATSm5', 'GATSm8'
    ]
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: 1-based position of the current compound.
        print('geary ', i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(geary.GetGearyAuto(mol))
    df_geary = pd.DataFrame(diction, columns=columns)
    df_geary.to_csv('../data/df_geary.csv')
    print("Done calculating geary")

    return
Exemple #12
0
def extract_moe_descriptors(dataframe, column):
    """
    Compute PyChem MOE-type descriptors for every SMILES string in a
    dataframe column and cache them in ``data/df_moe.csv``.

    When ``data/df_moe.csv`` already exists and is readable nothing is
    computed. Otherwise each entry of ``dataframe[column]`` is parsed with
    ``Chem.MolFromSmiles``, handed to ``moe.GetMOE`` and the collected
    results are written to the CSV file. The ``data`` directory is not
    created here.

    :param dataframe: The dataframe containing SMILES info for which MOE
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptors are written to disk, not returned.
    """
    csv_path = 'data/df_moe.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        # print() with one argument behaves the same on Python 2 and 3; the
        # print statement used previously is a SyntaxError on Python 3.
        print("File exists and is readable")
        return

    print("starting moe calculation")
    columns = [
        'EstateVSA8', 'EstateVSA9', 'EstateVSA4', 'EstateVSA5',
        'EstateVSA6', 'EstateVSA7', 'EstateVSA0', 'EstateVSA1',
        'EstateVSA2', 'EstateVSA3', 'PEOEVSA13', 'PEOEVSA12', 'PEOEVSA11',
        'PEOEVSA10', 'VSAEstate0', 'VSAEstate1', 'VSAEstate2',
        'VSAEstate3', 'VSAEstate4', 'VSAEstate5', 'VSAEstate6',
        'VSAEstate7', 'VSAEstate8', 'LabuteASA', 'PEOEVSA3', 'PEOEVSA2',
        'PEOEVSA1', 'PEOEVSA0', 'PEOEVSA7', 'PEOEVSA6', 'PEOEVSA5',
        'PEOEVSA4', 'MRVSA5', 'MRVSA4', 'PEOEVSA9', 'PEOEVSA8', 'MRVSA1',
        'MRVSA0', 'MRVSA3', 'MRVSA2', 'MRVSA9', 'TPSA1', 'slogPVSA10',
        'slogPVSA11', 'MRVSA8', 'MRVSA7', 'MRVSA6', 'EstateVSA10',
        'slogPVSA2', 'slogPVSA3', 'slogPVSA0', 'slogPVSA1', 'slogPVSA6',
        'slogPVSA7', 'slogPVSA4', 'slogPVSA5', 'slogPVSA8', 'slogPVSA9',
        'VSAEstate9', 'VSAEstate10'
    ]
    # NOTE(review): MolFromSmiles presumably yields None for an unparsable
    # SMILES; confirm how GetMOE copes with that.
    diction = [
        moe.GetMOE(Chem.MolFromSmiles(smiles))
        for smiles in dataframe[column]
    ]
    df_moe = pd.DataFrame(diction, columns=columns)
    df_moe.to_csv(csv_path)
    print("done calculating moe")
    return
def extract_topology_descriptors(dataframe, column, url):
    """
    Compute PyChem topology descriptors for every SMILES string in a
    dataframe column and save them to ``../data/df_topology.csv``, unless
    the descriptor file can already be opened at ``url``.

    A URL that cannot be reached at all (no HTTP status available) is
    treated as "file does not exist", so the descriptors are computed.

    :param dataframe: The dataframe containing SMILES info for which topology
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptors are written to disk, not returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as err:
        # The server answered with an error status; keep the "< 400" rule.
        file_exists = err.code < 400
    except urllib2.URLError:
        # No HTTP response (DNS failure, refused connection, unknown scheme).
        # Such errors carry no ``code`` attribute, so reading it used to fail
        # with AttributeError.
        file_exists = False
    else:
        status = response.code
        # Only the status is needed; release the connection right away.
        response.close()
        # Non-HTTP schemes report no status; a successful open still means
        # the file is there.
        file_exists = status is None or status < 400
    if file_exists:
        # File already exists in URL
        return

    # File does not exist in URL
    print("Starting topology calculation")
    diction = []
    columns = [
        'GMTIV', 'AW', 'Geto', 'DZ', 'Gravto', 'IDET', 'Sitov', 'IDE',
        'TIAC', 'Arto', 'Qindex', 'petitjeant', 'Hatov', 'diametert',
        'BertzCT', 'IVDE', 'ISIZ', 'Platt', 'ZM2', 'Getov', 'ZM1', 'J',
        'radiust', 'Tsch', 'Thara', 'W', 'MZM2', 'GMTI', 'MZM1', 'Ipc',
        'Sito', 'Tigdi', 'Pol', 'Hato', 'Xu'
    ]
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: 1-based position of the current compound.
        print("topology ", i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(topology.GetTopology(mol))
    df_topology = pd.DataFrame(diction, columns=columns)
    df_topology.to_csv('../data/df_topology.csv')
    print("Done calculating topology")

    return
def extract_constitution_descriptors(dataframe, column, url):
    """
    Compute PyChem constitution descriptors for every SMILES string in a
    dataframe column and save them to ``../data/df_constitution.csv``,
    unless the descriptor file can already be opened at ``url``.

    A URL that cannot be reached at all (no HTTP status available) is
    treated as "file does not exist", so the descriptors are computed.

    :param dataframe: The dataframe containing SMILES info for which
     constitution descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptors are written to disk, not returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as err:
        # The server answered with an error status; keep the "< 400" rule.
        file_exists = err.code < 400
    except urllib2.URLError:
        # No HTTP response (DNS failure, refused connection, unknown scheme).
        # Such errors carry no ``code`` attribute, so reading it used to fail
        # with AttributeError.
        file_exists = False
    else:
        status = response.code
        # Only the status is needed; release the connection right away.
        response.close()
        # Non-HTTP schemes report no status; a successful open still means
        # the file is there.
        file_exists = status is None or status < 400
    if file_exists:
        # File already exists in URL
        return

    # File does not exist in URL
    print("Starting constitution calculation")
    diction = []
    columns = [
        "nphos", "ndb", "nsb", "ncoi", "ncarb", "nsulph", "ncof", "nnitro",
        "ncobr", "naro", "ndonr", "noxy", "nhet", "nhev", "nhal", "naccr",
        "nta", "ntb", "nring", "nrot", "Weight", "PC2", "PC3", "PC1",
        "PC6", "PC4", "PC5", "AWeight", "ncocl", "nhyd"
    ]
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: 1-based position of the current compound.
        print("constitution ", i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(constitution.GetConstitutional(mol))
    df_constitution = pd.DataFrame(diction, columns=columns)
    df_constitution.to_csv('../data/df_constitution.csv')
    print("Done calculating constitution")

    return
def extract_charge_descriptors(dataframe, column, url):
    """
    Compute PyChem charge descriptors for every SMILES string in a dataframe
    column and save them to ``../data/df_charge.csv``, unless the descriptor
    file can already be opened at ``url``.

    A URL that cannot be reached at all (no HTTP status available) is
    treated as "file does not exist", so the descriptors are computed.

    :param dataframe: The dataframe containing SMILES info for which charge
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptors are written to disk, not returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as err:
        # The server answered with an error status; keep the "< 400" rule.
        file_exists = err.code < 400
    except urllib2.URLError:
        # No HTTP response (DNS failure, refused connection, unknown scheme).
        # Such errors carry no ``code`` attribute, so reading it used to fail
        # with AttributeError.
        file_exists = False
    else:
        status = response.code
        # Only the status is needed; release the connection right away.
        response.close()
        # Non-HTTP schemes report no status; a successful open still means
        # the file is there.
        file_exists = status is None or status < 400
    if file_exists:
        # File already exists in URL
        return

    # File does not exist in URL
    print("Starting charge calculation")
    diction = []
    columns = [
        'QNmin', 'QOss', 'Mpc', 'QHss', 'SPP', 'LDI', 'QCmin', 'Mac',
        'Qass', 'QNss', 'QCmax', 'QOmax', 'Tpc', 'Qmax', 'QOmin', 'Tnc',
        'QHmin', 'QCss', 'QHmax', 'QNmax', 'Rnc', 'Rpc', 'Qmin', 'Tac',
        'Mnc'
    ]
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: 1-based position of the current compound.
        print('charge ', i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(charge.GetCharge(mol))
    df_charge = pd.DataFrame(diction, columns=columns)
    df_charge.to_csv('../data/df_charge.csv')
    print("Done calculating charge")

    return
Exemple #16
0
def get_chemopy_props_from_smilesfile(f):
    """ Generates properties for every row of a SMILES file.

    The file is loaded with ``read_smiles_file``; column 0 of each row is
    used as the name and column 1 as the SMILES string.

    :param f: file argument passed straight to ``read_smiles_file``.
    :return: list with one dict per row; each dict holds ``'name'`` plus
     whatever ``generate_chemopy_props`` produced for that molecule (only
     ``'name'`` when property generation failed, which is logged).
    :raises Exception: when a KeyError escapes row processing, e.g. because
     an expected column is missing.
    """
    # Local import so this block does not depend on a module-level import
    # that may not be present in the file.
    import logging

    smilesf = read_smiles_file(f)
    properties = list()

    try:
        for _, row in smilesf.iterrows():
            mol = Chem.MolFromSmiles(row[1])
            props = {'name': row[0]}

            try:
                props.update(generate_chemopy_props(mol))
            except Exception:
                # Best effort: keep the row with its name only, but report
                # the failure and let KeyboardInterrupt/SystemExit through
                # instead of silently swallowing everything.
                logging.error("Properties could not be generated for: %s",
                              row[0])

            properties.append(props)
    except KeyError:
        raise Exception(
            "Please ensure that the input data is in the correct format.")

    return properties
def extract_basak_descriptors(dataframe, column, url):
    """
    Compute PyChem basak descriptors for every SMILES string in a dataframe
    column and save them to ``../data/df_basak.csv``, unless the descriptor
    file can already be opened at ``url``.

    A URL that cannot be reached at all (no HTTP status available) is
    treated as "file does not exist", so the descriptors are computed.

    :param dataframe: The dataframe containing SMILES info for which Basak
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptors are written to disk, not returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as err:
        # The server answered with an error status; keep the "< 400" rule.
        file_exists = err.code < 400
    except urllib2.URLError:
        # No HTTP response (DNS failure, refused connection, unknown scheme).
        # Such errors carry no ``code`` attribute, so reading it used to fail
        # with AttributeError.
        file_exists = False
    else:
        status = response.code
        # Only the status is needed; release the connection right away.
        response.close()
        # Non-HTTP schemes report no status; a successful open still means
        # the file is there.
        file_exists = status is None or status < 400
    if file_exists:
        # File already exists in URL
        return

    # File does not exist in URL
    print("Starting basak calculation")
    diction = []
    columns = [
        'CIC3', 'CIC6', 'SIC5', 'SIC4', 'SIC6', 'SIC1', 'SIC0', 'SIC3',
        'SIC2', 'CIC5', 'CIC2', 'CIC0', 'CIC4', 'IC3', 'IC2', 'IC1', 'IC0',
        'CIC1', 'IC6', 'IC5', 'IC4'
    ]
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: 1-based position of the current compound.
        print("basak ", i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(basak.Getbasak(mol))
    df_basak = pd.DataFrame(diction, columns=columns)
    df_basak.to_csv('../data/df_basak.csv')
    print("Done calculating basak")

    return
Exemple #18
0
def extract_geary_descriptors(dataframe, column):
    """
    Compute PyChem geary descriptors for every SMILES string in a dataframe
    column and cache them in ``data/df_geary.csv``.

    When ``data/df_geary.csv`` already exists and is readable nothing is
    computed. Otherwise each entry of ``dataframe[column]`` is parsed with
    ``Chem.MolFromSmiles``, handed to ``geary.GetGearyAuto`` and the
    collected results are written to the CSV file. The ``data`` directory is
    not created here.

    :param dataframe: The dataframe containing SMILES info for which Geary
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptors are written to disk, not returned.
    """
    csv_path = 'data/df_geary.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        # print() with one argument behaves the same on Python 2 and 3; the
        # print statement used previously is a SyntaxError on Python 3.
        print("File exists and is readable")
        return

    print("starting geary calculation")
    columns = [
        'GATSp8', 'GATSv3', 'GATSv2', 'GATSv1', 'GATSp6', 'GATSv7',
        'GATSv6', 'GATSv5', 'GATSv4', 'GATSe2', 'GATSe3', 'GATSv8',
        'GATSe6', 'GATSe7', 'GATSe4', 'GATSe5', 'GATSp5', 'GATSp4',
        'GATSp7', 'GATSe1', 'GATSp1', 'GATSp3', 'GATSp2', 'GATSe8',
        'GATSm2', 'GATSm3', 'GATSm1', 'GATSm6', 'GATSm7', 'GATSm4',
        'GATSm5', 'GATSm8'
    ]
    diction = []
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: label and 1-based position on separate lines.
        print("geary")
        print(i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(geary.GetGearyAuto(mol))
    df_geary = pd.DataFrame(diction, columns=columns)
    df_geary.to_csv(csv_path)
    print("done calculating geary")
    return
def extract_kappa_descriptors(dataframe, column, url):
    """
    Compute PyChem kappa descriptors for every SMILES string in a dataframe
    column and save them to ``../data/df_kappa.csv``, unless the descriptor
    file can already be opened at ``url``.

    A URL that cannot be reached at all (no HTTP status available) is
    treated as "file does not exist", so the descriptors are computed.

    :param dataframe: The dataframe containing SMILES info for which Kappa
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptors are written to disk, not returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as err:
        # The server answered with an error status; keep the "< 400" rule.
        file_exists = err.code < 400
    except urllib2.URLError:
        # No HTTP response (DNS failure, refused connection, unknown scheme).
        # Such errors carry no ``code`` attribute, so reading it used to fail
        # with AttributeError.
        file_exists = False
    else:
        status = response.code
        # Only the status is needed; release the connection right away.
        response.close()
        # Non-HTTP schemes report no status; a successful open still means
        # the file is there.
        file_exists = status is None or status < 400
    if file_exists:
        # File already exists in URL
        return

    # File does not exist in URL
    print("Starting kappa calculation")
    diction = []
    columns = [
        'phi', 'kappa1', 'kappa3', 'kappa2', 'kappam1', 'kappam3',
        'kappam2'
    ]
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: 1-based position of the current compound.
        print("kappa ", i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(kappa.GetKappa(mol))
    df_kappa = pd.DataFrame(diction, columns=columns)
    df_kappa.to_csv('../data/df_kappa.csv')
    print("Done calculating kappa")

    return
Exemple #20
0
def extract_topology_descriptors(dataframe, column):
    """
    Compute PyChem topology descriptors for every SMILES string in a
    dataframe column and cache them in ``data/df_topology.csv``.

    When ``data/df_topology.csv`` already exists and is readable nothing is
    computed. Otherwise each entry of ``dataframe[column]`` is parsed with
    ``Chem.MolFromSmiles``, handed to ``topology.GetTopology`` and the
    collected results are written to the CSV file. The ``data`` directory is
    not created here.

    :param dataframe: The dataframe containing SMILES info for which topology
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptors are written to disk, not returned.
    """
    csv_path = 'data/df_topology.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        # print() with one argument behaves the same on Python 2 and 3; the
        # print statement used previously is a SyntaxError on Python 3.
        print("File exists and is readable")
        return

    print("starting topology calculation")
    columns = [
        'GMTIV', 'AW', 'Geto', 'DZ', 'Gravto', 'IDET', 'Sitov', 'IDE',
        'TIAC', 'Arto', 'Qindex', 'petitjeant', 'Hatov', 'diametert',
        'BertzCT', 'IVDE', 'ISIZ', 'Platt', 'ZM2', 'Getov', 'ZM1', 'J',
        'radiust', 'Tsch', 'Thara', 'W', 'MZM2', 'GMTI', 'MZM1', 'Ipc',
        'Sito', 'Tigdi', 'Pol', 'Hato', 'Xu'
    ]
    diction = []
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: label and 1-based position on separate lines.
        print("topology")
        print(i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(topology.GetTopology(mol))
    df_topology = pd.DataFrame(diction, columns=columns)
    df_topology.to_csv(csv_path)
    print("done calculating topology")
    return
def extract_property_descriptors(dataframe, column, url):
    """
    Compute PyChem molecular property descriptors for every SMILES string in
    a dataframe column and save them to ``../data/df_property.csv``, unless
    the descriptor file can already be opened at ``url``.

    A URL that cannot be reached at all (no HTTP status available) is
    treated as "file does not exist", so the descriptors are computed.

    :param dataframe: The dataframe containing SMILES info for which property
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptors are written to disk, not returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as err:
        # The server answered with an error status; keep the "< 400" rule.
        file_exists = err.code < 400
    except urllib2.URLError:
        # No HTTP response (DNS failure, refused connection, unknown scheme).
        # Such errors carry no ``code`` attribute, so reading it used to fail
        # with AttributeError.
        file_exists = False
    else:
        status = response.code
        # Only the status is needed; release the connection right away.
        response.close()
        # Non-HTTP schemes report no status; a successful open still means
        # the file is there.
        file_exists = status is None or status < 400
    if file_exists:
        # File already exists in URL
        return

    # File does not exist in URL
    print("Starting property calculation")
    diction = []
    columns = ['TPSA', 'Hy', 'LogP', 'LogP2', 'UI', 'MR']
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator: 1-based position of the current compound.
        print('property ', i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(mp.GetMolecularProperty(mol))
    df_property = pd.DataFrame(diction, columns=columns)
    df_property.to_csv('../data/df_property.csv')
    print("Done calculating property")

    return
Exemple #22
0
def extract_basak_descriptors(dataframe, column):
    """
    Compute PyChem basak descriptors for every SMILES string in a dataframe
    column and cache them in ``data/df_basak.csv``.

    When ``data/df_basak.csv`` already exists and is readable nothing is
    computed. Otherwise each entry of ``dataframe[column]`` is parsed with
    ``Chem.MolFromSmiles``, handed to ``basak.Getbasak`` and the collected
    results are written to the CSV file. The ``data`` directory is not
    created here.

    :param dataframe: The dataframe containing SMILES info for which Basak
     descriptors info must be evaluated.
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptors are written to disk, not returned.
    """
    csv_path = 'data/df_basak.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        print("File exists and is readable")
        return

    print("starting basak calculation")
    columns = [
        'CIC3', 'CIC6', 'SIC5', 'SIC4', 'SIC6', 'SIC1', 'SIC0', 'SIC3',
        'SIC2', 'CIC5', 'CIC2', 'CIC0', 'CIC4', 'IC3', 'IC2', 'IC1', 'IC0',
        'CIC1', 'IC6', 'IC5', 'IC4'
    ]
    diction = []
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress indicator. print() with one argument behaves the same on
        # Python 2 and 3; the print statement used previously is a
        # SyntaxError on Python 3.
        print(i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(basak.Getbasak(mol))
    df_basak = pd.DataFrame(diction, columns=columns)
    df_basak.to_csv(csv_path)
    print("done calculating basak")
    return
Exemple #23
0
def extract_con_descriptors(dataframe, column):
    """
    Compute molecular connectivity descriptors with the PyChem package for
    every SMILES string in a dataframe column and save them as a CSV file.

    If ``data/df_con.csv`` already exists and is readable, nothing is
    recomputed and the function returns immediately.

    :param dataframe: The dataframe containing SMILES info for which
     connectivity descriptors info must be evaluated
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptor table is written to ``data/df_con.csv``
     instead of being returned.
    """
    csv_path = 'data/df_con.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        # Function-call form so the module parses on Python 3 as well as
        # Python 2; the printed text is unchanged.
        print("File exists and is readable")
        return

    print("starting con calculation")
    # Column order is preserved exactly; it fixes the layout of the CSV.
    columns = [
        'Chi3ch', 'knotp', 'dchi3', 'dchi2', 'dchi1', 'dchi0',
        'Chi5ch', 'Chiv4', 'Chiv7', 'Chiv6', 'Chiv1', 'Chiv0', 'Chiv3',
        'Chiv2', 'Chi4c', 'dchi4', 'Chiv4pc', 'Chiv3c', 'Chiv8',
        'Chi3c', 'Chi8', 'Chi9', 'Chi2', 'Chi3', 'Chi0', 'Chi1',
        'Chi6', 'Chi7', 'Chi4', 'Chi5', 'Chiv5', 'Chiv4c', 'Chiv9',
        'Chi4pc', 'knotpv', 'Chiv5ch', 'Chiv3ch', 'Chiv10', 'Chiv6ch',
        'Chi10', 'Chi4ch', 'Chiv4ch', 'mChi1', 'Chi6ch'
    ]
    # One descriptor dict per compound, in dataframe order.
    diction = [
        con.GetConnectivity(Chem.MolFromSmiles(smiles))
        for smiles in dataframe[column]
    ]
    df_con = pd.DataFrame(diction, columns=columns)
    df_con.to_csv(csv_path)
    print("done calculating con")
    return
Exemple #24
0
import sys
import time
import rdkit
from pychem.pychem import Chem
from pychem import constitution
from pychem import pychem
from pychem.pychem import PyChem2d
# The SMILES string to process is the first command-line argument.
smi = sys.argv[1]
mol = Chem.MolFromSmiles(smi)
# NOTE(review): `drug` is not used by the output written below; it is kept so
# the script performs the same calls as before -- confirm whether it can go.
drug = pychem.PyChem2d()
drug.ReadMolFromSmile(smi)
vals = constitution.GetConstitutional(mol)

# Write one value per line to carb_features.txt, in this order: the 'nrot'
# and heavy-atom count, then the 'ndonr' entry and the CalcFractionCSP3 value.
# f.write("%s\n") produces the same text as the Python 2 `print >> f, value`
# statement, but also works on Python 3, where the chevron form raises
# TypeError at runtime.
with open("carb_features.txt", "w") as f:
    for value in (
            vals['nrot'],
            mol.GetNumHeavyAtoms(),
            vals['ndonr'],
            rdkit.Chem.rdMolDescriptors.CalcFractionCSP3(mol),
    ):
        f.write("%s\n" % (value,))
Exemple #25
0
def extract_estate_descriptors(dataframe, column):
    """
    Compute molecular E-state descriptors with the PyChem package for every
    SMILES string in a dataframe column and save them as a CSV file.

    If ``data/df_estate.csv`` already exists and is readable, nothing is
    recomputed and the function returns immediately.

    :param dataframe: The dataframe containing SMILES info for which E-State
     descriptors info must be evaluated
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :return: None. The descriptor table is written to
     ``data/df_estate.csv`` instead of being returned.
    """
    csv_path = 'data/df_estate.csv'
    if os.path.exists(csv_path) and os.access(csv_path, os.R_OK):
        # Function-call form so the module parses on Python 3 as well as
        # Python 2; the printed text is unchanged.
        print("File exists and is readable")
        return

    print("starting estate calculation")
    # Column order is preserved exactly; it fixes the layout of the CSV.
    columns = [
        'Smax38', 'Smax39', 'Smax34', 'Smax35', 'Smax36', 'Smax37',
        'Smax30', 'Smax31', 'Smax32', 'Smax33', 'S57', 'S56', 'S55',
        'S54', 'S53', 'S52', 'S51', 'S50', 'S32', 'S59', 'S58',
        'Smax8', 'Smax9', 'Smax0', 'Smax1', 'Smax2', 'Smax3', 'Smax4',
        'Smax5', 'Smax6', 'Smax7', 'Smax29', 'Smax28', 'Smax23',
        'Smax22', 'Smax21', 'Smax20', 'Smax27', 'Smax26', 'Smax25',
        'Smax24', 'S44', 'S45', 'S46', 'S47', 'S40', 'S41', 'S42',
        'S43', 'S48', 'S49', 'Smin78', 'Smin72', 'Smin73', 'Smin70',
        'Smin71', 'Smin76', 'Smin77', 'Smin74', 'Smin75', 'S79', 'S78',
        'Smin', 'Smax58', 'Smax59', 'Smax56', 'Smax57', 'S73', 'S72',
        'S75', 'Smax53', 'S77', 'S76', 'Save', 'Smin69', 'Smin68',
        'Shal', 'Smin61', 'Smin32', 'Smin63', 'Smin62', 'Smin65',
        'Smin64', 'Smin67', 'Smin66', 'DS', 'Smin41', 'Smin40',
        'Smax49', 'Smax48', 'S68', 'S69', 'Smax45', 'Smax44', 'Smax47',
        'S65', 'Smax41', 'Smax40', 'Smax43', 'Smax42', 'Smin54',
        'Smax52', 'Smin56', 'Smin57', 'Smin50', 'Smin51', 'Smin52',
        'Smin53', 'Smin58', 'Smin59', 'Shev', 'Shet', 'Scar', 'Smin49',
        'S9', 'S8', 'S3', 'S2', 'S1', 'Smin55', 'S7', 'S6', 'S5', 'S4',
        'Smax78', 'S66', 'S67', 'Smax70', 'Smax71', 'Smax72', 'Smax73',
        'Smax74', 'Smax75', 'Smax76', 'Smax77', 'Smin43', 'Smin42',
        'S19', 'S18', 'Smin47', 'Smin46', 'Smin45', 'Smin44', 'S13',
        'S12', 'S11', 'S10', 'S17', 'S16', 'S15', 'S14', 'S60', 'S64',
        'Smin16', 'S61', 'Smax67', 'Smax66', 'Smax65', 'Smax64',
        'Smax63', 'Smax62', 'Smax61', 'Smax60', 'Smax69', 'Smax68',
        'Smin60', 'Smax', 'Smin36', 'Smin37', 'Smin34', 'Smin35',
        'S62', 'Smin33', 'Smin30', 'Smin31', 'Smin38', 'Smin39',
        'Smax12', 'Smax13', 'Smax10', 'Smax11', 'Smax16', 'Smax17',
        'Smax14', 'Smax15', 'Smin20', 'Smax18', 'Smax19', 'S71', 'S63',
        'S70', 'Smax54', 'Smax55', 'S39', 'S38', 'S35', 'S34', 'S37',
        'S36', 'S31', 'S30', 'S33', 'S74', 'Smin25', 'Smin24',
        'Smin27', 'Smin26', 'Smin21', 'Smax50', 'Smin23', 'Smin22',
        'Smax51', 'Smin29', 'Smin28', 'Smin6', 'Smin7', 'Smin4',
        'Smin5', 'Smin2', 'Smin3', 'Smin0', 'Smin1', 'Smin48', 'Smin8',
        'Smin9', 'S22', 'S23', 'S20', 'S21', 'S26', 'S27', 'S24',
        'S25', 'S28', 'S29', 'Smin10', 'Smin11', 'Smin12', 'Smin13',
        'Smin14', 'Smin15', 'Smax46', 'Smin17', 'Smin18', 'Smin19'
    ]
    # One descriptor dict per compound, in dataframe order.
    diction = [
        estate.GetEstate(Chem.MolFromSmiles(smiles))
        for smiles in dataframe[column]
    ]
    df_estate = pd.DataFrame(diction, columns=columns)
    df_estate.to_csv(csv_path)
    print("done calculating estate")
    return
def extract_estate_descriptors(dataframe, column, url):
    """
    Compute molecular E-state descriptors with the PyChem package for every
    SMILES string in a dataframe column and save them as a CSV file, unless
    the descriptor file can already be fetched from ``url``.

    The URL is opened first. If that yields an HTTP status below 400 the
    file is considered to exist and nothing is computed. Otherwise (HTTP
    error status, or the URL could not be reached at all) the descriptors
    are calculated and written to ``../data/df_estate.csv``.

    :param dataframe: The dataframe containing SMILES info for which E-State
     descriptors info must be evaluated
    :param column: the column containing SMILES info for the compounds
     in the dataframe.
    :param url: URL to descriptor file in S3 bucket.
    :return: None. The descriptor table is written to
     ``../data/df_estate.csv`` instead of being returned.
    """
    try:
        # Check if file exists in this url
        response = urllib2.urlopen(url)
    except urllib2.URLError as e:
        # HTTPError (a URLError subclass) carries the HTTP status in `.code`.
        # A plain URLError (e.g. DNS failure, refused connection) has no
        # `.code`, so reading it unconditionally raised AttributeError.
        status = getattr(e, 'code', None)
    else:
        status = response.code
        # The body is not needed; release the connection.
        response.close()

    # `is not None` matters: on Python 2, `None < 400` evaluates to True.
    if status is not None and status < 400:
        # File already exists in URL
        return

    # File does not exist in URL (or its existence could not be confirmed)
    print("Starting estate calculation")
    diction = []
    # Column order is preserved exactly; it fixes the layout of the CSV.
    columns = [
        'Smax38', 'Smax39', 'Smax34', 'Smax35', 'Smax36', 'Smax37',
        'Smax30', 'Smax31', 'Smax32', 'Smax33', 'S57', 'S56', 'S55', 'S54',
        'S53', 'S52', 'S51', 'S50', 'S32', 'S59', 'S58', 'Smax8', 'Smax9',
        'Smax0', 'Smax1', 'Smax2', 'Smax3', 'Smax4', 'Smax5', 'Smax6',
        'Smax7', 'Smax29', 'Smax28', 'Smax23', 'Smax22', 'Smax21',
        'Smax20', 'Smax27', 'Smax26', 'Smax25', 'Smax24', 'S44', 'S45',
        'S46', 'S47', 'S40', 'S41', 'S42', 'S43', 'S48', 'S49', 'Smin78',
        'Smin72', 'Smin73', 'Smin70', 'Smin71', 'Smin76', 'Smin77',
        'Smin74', 'Smin75', 'S79', 'S78', 'Smin', 'Smax58', 'Smax59',
        'Smax56', 'Smax57', 'S73', 'S72', 'S75', 'Smax53', 'S77', 'S76',
        'Save', 'Smin69', 'Smin68', 'Shal', 'Smin61', 'Smin32', 'Smin63',
        'Smin62', 'Smin65', 'Smin64', 'Smin67', 'Smin66', 'DS', 'Smin41',
        'Smin40', 'Smax49', 'Smax48', 'S68', 'S69', 'Smax45', 'Smax44',
        'Smax47', 'S65', 'Smax41', 'Smax40', 'Smax43', 'Smax42', 'Smin54',
        'Smax52', 'Smin56', 'Smin57', 'Smin50', 'Smin51', 'Smin52',
        'Smin53', 'Smin58', 'Smin59', 'Shev', 'Shet', 'Scar', 'Smin49',
        'S9', 'S8', 'S3', 'S2', 'S1', 'Smin55', 'S7', 'S6', 'S5', 'S4',
        'Smax78', 'S66', 'S67', 'Smax70', 'Smax71', 'Smax72', 'Smax73',
        'Smax74', 'Smax75', 'Smax76', 'Smax77', 'Smin43', 'Smin42', 'S19',
        'S18', 'Smin47', 'Smin46', 'Smin45', 'Smin44', 'S13', 'S12', 'S11',
        'S10', 'S17', 'S16', 'S15', 'S14', 'S60', 'S64', 'Smin16', 'S61',
        'Smax67', 'Smax66', 'Smax65', 'Smax64', 'Smax63', 'Smax62',
        'Smax61', 'Smax60', 'Smax69', 'Smax68', 'Smin60', 'Smax', 'Smin36',
        'Smin37', 'Smin34', 'Smin35', 'S62', 'Smin33', 'Smin30', 'Smin31',
        'Smin38', 'Smin39', 'Smax12', 'Smax13', 'Smax10', 'Smax11',
        'Smax16', 'Smax17', 'Smax14', 'Smax15', 'Smin20', 'Smax18',
        'Smax19', 'S71', 'S63', 'S70', 'Smax54', 'Smax55', 'S39', 'S38',
        'S35', 'S34', 'S37', 'S36', 'S31', 'S30', 'S33', 'S74', 'Smin25',
        'Smin24', 'Smin27', 'Smin26', 'Smin21', 'Smax50', 'Smin23',
        'Smin22', 'Smax51', 'Smin29', 'Smin28', 'Smin6', 'Smin7', 'Smin4',
        'Smin5', 'Smin2', 'Smin3', 'Smin0', 'Smin1', 'Smin48', 'Smin8',
        'Smin9', 'S22', 'S23', 'S20', 'S21', 'S26', 'S27', 'S24', 'S25',
        'S28', 'S29', 'Smin10', 'Smin11', 'Smin12', 'Smin13', 'Smin14',
        'Smin15', 'Smax46', 'Smin17', 'Smin18', 'Smin19'
    ]
    for i, smiles in enumerate(dataframe[column], 1):
        # Progress output, one line per compound.
        print("estate ", i)
        mol = Chem.MolFromSmiles(smiles)
        diction.append(estate.GetEstate(mol))
    df_estate = pd.DataFrame(diction, columns=columns)
    df_estate.to_csv('../data/df_estate.csv')
    print("Done calculating estate")

    return