Beispiel #1
0
def main(FilePath, SqliteFile, TableName, IDcolumn='structure_id'):
    """Create the inchi-key table and fill it from an inchi-key file.

    Keyword Arguments:
        FilePath -- Path to the inchi-key file handed to InsertIntoSQL
        SqliteFile -- Path to the SQlite database
        TableName -- Name of the table in the database to create/edit
        IDcolumn -- Name of the column that contains the structure ids
    Returns:
        True, once the table has been filled and the connection closed
    """
    # Columns that hold the different flavours of inchi-key
    inchi_columns = [
        'inchi_key',
        'inchi_key_molconvert',
        'inchi_key_molconvert_neutral',
    ]

    connection, cursor = Sql.Connect(SqliteFile)
    Sql.CreateNewTable(cursor,
                       TableName,
                       IDcolumn,
                       ColumnNames=inchi_columns)
    InsertIntoSQL(cursor, FilePath, TableName, IDcolumn)
    Sql.Close(connection)
    return True
Beispiel #2
0
def CombineInchiKeys(cursor, table_name, id_column='structure_id'):
    """Combine the 2 separate inchi-keys in the SQL database into one.

    For every row the combined key is built as
    ``inchi_key1 + '-' + inchi_key2[:8] + 'SA-' + <last character of
    inchi_key_molconvert>``, i.e. the charge character is copied from the
    molconvert inchi-key. The result is written to the 'inchi_key' column
    of the same row.

    Keyword Arguments:
        cursor -- SQL cursor object to interact with the database
        table_name -- Name of the table in the database to edit
        id_column -- Name of the column that contains the structure ids
    """
    # The id column is taken from the argument (it used to be hard-coded to
    # 'structure_id', which disagreed with the WHERE clause built below).
    query = (f'SELECT {id_column}, inchi_key1, inchi_key2, '
             f'inchi_key_molconvert from {table_name}')

    # All rows are collected before any update is issued, because the same
    # cursor is handed to Sql.UpdateTable afterwards (presumably it executes
    # on that cursor, which would interrupt a running iteration).
    Structures = {
        Row[0]: Row[1] + '-' + Row[2][:8] + 'SA-' + Row[3][-1]
        for Row in cursor.execute(query)
    }

    for key, value in Structures.items():
        # Adding the combined inchi-keys with the 'hopefully' correct charge
        WhereString = f"{id_column} = '{key}'"
        Sql.UpdateTable(cursor, table_name, {'inchi_key': value}, WhereString)
    return None
Beispiel #3
0
def InsertIntoSQL(cursor, file_path, table_name, id_column='structure_id'):
    """Parse a whitespace separated inchi-key file into a SQlite database.

    Every line starts with the structure id. Lines with 3 fields carry one
    (neutral) inchi-key as last field, which is used for both columns.
    Lines with 5 fields carry the neutral key as second to last and the
    charged key as last field. Blank lines are ignored; lines with any
    other number of fields are reported and skipped, so that no row is
    updated with keys that belong to a different line.

    Keyword Arguments:
        cursor -- SQL cursor object to interact with the database
        file_path -- Path to the inchi-key file
        table_name -- Name of the table in the database to edit
        id_column -- Name of the column that contains the structure ids
    """
    with open(file_path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank line, nothing to parse

            if len(fields) == 3:
                # only a neutral smile and inchi-key
                inchi_key_molconvert = fields[-1]
                inchi_key_molconvert_neutral = inchi_key_molconvert
            elif len(fields) == 5:
                # a charged and a neutral smile and inchi-key
                inchi_key_molconvert = fields[-1]
                inchi_key_molconvert_neutral = fields[-2]
            else:
                print("Encoutered a problem with the inchikey file")
                continue

            # The first 9 characters are dropped (presumably the
            # 'InChIKey=' prefix written by molconvert -- TODO confirm).
            EX = {
                'inchi_key_molconvert': inchi_key_molconvert[9:],
                'inchi_key_molconvert_neutral':
                inchi_key_molconvert_neutral[9:],
            }

            # Adding the molconvert inchi-key to the database
            WhereString = f"{id_column} = '{fields[0]}'"
            Sql.UpdateTable(cursor, table_name, EX, WhereString)
    return None
Beispiel #4
0
def Retrieve_NP_IDs(sqlite_file, table_name, id_column="structure_id"):
    """Collect the contents of the id column of a table in a list.

    Keyword Arguments:
        sqlite_file -- Path to the SQlite database
        table_name -- Name of the table in the database to read
        id_column -- Name of the column that contains the IDs we want
    Returns:
        List with one entry per selected row, exactly as yielded by the
        cursor (NOTE(review): with a sqlite3 cursor these would be
        1-tuples such as ('NP1',), not bare id strings -- confirm against
        Sql.Connect).
    """
    connection, cursor = Sql.Connect(sqlite_file)

    select_ids = f'SELECT {id_column} from {table_name}'
    id_rows = [row for row in cursor.execute(select_ids)]

    # Done with the database file
    Sql.Close(connection)
    return id_rows
Beispiel #5
0
def _BuildMibigQueryDict(BGC, GP, compound, CompoundInJson, CompoundDict):
    """Return the table row (dict) of one MIBiG compound, or None on error.

    Keyword Arguments:
        BGC -- BGC accession that is being processed, used in error output
        GP -- The 'general_params' dictionary of the BGC json
        compound -- Dictionary of a single compound of that BGC
        CompoundInJson -- Index of the compound within the BGC
        CompoundDict -- None, or dictionary with
                        '<mibig_accession>_<compound name>' as keys and
                        the structure to use as values
    """
    # If one of the jsons is formatted incorrectly or is missing a crucial
    # value we do not want the whole script to error.
    try:
        MIBIGaccession = GP['mibig_accession']
        CompoundName = compound.get('compound', 'NA')
        if CompoundDict is None:
            ChemStruct = compound.get('chem_struct', 'NA')
        else:
            ChemStruct = CompoundDict.get(MIBIGaccession + '_' + CompoundName,
                                          'NA')
    except Exception as e:
        print("Error occured at the first step of:")
        print(BGC)
        print(e)
        return None

    try:
        # A fresh dictionary per compound: values of a previous compound
        # can never leak into this row.
        QueryDict = {
            'compound_id': MIBIGaccession + "_" + str(CompoundInJson),
            'compound_name': CompoundName,
            'biosyn_class': JoinList(GP['biosyn_class']),  # List
            'biosyn_subclass': JoinList(GetSubclass(GP)),  # List
            'chem_synonyms': JoinList(compound.get('chem_synonyms',
                                                   'NA')),  # List
            'chem_target': compound.get('chem_target', 'NA'),
            'molecular_formula': compound.get('molecular_formula', 'NA'),
            'mol_mass': compound.get('mol_mass', 'NA'),
            'chem_struct': ChemStruct,
            'pubchem_id': compound.get('pubchem_id', "NA"),
            'chemspider_id': compound.get('chemspider_id', "NA"),
            'chebi_id': compound.get('chebi_id', "NA"),
            'chembl_id': compound.get('chembl_id', "NA"),
            'chem_act': JoinList(compound.get('chem_act', 'NA')),  # List
            'other_chem_act': compound.get('other_chem_act', 'NA'),
            'loci': GP['loci'].get('complete', "NA"),
            'publications': JoinList(GP.get('publications', 'NA')),
            'rdkit_smile_1': CreateRDKitSmile(ChemStruct, False, False,
                                              False),
            'rdkit_smile_2': CreateRDKitSmile(ChemStruct, True, False, False),
            'rdkit_smile_3': CreateRDKitSmile(ChemStruct, False, True, False),
            'rdkit_smile_4': CreateRDKitSmile(ChemStruct, True, True, False),
        }
        QueryDict['rdkit_inchi_key'] = CreateRDKitInchiKey(
            QueryDict['rdkit_smile_1'])
        QueryDict['rdkit_inchi_key1'] = QueryDict['rdkit_inchi_key'][0:14]

        # Empty strings are stored as 'NA'
        return {k: "NA" if v == "" else v for k, v in QueryDict.items()}
    except Exception as e:
        print("Error occured at the second step of:")
        print(BGC)
        print(e)
        return None


def main(sqlite_file,
         table_name,
         IDcolumn='column_id',
         CompoundDict=None,
         FailMax=10):
    """Download the MIBiG BGC jsons and store their compounds in SQlite.

    The jsons are requested one accession at a time (BGC0000001,
    BGC0000002, ...). An accession whose response cannot be decoded as
    json counts as missing; the loop stops once FailMax accessions in a
    row were missing. The counter is reset whenever a compound has been
    parsed successfully. A compound that cannot be parsed is reported and
    skipped, it is never written to the table.

    Keyword Arguments:
        sqlite_file -- Path of the SQlite database to add the MIBiG data to
        table_name -- Name of the to be created or edited table
        IDcolumn -- Name of the primary key column passed to CreateNewTable
                    (NOTE(review): the rows themselves are keyed with
                    'compound_id' -- confirm the default 'column_id')
        CompoundDict -- Dictionary with compound ids+names as keys and
                        smiles as values. When given, it replaces the
                        'chem_struct' of the json; compounds that are not
                        in it get 'NA'.
        FailMax -- How many structures in a row should be missing before it
                   assumes it is at the end of the BGCs
    """
    # names of the new columns
    column_names = [
        'compound_name', 'biosyn_class', 'biosyn_subclass', 'chem_synonyms',
        'chem_target', 'molecular_formula', 'mol_mass', 'chem_struct',
        'pubchem_id', 'chemspider_id', 'chebi_id', 'chembl_id', 'chem_act',
        'other_chem_act', 'loci', 'publications', 'rdkit_smile_1',
        'rdkit_smile_2', 'rdkit_smile_3', 'rdkit_smile_4', 'rdkit_inchi_key',
        'rdkit_inchi_key1'
    ]
    column_type = 'TEXT'

    FailedBGCs = []  # BGCs whose json could not be loaded

    # Connecting to the database file
    conn = sqlite3.connect(sqlite_file)
    try:
        c = conn.cursor()

        # Adding new table or new columns, if they do not exist yet,
        # without a row value.
        InteractWithSQL.CreateNewTable(c, table_name, IDcolumn, column_names,
                                       column_type)

        BGCnr = 1  # number of the BGC that is requested next
        FailCounter = 0  # counter for how many BGCs in a row are missing

        while FailCounter < FailMax:  # while not at the end of the database
            # Create the BGC id which always has 7 numbers
            BGC = "BGC" + str(BGCnr).zfill(7)
            url = ("https://mibig.secondarymetabolites.org/repository/" +
                   BGC + "/" + BGC + ".json")
            try:
                JsonDict = requests.get(url).json()
            except json.JSONDecodeError:
                FailCounter += 1
                FailedBGCs.append(BGC)
            else:
                GP = JsonDict['general_params']
                for CompoundInJson, compound in enumerate(GP['compounds']):
                    QueryDict = _BuildMibigQueryDict(BGC, GP, compound,
                                                     CompoundInJson,
                                                     CompoundDict)
                    if QueryDict is None:
                        continue  # already reported, nothing to store
                    FailCounter = 0
                    InteractWithSQL.InsertOrUpdate(c, table_name, QueryDict)

            if FailCounter == FailMax:
                print("Stopped at: " + BGC + " because the last " +
                      str(FailMax) + " BGCs have not been found.")
                print(
                    "You should probably check if this is past the latests BGC"
                )
            BGCnr += 1  # go to next BGC
            if (BGCnr - 1) % 200 == 0:
                print("Arrived at " + BGC)

        # Committing changes; only reached when no exception escaped
        conn.commit()
    finally:
        # Always release the database file, also when the run crashed
        conn.close()

    # Print which BGC have failed to load, because they either are not
    # available or an error with the page has occured.
    print("The BGC that have failed to load are:")
    print(', '.join(FailedBGCs))
    return None
Beispiel #6
0
    print("Step 1 took " + Interval)

    if 2 not in SkipSteps:
        #2 Get Inchi Keys from Sam/Rutger and input into NPDB
        print("_____Starting Step 2")
        PrintTime()
        InchiToSQL.main(cfg['InchiKeyFile'], cfg['SQLPath'], cfg['NPDBtable'],
                        cfg['structure_id'])
        Interval, start = interval(start)
        print("_____Step 2 took " + Interval)

    if 3 not in SkipSteps:
        #3 Combine two seperate inchi_keys in NPDB into one.
        print("_____Starting Step 3")
        PrintTime()
        conn, c = Sql.Connect(cfg['SQLPath'])
        InchiToSQL.CombineInchiKeys(c, cfg['NPDBtable'], cfg['structure_id'])
        Sql.Close(conn)
        Interval, start = interval(start)
        print("_____Step 3 took " + Interval)

    if 4 in SkipSteps:
        MibigCompoundDict = None
    else:
        #4 Create a dictionary with all MIBiG compounds-ids and SMIlES from the
        # file from Michelle Schorn
        print("_____Starting Step 4")
        PrintTime()
        #Adding Smiles from tsv to mibig database
        MibigCompoundDict = ClassifyMibigTsv.LoadMibigTsv(
            cfg['MibigSmilesFile'])
Beispiel #7
0
def _ClassyfireNodeName(Class, field):
    """Return Class[field]['name'], or 'NA' if either level is missing."""
    try:
        return Class.get(field, 'NA').get('name', 'NA')
    except AttributeError:
        return 'NA'


def _ReplaceApostrophes(value):
    """Replace apostrophes by backticks in a string or in a list of strings."""
    if isinstance(value, str):
        return value.replace('\'', '`')
    if isinstance(value, list):
        return [
            item.replace('\'', '`') if isinstance(item, str) else item
            for item in value
        ]
    return value


def _ClassificationToColumns(Class, QueryID):
    """Translate a ClassyFire result dictionary into 'cf_*' column values."""
    EX = {}
    for level in ('kingdom', 'superclass', 'class', 'subclass'):
        EX['cf_' + level] = _ClassyfireNodeName(Class, level)

    # The first six intermediate nodes, padded with 'NA'. If the nodes are
    # missing or one of them has no name, all six columns become 'NA'.
    try:
        NodeNames = [
            node['name'] for node in Class['intermediate_nodes'][:6]
        ]
    except (KeyError, TypeError):
        NodeNames = []
    NodeNames += ['NA'] * (6 - len(NodeNames))
    for index, name in enumerate(NodeNames):
        EX['cf_intermediate_' + str(index)] = name

    EX['cf_molecular_framework'] = Class.get('molecular_framework', 'NA')

    # Alternative Parents
    AlternativeParentsList = Class.get('alternative_parents', 'NA')
    if AlternativeParentsList == 'NA':
        EX['cf_alternative_parents'] = 'NA'
    else:
        EX['cf_alternative_parents'] = ", ".join(
            item['name'] for item in AlternativeParentsList)

    # Substituents
    SubstituentsList = Class.get('substituents', 'NA')
    if SubstituentsList == 'NA':
        EX['cf_substituents'] = 'NA'
    else:
        EX['cf_substituents'] = ", ".join(SubstituentsList)

    # Description
    EX['cf_description'] = Class.get('description', 'NA')
    EX['cf_queryID'] = QueryID

    # Problematic apostrophes in the text need to be removed
    return {
        column: _ReplaceApostrophes(value)
        for column, value in EX.items()
    }


def mainMIBIG(QueryIDDict,
              SqliteFile,
              TableName,
              IDcolumn='compound_id',
              Batched=False,
              TimeStamp=0):
    """Store the ClassyFire classification of MIBiG compounds in SQlite.

    For every query the ClassyFire result is retrieved with its QueryID.
    If that gives nothing, the QueryID is stored with the compound and the
    classification is retried with the compound's 'rdkit_inchi_key'.
    Compounds that still have no classification are collected and handed
    to OutputUnclassifiedStructures together with the compounds whose
    update failed.

    Keyword Arguments:
        QueryIDDict -- Dictionary with 'compound_ID'_'compound_name' as key
        and a QueryID as value that can be used to retrieve ClassyFire
        classifications. The key is split at its first underscore, so
        compound names may contain underscores themselves.
        SqliteFile -- Path to the SQlite database
        TableName -- Name of the table in the database to edit
        IDcolumn -- Name of the PRIMARY KEY column
        Batched -- Boolean, wheter to perform the classification in batches;
        the classification columns are only created when it is False
        TimeStamp -- int/float, used to indicate when the output was created
    """
    compound_name_column = 'compound_name'  # name of the compound name
    default_value = 'NA'
    column_type = 'TEXT'
    # names of the new classification columns
    columns = [
        'cf_kingdom', 'cf_superclass', 'cf_class', 'cf_subclass',
        'cf_intermediate_0', 'cf_intermediate_1', 'cf_intermediate_2',
        'cf_intermediate_3', 'cf_intermediate_4', 'cf_intermediate_5',
        'cf_molecular_framework', 'cf_alternative_parents',
        'cf_substituents', 'cf_description', 'cf_queryID'
    ]

    FailedStructures = []
    ListUnclassified = []
    ListEmpty = []

    # Connecting to the database file
    conn = sqlite3.connect(SqliteFile)
    try:
        c = conn.cursor()

        if not Batched:
            # Adding new column, if it does not exist yet, without a row value
            for new_column_name in columns:
                try:
                    c.execute(
                        f"ALTER TABLE {TableName} ADD COLUMN "
                        f"'{new_column_name}' {column_type} "
                        f"DEFAULT {default_value}")
                    print("Column created: {cn}".format(cn=new_column_name))
                except sqlite3.OperationalError:
                    print("Column already exists: {cn}".format(
                        cn=new_column_name))

        for key, QueryID in QueryIDDict.items():
            # Only the first underscore separates id and name
            CompoundID, CompoundName = key.split('_', 1)
            CompoundID = CompoundID.replace('\'', '`')
            CompoundName = CompoundName.replace('\'', '`')
            WhereString = (f"{IDcolumn} LIKE '{CompoundID}%' AND "
                           f"{compound_name_column} == '{CompoundName}'")

            Class = GetSinglePyclassyfireResult(QueryID)
            if not Class:
                # Remember the QueryID and retry with the stored inchi-key
                InteractWithSQL.UpdateTable(c, TableName,
                                            {'cf_queryID': QueryID},
                                            WhereString)
                InchiKey = InteractWithSQL.GetFirstValue(
                    c, TableName, f"cf_queryID = '{QueryID}'",
                    'rdkit_inchi_key')
                if not InchiKey:
                    print('No InchiKey')
                    Class = False
                else:
                    Class = PyClassify(InchiKey)

            if not Class:
                print(key, "ClassyFire did not recoginize this structure")
                ListUnclassified.append(key)
                continue

            EX = _ClassificationToColumns(Class, QueryID)
            if all(value == 'NA' for value in EX.values()):
                continue  # nothing worth storing

            try:
                InteractWithSQL.UpdateTable(c, TableName, EX, WhereString)
            except sqlite3.OperationalError as e:
                print("Syntax Error occurred at: " + str(key))
                print(e)
                FailedStructures.append(key)

        # Committing changes; only reached when no exception escaped
        conn.commit()
    finally:
        # Always release the database file, also when the run crashed
        conn.close()

    OutputUnclassifiedStructures(ListUnclassified, ListEmpty, FailedStructures,
                                 TimeStamp)
    return None