Example #1
def WriteSDF(df,out,molColumn,properties=None,allNumeric=False,titleColumn=None):
  '''Write an SD file for the molecules in the dataframe. Dataframe columns can be exported as SDF tags if specified in the "properties" list.
   The "allNumeric" flag automatically includes all numeric columns in the output.
   "titleColumn" can be used to select a column to serve as the molecule title. It can be set to "RowID" to use the dataframe row key as the title.
  '''
  writer = SDWriter(out)
  if properties is None:
    properties=[]
  if allNumeric:   
    properties.extend([dt for dt in df.dtypes.keys() if (np.issubdtype(df.dtypes[dt],float) or np.issubdtype(df.dtypes[dt],int))])
    
  if molColumn in properties:
    properties.remove(molColumn)
  if titleColumn in properties:
    properties.remove(titleColumn)
  writer.SetProps(properties)
  for row in df.iterrows():
    mol = copy.deepcopy(row[1][molColumn])
    if titleColumn is not None:
      if titleColumn == 'RowID':
        mol.SetProp('_Name',str(row[0]))
      else:
        mol.SetProp('_Name',str(row[1][titleColumn]))
    for p in properties:
      mol.SetProp(p,str(row[1][p]))
    writer.write(mol)
  writer.close()
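A brief call sketch for this signature, assuming pandas and RDKit are available; the column names and output path are illustrative:

import pandas as pd
from rdkit import Chem

# small dataframe with a mol column and one numeric column
df = pd.DataFrame({'MW': [46.07, 78.11]})
df['Mol'] = [Chem.MolFromSmiles(s) for s in ('CCO', 'c1ccccc1')]
# export numeric columns as SDF tags and use the row index as the title
WriteSDF(df, 'out.sdf', molColumn='Mol', allNumeric=True, titleColumn='RowID')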
Example #2
def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumeric=False):
  '''Write an SD file for the molecules in the dataframe. Dataframe columns can be exported as
  SDF tags if specified in the "properties" list. "properties=list(df.columns)" would export
  all columns.
  The "allNumeric" flag allows to automatically include all numeric columns in the output.
  User has to make sure that correct data type is assigned to column.
  "idName" can be used to select a column to serve as molecule title. It can be set to
  "RowID" to use the dataframe row key as title.
  '''
  close = None
  if isinstance(out, string_types):
    if out.lower()[-3:] == ".gz":
      import gzip
      if PY3:
        out = gzip.open(out, "wt")
      else:
        out = gzip.open(out, "wb")
      close = out.close

  writer = SDWriter(out)
  if properties is None:
    properties = []
  else:
    properties = list(properties)
  if allNumeric:
    properties.extend([
      dt for dt in df.dtypes.keys()
      if (np.issubdtype(df.dtypes[dt], float) or np.issubdtype(df.dtypes[dt], int))
    ])

  if molColName in properties:
    properties.remove(molColName)
  if idName in properties:
    properties.remove(idName)
  writer.SetProps(properties)
  for row in df.iterrows():
    # make a local copy I can modify
    mol = Chem.Mol(row[1][molColName])

    if idName is not None:
      if idName == 'RowID':
        mol.SetProp('_Name', str(row[0]))
      else:
        mol.SetProp('_Name', str(row[1][idName]))
    for p in properties:
      cell_value = row[1][p]
      # Make sure float does not get formatted in E notation
      if np.issubdtype(type(cell_value), float):
        s = '{:f}'.format(cell_value).rstrip("0")  # "f" will show 7.0 as 7.00000
        if s[-1] == ".":
          s += "0"  # put the "0" back on if it's something like "7."
        mol.SetProp(p, s)
      else:
        mol.SetProp(p, str(cell_value))
    writer.write(mol)
  writer.close()
  if close is not None:
    close()
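A minimal usage sketch for the function above, assuming pandas and RDKit are available; the column names and output path are illustrative:

import pandas as pd
from rdkit import Chem

# tiny dataframe with an ROMol column and a numeric property
smiles = ['CCO', 'c1ccccc1', 'CC(=O)O']
df = pd.DataFrame({'SMILES': smiles, 'MW': [46.07, 78.11, 60.05]})
df['ROMol'] = df['SMILES'].apply(Chem.MolFromSmiles)

# export all numeric columns as SDF tags and use the row index as the molecule title
WriteSDF(df, 'example_out.sdf', molColName='ROMol', idName='RowID', allNumeric=True)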
Example #3
def classify(sdf, label, lambdas):
    new_filename = "%s_class.sdf" % sdf.split('.sdf')[0]
    new_label = label + "_class"
    sdm = ForwardSDMolSupplier(sdf,
                               strictParsing=False,
                               removeHs=False,
                               sanitize=False)
    sdw = SDWriter(new_filename)
    counter = -1
    i = 0
    for mol in sdm:
        print(i)
        sys.stdout.flush()
        i += 1
        counter += 1
        if mol is None:
            print("%d rdkit couldn't read molecule" % counter, file=sys.stderr)
            sys.stderr.flush()
            continue
        c = None
        prop = floatify(mol.GetProp(label))
        if prop is None:
            print("couldn't convert %s to float or int...skip" %
                  mol.GetProp(label),
                  file=sys.stderr)
            sys.stderr.flush()
            continue
        for k, l in lambdas.items():
            if l(prop):
                c = k
                print("hit %s" % k)
                sys.stdout.flush()
                break
        if c is None:
            print("%d no prop range matched '%s' ..skip" %
                  (counter, mol.GetProp(label)),
                  prop,
                  type(prop),
                  file=sys.stderr)
            sys.stderr.flush()
            sys.stdout.flush()
            continue
        mol.SetProp(new_label, c)
        try:
            sdw.write(mol)
        except Exception:
            print(
                "couldn't write mol %d to file, try to build mol from smiles" %
                i,
                file=sys.stderr)
            mol = MolFromSmiles(mol.GetProp("SMILES"))
            AllChem.Compute2DCoords(mol)
            mol.SetProp(new_label, c)
            try:
                sdw.write(mol)
            except Exception:
                print("couldn't write mol %d to file...skip" % i,
                      file=sys.stderr)
    sdw.close()
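A sketch of how the lambdas argument for classify might be built; the label name, thresholds and class names are illustrative, and floatify is assumed to be a helper defined elsewhere in the same module:

# class name -> predicate over the numeric label value; the first matching class wins
lambdas = {
    'inactive': lambda v: v < 5.0,
    'active': lambda v: v >= 5.0,
}
classify('compounds.sdf', 'pIC50', lambdas)  # writes compounds_class.sdf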
Example #4
    def __init__(self, args):

        self.args = args
        self.q_input = args.input_query
        self.g_input = open(args.input_graph, 'r')

        if args.sdf:
            rdlogger.setLevel(4)
            self.output = SDWriter(args.output)
        else:
            self.output = open(args.output, 'w')

        self.query = set()
        self.matching_parents = set()
        self.count = 0
Example #5
def min_sdf():
    files = glob("raw/openchem_logP_confs/*.sdf")
    for f in tqdm(files):
        try:
            suppl = SDMolSupplier(f, removeHs=False)
            lowest_e = np.inf
            selected_mol = None
            for mol in suppl:
                energy = float(mol.GetProp("energy_abs"))
                if energy < lowest_e:
                    lowest_e = energy
                    selected_mol = mol
            if selected_mol is not None:
                writer = SDWriter(f"raw/openchem_logP_mmff_sdfs/{osp.basename(f).split('.')[0].split('_')[0]}.mmff.sdf")
                writer.write(selected_mol)
                writer.close()
        except Exception as e:
            print(e)
Example #6
def WriteSDF(df,
             out,
             molColName='ROMol',
             idName=None,
             properties=None,
             allNumeric=False):
    '''Write an SD file for the molecules in the dataframe. Dataframe columns can be exported as SDF tags if specified in the "properties" list. "properties=list(df.columns)" would export all columns.
  The "allNumeric" flag allows to automatically include all numeric columns in the output. User has to make sure that correct data type is assigned to column.
  "idName" can be used to select a column to serve as molecule title. It can be set to "RowID" to use the dataframe row key as title.
  '''

    close = None
    if isinstance(out, string_types):
        if out.lower()[-3:] == ".gz":
            import gzip
            out = gzip.open(out, "wb")
            close = out.close

    writer = SDWriter(out)
    if properties is None:
        properties = []
    else:
        properties = list(properties)
    if allNumeric:
        properties.extend([
            dt for dt in df.dtypes.keys()
            if (np.issubdtype(df.dtypes[dt], float)
                or np.issubdtype(df.dtypes[dt], int))
        ])

    if molColName in properties:
        properties.remove(molColName)
    if idName in properties:
        properties.remove(idName)
    writer.SetProps(properties)
    for row in df.iterrows():
        # make a local copy I can modify
        mol = Chem.Mol(row[1][molColName])

        if idName is not None:
            if idName == 'RowID':
                mol.SetProp('_Name', str(row[0]))
            else:
                mol.SetProp('_Name', str(row[1][idName]))
        for p in properties:
            cell_value = row[1][p]
            # Make sure float does not get formatted in E notation
            if np.issubdtype(type(cell_value), float):
                s = '{:f}'.format(cell_value).rstrip(
                    "0")  # "f" will show 7.0 as 7.00000
                if s[-1] == ".":
                    s += "0"  # put the "0" back on if it's something like "7."
                mol.SetProp(p, s)
            else:
                mol.SetProp(p, str(cell_value))
        writer.write(mol)
    writer.close()
    if close is not None:
        close()
Example #7
def save_sdf(mols,
             predictions,
             filename,
             id_col=None,
             write_all=False,
             prediction_col='Prediction'):
    sdw = SDWriter(filename)
    props = list(
        functools.reduce(operator.or_,
                         map(lambda x: set(x.GetPropNames()), mols)))
    prediction_items = [prediction_col]

    if id_col is not None and id_col in props:
        props.remove(id_col)
    else:
        id_col = None

    for i, mol in enumerate(mols):
        if not write_all:
            for prop in props:
                mol.ClearProp(prop)
        for prediction_item in prediction_items:
            mol.SetIntProp(prediction_item, int(predictions[i]))
        sdw.write(mol)
    sdw.close()
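A usage sketch for save_sdf; the molecules and predictions below are illustrative:

from rdkit.Chem import MolFromSmiles

mols = [MolFromSmiles(s) for s in ('CCO', 'c1ccccc1')]
predictions = [1, 0]
# keep only the integer "Prediction" tag on each written record
save_sdf(mols, predictions, 'predicted.sdf', write_all=False)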
Example #8
def write_sdf_file(scaffold_graph, output_file):
    """Write an SDF file from a scaffoldgraph

    Parameters
    ----------
    scaffold_graph (sg.ScaffoldGraph): graph to be converted
    output_file (str): path to output file
    """

    N = scaffold_graph.num_scaffold_nodes
    sorted_scaffolds = sorted(scaffold_graph.get_scaffold_nodes(data=True),
                              key=lambda x: x[1]['hierarchy'])
    mapping = dict(zip([s[0] for s in sorted_scaffolds], range(0, N)))
    writer = SDWriter(output_file)
    for scaffold, data in sorted_scaffolds:
        molecule = MolFromSmiles(scaffold)
        if molecule is not None:
            subscaffolds = list(scaffold_graph.predecessors(scaffold))
            molecule.SetProp('_Name', str(mapping[scaffold]))
            molecule.SetIntProp('HIERARCHY', data['hierarchy'])
            molecule.SetProp('SMILES', scaffold)
            molecule.SetProp(
                'SUBSCAFFOLDS',
                ', '.join([str(mapping[s]) for s in subscaffolds]))
            writer.write(molecule)
    writer.close()
Example #9
    def __init__(self, args):

        self.args = args
        self.inputs = args.input

        if args.sdf:
            rdlogger.setLevel(4)
            self.output = SDWriter(args.output)
        else:
            self.output = open(args.output, 'w')

        self.mol_map = open(args.map_mols, 'w') if args.map_mols else None
        if self.mol_map:
            self.mol_map.write('MOLECULE_ID\tSCAFFOLD_ID\n')

        self.ann_map = open(args.map_annotations, 'w') if args.map_annotations else None
        if self.ann_map:
            self.ann_map.write('SCAFFOLD_ID\tANNOTATIONS\n')

        self.current_id = 0
        self.duplicates = 0
        self.table = {}
Example #10
    def writeSdf(self, sdf_name, fields=None):
        """
        Writes an SDF file with the stored molecules. It is also possible to manage which fields will be written.

        Parameters
        ----------
        sdf_name: str
            The output SDF filename
        fields: list
            A list of the fields to write. If None all are saved
        """

        from rdkit.Chem import SDWriter

        writer = SDWriter(sdf_name)
        if fields is not None:
            if not isinstance(fields, list):
                raise TypeError(
                    f"The fields argument {type(fields)} should be a list")
            writer.SetProps(fields)

        for m in self._mols:
            writer.write(m._mol)
        writer.close()
Example #11
def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumeric=False):
  '''Write an SD file for the molecules in the dataframe. Dataframe columns can be exported as SDF tags if specified in the "properties" list. "properties=list(df.columns)" would export all columns. 
  The "allNumeric" flag allows to automatically include all numeric columns in the output. User has to make sure that correct data type is assigned to column.
  "idName" can be used to select a column to serve as molecule title. It can be set to "RowID" to use the dataframe row key as title.
  '''
  writer = SDWriter(out)
  if properties is None:
    properties=[]
  if allNumeric:   
    properties.extend([dt for dt in df.dtypes.keys() if (np.issubdtype(df.dtypes[dt],float) or np.issubdtype(df.dtypes[dt],int))])
  
  if molColName in properties:
    properties.remove(molColName)
  if idName in properties:
    properties.remove(idName)
  writer.SetProps(properties)
  for row in df.iterrows():
    mol = copy.deepcopy(row[1][molColName])
    # Remove embedded props
    for prop in mol.GetPropNames():
      mol.ClearProp(prop)
    
    if idName is not None:
      if idName == 'RowID':
        mol.SetProp('_Name',str(row[0]))
      else:
        mol.SetProp('_Name',str(row[1][idName]))
    for p in properties:
      cell_value = row[1][p]
      # Make sure float does not get formatted in E notation
      if np.issubdtype(type(cell_value),float):
        mol.SetProp(p,'{:f}'.format(cell_value).rstrip('0'))
      else:
        mol.SetProp(p,str(cell_value))
    writer.write(mol)
  writer.close()
Example #12
def preprocess_mols(mols, session_id):
    
    session_dir = join('uploads', session_id)
    mols = np.array(mols)
    df = pd.DataFrame([m.GetPropsAsDict() for m in mols])
    df['NN'] = np.nan
    
    exp_cols = [c for c in df.columns if 'experimental' in c]
    # if any experimental value is known for a given molecule, the molecule is
    # assumed to be known
    experiemntal_mask = np.any(~pd.isna(df[exp_cols]), 1)
    
    # TODO - check that there are no issues with preserving order
    test_mols = mols[~experiemntal_mask]
    test_mols_ids = df.index[~experiemntal_mask]
    known_mols = mols[experiemntal_mask]
    known_mols_ids = df.index[experiemntal_mask]
        
    # get indices of NNs for molecules that don't have experimental data,
    # i.e. molecules that were not previously tested
    if len(test_mols):
        test_nns_idx, similarity = get_Tanimoto_NNs(test_mols, known_mols, 3, 
                                                    nns=50, return_sim=True)
        test_nns_ids = test_mols_ids[test_nns_idx]
        formatted = list(map(repr, test_nns_ids.tolist()))
        df.at[~experiemntal_mask.values, 'NN'] = formatted
        df.at[~experiemntal_mask.values, 'Similarity_Tanimoto'] = similarity[:,0]
    
    # get indices of NNs for molecules that have experimental data
    if len(known_mols):
        known_nns_idx, similarity = get_Tanimoto_NNs(known_mols, known_mols, 3, 
                                                     order=1, nns=50,
                                                     return_sim=True)
        known_nns_ids = known_mols_ids[known_nns_idx]
        formatted = list(map(repr, known_nns_ids.tolist()))
        df.at[experiemntal_mask.values, 'NN'] = formatted
        df.at[experiemntal_mask.values, 'Similarity_Tanimoto'] = similarity[:,0]
    
    # Save molecules as dataset
    ## Check if the dir already exists
    if exists(session_dir):
        raise RuntimeError('The session directory %s already exists!'%session_dir)
    else:
        makedirs(session_dir)
    
    ## Write the ID of the nearest neighbour to the SDF properties and save each mol to
    ## a separate file; the filenames equal the index in the dataset
    for idx, mol in zip(df.index, mols):
        writer = SDWriter(join(session_dir, '%d.sdf'%idx))
        mol.SetProp('NN', '%s'%df.loc[idx]['NN'])
        mol.SetProp('Similarity_Tanimoto', '%s'%df.loc[idx]['Similarity_Tanimoto'])
        writer.write(mol)
        writer.close()
       
    return session_dir
Example #13
def WriteSDF(df,
             out,
             molColName='ROMol',
             idName=None,
             properties=None,
             allNumeric=False):
    '''Write an SD file for the molecules in the dataframe. Dataframe columns can be exported as SDF tags if specified in the "properties" list. "properties=list(df.columns)" would export all columns.
  The "allNumeric" flag allows to automatically include all numeric columns in the output. User has to make sure that correct data type is assigned to column.
  "idName" can be used to select a column to serve as molecule title. It can be set to "RowID" to use the dataframe row key as title.
  '''
    writer = SDWriter(out)
    if properties is None:
        properties = []
    if allNumeric:
        properties.extend([
            dt for dt in df.dtypes.keys()
            if (np.issubdtype(df.dtypes[dt], float)
                or np.issubdtype(df.dtypes[dt], int))
        ])

    if molColName in properties:
        properties.remove(molColName)
    if idName in properties:
        properties.remove(idName)
    writer.SetProps(properties)
    for row in df.iterrows():
        mol = copy.deepcopy(row[1][molColName])
        # Remove embedded props
        for prop in mol.GetPropNames():
            mol.ClearProp(prop)

        if idName is not None:
            if idName == 'RowID':
                mol.SetProp('_Name', str(row[0]))
            else:
                mol.SetProp('_Name', str(row[1][idName]))
        for p in properties:
            cell_value = row[1][p]
            # Make sure float does not get formatted in E notation
            if np.issubdtype(type(cell_value), float):
                mol.SetProp(p, '{:f}'.format(cell_value).rstrip('0'))
            else:
                mol.SetProp(p, str(cell_value))
        writer.write(mol)
    writer.close()
Example #14
def csv_to_sdf(csv_file, sdf_file, smiles_col, class_col, delim=','):
    sdw = SDWriter(sdf_file)

    with open(csv_file) as fh:
        for i, line in enumerate(fh.readlines()):
            if i == 0:
                continue
            line_split = line.strip().split(delim)
            smiles = line_split[smiles_col].replace('"', '')
            act_class = line_split[class_col].replace('"', '')
            act_newLabel = activity_label_to_id_map[act_class]
            mol = MolFromSmiles(smiles)
            mol.SetProp("TL", act_newLabel)
            sdw.write(mol)
    sdw.close()
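csv_to_sdf relies on a module-level activity_label_to_id_map; a sketch of what that mapping and a call could look like (the file names, column indices and labels are illustrative):

# hypothetical mapping from activity class labels to SDF tag values
activity_label_to_id_map = {'active': '1', 'inactive': '0'}

# column 1 holds the SMILES, column 3 the activity class in this example CSV
csv_to_sdf('activities.csv', 'activities.sdf', smiles_col=1, class_col=3)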
Example #15
def WriteSDF(df,
             out,
             molColumn,
             properties=None,
             allNumeric=False,
             titleColumn=None):
    '''Write an SD file for the molecules in the dataframe. Dataframe columns can be exported as SDF tags if specified in the "properties" list.
   The "allNumeric" flag automatically includes all numeric columns in the output.
   "titleColumn" can be used to select a column to serve as molecule title. It can be set to "RowID" to use the dataframe row key as title.
  '''
    writer = SDWriter(out)
    if properties is None:
        properties = []
    if allNumeric:
        properties.extend([
            dt for dt in df.dtypes.keys()
            if (np.issubdtype(df.dtypes[dt], float)
                or np.issubdtype(df.dtypes[dt], int))
        ])

    if molColumn in properties:
        properties.remove(molColumn)
    if titleColumn in properties:
        properties.remove(titleColumn)
    writer.SetProps(properties)
    for row in df.iterrows():
        mol = copy.deepcopy(row[1][molColumn])
        if titleColumn is not None:
            if titleColumn == 'RowID':
                mol.SetProp('_Name', str(row[0]))
            else:
                mol.SetProp('_Name', str(row[1][titleColumn]))
        for p in properties:
            mol.SetProp(p, str(row[1][p]))
        writer.write(mol)
    writer.close()
Example #16
def write_sdf_file(scaffold_graph, output_file):
    """Write an SDF file from a ScaffoldGraph.

    All scaffolds in the scaffoldgraph are written to the
    SDF, while molecules are ignored. Scaffolds are sorted
    in ascending order according to their hierarchy level.

    The output follows the standard SDF specification with
    the added property fields:

        TITLE field: scaffold ID
        SUBSCAFFOLDS field: list of sub-scaffold IDs
        HIERARCHY field: hierarchy level of scaffold
        SMILES field: scaffold canonical SMILES

    Parameters
    ----------
    scaffold_graph : scaffoldgraph.core.ScaffoldGraph
        ScaffoldGraph to be written to an SDF.
    output_file : str
        Filepath to an output file.

    """
    N = scaffold_graph.num_scaffold_nodes
    sorted_scaffolds = sorted(scaffold_graph.get_scaffold_nodes(data=True),
                              key=lambda x: x[1]['hierarchy'])
    mapping = dict(zip([s[0] for s in sorted_scaffolds], range(0, N)))
    writer = SDWriter(output_file)
    for scaffold, data in sorted_scaffolds:
        molecule = MolFromSmiles(scaffold)
        if molecule is not None:
            subscaffolds = list(scaffold_graph.predecessors(scaffold))
            molecule.SetProp('_Name', str(mapping[scaffold]))
            molecule.SetIntProp('HIERARCHY', data['hierarchy'])
            molecule.SetProp('SMILES', scaffold)
            molecule.SetProp(
                'SUBSCAFFOLDS',
                ', '.join([str(mapping[s]) for s in subscaffolds]))
            writer.write(molecule)
    writer.close()
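A usage sketch, assuming the scaffoldgraph package is installed and exposes a ScaffoldNetwork.from_sdf constructor (treat that call as an assumption); the file names are illustrative:

import scaffoldgraph as sg

# build a scaffold network from an SDF and dump its scaffolds back out as SDF
network = sg.ScaffoldNetwork.from_sdf('molecules.sdf')
write_sdf_file(network, 'scaffolds.sdf')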
Example #17
def mols_to_sdf(mols: List[Mol], path: str) -> Optional[str]:
    """
    Writes all molecules from `mols` to an SDF with the given path.

    Parameters
    ----------
    mols : List[Mol]
        List of RDKit mol objects to write into a SDF
    path : str
        The path, where the SDF should be written to

    Returns
    -------
    Optional[str]
        None, if all went fine. A string containing an error message otherwise.
    """

    try:
        sdw = SDWriter(path)
        for mol in mols:
            sdw.write(mol)
        sdw.close()
    except OSError:
        return f'Could not create output file: {abspath(path)}'
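A usage sketch for mols_to_sdf; the input SMILES and output path are illustrative:

from rdkit.Chem import MolFromSmiles

mols = [MolFromSmiles(s) for s in ('CCO', 'CCN')]
error = mols_to_sdf(mols, 'out.sdf')
if error is not None:
    print(error)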
Example #18
def _getSDFStream(f, mols):
    w = SDWriter(f)
    for m in mols:
        w.write(m)
    w.flush()
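Since SDWriter accepts file-like objects, the helper above can also target an in-memory stream; a sketch (assuming a reasonably recent RDKit where SDWriter takes a text stream):

from io import StringIO
from rdkit.Chem import MolFromSmiles

buf = StringIO()
_getSDFStream(buf, [MolFromSmiles('CCO')])
sdf_text = buf.getvalue()  # the SD record(s) as a string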
Example #19
def split(sdf, label_col, folder, splitfold=5):
    """
    Stratified splitting of dataset into k-folds
    :param sdf: Path to the input SDF file containing the dataset
    :param label_col: Column name of labels for stratification
    :param folder: Folder/model name
    :param splitfold: k number of folds
    :return: dict with 'train_files' and 'test_files' lists, and the output folder
    """

    if folder is None:
        sdf_path = pathlib.Path(sdf)
        sdf_name = sdf_path.name.partition('.')[0]

        folder = sdf_path.parent.joinpath(sdf_name)
        if not folder.is_dir():
            folder.mkdir()
        folder = folder.absolute()

    else:
        p = pathlib.Path(folder)
        if not p.is_dir():
            p.mkdir()

    train_files = []
    test_files = []

    sdm = SDMolSupplier(sdf)
    mols = [x for x in sdm]

    labels = []
    for i in range(len(mols)):
        labels.append(mols[i].GetProp(label_col))

    skf = StratifiedKFold(n_splits=splitfold)
    fold = 0
    for train_ix, test_ix in skf.split(mols, labels):
        test_set_fn = "{}/testset_{}.sdf".format(folder, fold)
        train_set_fn = "{}/trainset_{}.sdf".format(folder, fold)

        sdw_train = SDWriter(train_set_fn)
        for i in train_ix:
            sdw_train.write(mols[i])
        sdw_train.close()
        train_files.append(train_set_fn)


        sdw_test = SDWriter(test_set_fn)
        for i in test_ix:
            sdw_test.write(mols[i])
        sdw_test.close()
        test_files.append(test_set_fn)
        fold += 1

    return {'train_files': train_files,
            'test_files': test_files}, folder
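A usage sketch for split; the SDF path and label column are illustrative:

# 5-fold stratified split on the 'activity_class' SDF tag
files, out_folder = split('dataset.sdf', 'activity_class', folder=None, splitfold=5)
print(files['train_files'])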
Example #21
    def process(self,
                input: Union[str, list] = "",
                input_file: str = "",
                output_file: str = "",
                output_file_sdf: str = "",
                output_file_cml: str = "",
                sdf_append: bool = False,
                format_output: bool = True,
                opsin_output_format: str = "",
                output_formats: list = None,
                write_header: bool = True,
                dry_run: bool = False,
                csv_delimiter: str = ";",
                standardize_mols: bool = True,
                normalize_plurals: bool = True,
                continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OPSIN.

        Parameters
        ----------
        input : str or list
            | str: String with IUPAC names, one per line.
            | list: List of IUPAC names.
        input_file : str
            Path to file to be processed by OPSIN. One IUPAC name per line.
        output_file : str
            File to write output in.
        output_file_sdf : str
            File to write SDF output in.
        output_file_cml : str
            | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
            | Not supported by RDKit so standardization and conversion to other formats cannot be done.
        sdf_append : bool
            If True, append new molecules to an existing SDF file, or create a new one if it doesn't exist.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
            | "iupac", <output formats>, ..., "error"
            | If True and `output_file` is set, it will be created as a CSV file with columns: "iupac", <output formats>, ..., "error"
            | If False, the value of "content" key of returned dict will be None.
        opsin_output_format : str
            | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
        output_formats : list
            | If True and `format_output` is also True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OPSIN. This format must also be set with `output_format` in __init__
              or with `opsin_output_format` here.
            | Default value: ["smiles"]

            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         Value         |         Source        |                                            Note                                            |
            +=======================+=======================+============================================================================================+
            |         smiles        |         RDKit         |                                          canonical                                         |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      smiles_opsin     |     OPSIN ("smi")     |                                           SMILES                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            | smiles_extended_opsin | OPSIN ("extendedsmi") |                          Extended SMILES. Not supported by RDKit.                          |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         inchi         |         RDKit         | Not every molecule can be converted to InChI (it doesn't support wildcard characters etc.) |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      inchi_opsin      |    OPSIN ("inchi")    |                                            InChI                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |     stdinchi_opsin    |   OPSIN ("stdinchi")  |                                       standard InChI                                       |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |        inchikey       |         RDKit         |      The same applies as for "inchi". Also molecule cannot be created from InChI-key.      |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |   stdinchikey_opsin   | OPSIN ("stdinchikey") |               Standard InChI-key. Cannot be used by RDKit to create molecule.              |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |          sdf          |         RDKit         |                     If present, an additional SDF file will be created.                    |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+

        write_header : bool
            If True and if `output_file` is set and `format_output` is True, write a CSV header.
        dry_run : bool
            If True, only return list of commands to be called by subprocess.
        csv_delimiter : str
            Delimiter for output CSV file.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        normalize_plurals : bool
            | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can
              set your own regex pattern with `plural_patterns` in __init__.
        continue_on_failure : bool
            | If True, continue running even if OPSIN returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from OPSIN
            - stderr: str ... standard error output from OPSIN
            - exit_code: int ... exit code from OPSIN
            - content:

              - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
              - None ... when format_output is False
        """

        options_internal = self.options_internal.copy()
        opsin_nonreadable_formats = ["cml", "stdinchikey"]

        if input and input_file:
            input_file = ""
            self.logger.warning(
                "Both 'input' and 'input_file' are set, but 'input' will be prefered."
            )
        elif not input and not input_file:
            raise ValueError("One of 'input' or 'input_file' must be set.")

        # OPSIN output format check
        if opsin_output_format:
            options_internal["output_format"] = opsin_output_format
        else:
            opsin_output_format = options_internal["output_format"]

        opsin_valid_output_formats = {
            "cml": "cml_opsin",
            "smi": "smiles_opsin",
            "extendedsmi": "smiles_extended_opsin",
            "inchi": "inchi_opsin",
            "stdinchi": "stdinchi_opsin",
            "stdinchikey": "stdinchikey_opsin"
        }

        if opsin_output_format not in opsin_valid_output_formats:
            raise ValueError(
                "Unknown OPSIN output format. Possible values: {}".format(
                    list(opsin_valid_output_formats.keys())))

        if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
            self.logger.warning(
                "OPSIN output format is \"{}\", which cannot be used by RDKit."
                .format(opsin_output_format))

        # output formats check
        if not output_formats:
            output_formats = ["smiles"]
        else:
            if opsin_output_format == "stdinchikey":
                output_formats = ["stdinchikey_opsin"]
            elif opsin_output_format == "extendedsmi":
                output_formats = ["smiles_extended_opsin"]
            else:
                output_formats = sorted(list(set(output_formats)))
                possible_output_formats = [
                    "smiles", "inchi", "inchikey", "sdf"
                ]
                output_formats = [
                    x for x in output_formats if x in possible_output_formats
                    or x == opsin_valid_output_formats[opsin_output_format]
                ]

        if normalize_plurals:
            if input_file:
                with open(input_file, mode="r", encoding="utf-8") as f:
                    input = "\n".join([x.strip() for x in f.readlines()])
                input_file = ""
            input = self.normalize_iupac(input)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)

        if input_file:
            commands.append(input_file)
            stdout, stderr, exit_code = common_subprocess(commands)
        elif input:
            if isinstance(input, list):
                input = "\n".join([x.strip() for x in input])
            stdout, stderr, exit_code = common_subprocess(commands,
                                                          stdin=input)
        else:
            raise UserWarning("Input is empty.")

        if dry_run:
            return " ".join(commands)

        to_return = {
            "stdout": stdout,
            "stderr": stderr,
            "exit_code": exit_code,
            "content": None
        }

        if not continue_on_failure and exit_code > 0:
            self.logger.warning("OPSIN error:")
            eprint("\n\t".join("\n{}".format(stderr).splitlines()))
            return to_return

        if output_file_cml and opsin_output_format == "cml":
            with open(output_file_cml, mode="w", encoding="utf-8") as f:
                f.write(stdout)
            return to_return
        elif output_file_cml and opsin_output_format != "cml":
            self.logger.warning(
                "Output file for CML is requested, but OPSIN output format is '{}'"
                .format(opsin_output_format))

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write(stdout)
            return to_return

        compounds = []
        standardizer = Standardizer()
        empty_cols = OrderedDict([(x, "") for x in output_formats])

        if output_file_sdf:
            if sdf_append:
                if not os.path.isfile(output_file_sdf):
                    open(output_file_sdf, mode="w", encoding="utf-8").close()
                writer = SDWriter(
                    open(output_file_sdf, mode="a", encoding="utf-8"))
            else:
                writer = SDWriter(output_file_sdf)

        stdout = stdout.split("\n")
        del stdout[-1]
        stderr = [
            x.strip() for x in stderr.split("\n")[1:] if x
        ]  # drop the first line of stderr, which is an OPSIN banner message

        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                lines = iter(f.readlines())
        else:
            lines = iter(input.split("\n"))

        mol_output_template = OrderedDict.fromkeys(["iupac"] + output_formats +
                                                   ["error"])

        e = 0
        for i, line in enumerate(lines):
            line = line.strip()
            converted = stdout[i].strip()
            mol_output = mol_output_template.copy()

            if converted:
                if opsin_output_format == "stdinchikey":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("stdinchikey_opsin", converted),
                                     ("error", "")]))
                    continue
                elif opsin_output_format == "extendedsmi":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("smiles_extended_opsin", converted),
                                     ("error", "")]))
                    continue

                if opsin_output_format == "smi":
                    mol = MolFromSmiles(
                        converted,
                        sanitize=False if standardize_mols else True)
                elif opsin_output_format in ["inchi", "stdinchi"]:
                    mol = MolFromInchi(
                        converted,
                        sanitize=False if standardize_mols else True,
                        removeHs=False if standardize_mols else True)

                if mol:
                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as exc:  # don't clobber the stderr index counter 'e'
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(exc)))

                    for f in output_formats:
                        if f == "smiles":
                            mol_output["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif f == "smiles_opsin" and opsin_output_format == "smi":
                            mol_output["smiles_opsin"] = converted
                        elif f == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchi"] = inchi
                            else:
                                mol_output["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        converted))
                        elif f == "inchi_opsin" and opsin_output_format == "inchi":
                            mol_output["inchi_opsin"] = converted
                        elif f == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                            mol_output["stdinchi_opsin"] = converted
                        elif f == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchikey"] = InchiToInchiKey(inchi)
                            else:
                                mol_output["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}".
                                    format(converted))
                        elif f == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                            mol_output["stdinchikey_opsin"] = converted
                        elif f == "sdf":
                            mol_output["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)

                    if output_file_sdf:
                        writer.write(mol)

                    mol_output.update(
                        OrderedDict([("iupac", line), ("error", "")]))
                else:
                    mol_output.update([
                        ("iupac", line),
                        ("error",
                         "Cannot convert to RDKit mol: {}".format(converted))
                    ])
                    mol_output.update(empty_cols)
                    self.logger.warning(mol_output["error"])
            else:
                try:
                    error = stderr[e].strip()
                except IndexError:
                    error = ""

                mol_output.update([("iupac", line), ("error", error)])
                mol_output.update(empty_cols)
                e += 1
            compounds.append(mol_output)

        to_return["content"] = compounds

        if output_file and compounds:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)
        elif output_file and not compounds:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(mol_output_template.keys()),
                             write_header=write_header)

        return to_return
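A call sketch for this method; the surrounding OPSIN wrapper class, its default constructor and the example IUPAC name are assumptions here:

opsin = OPSIN()  # hypothetical default construction of the wrapper
result = opsin.process(input=['propan-2-ol'], output_formats=['smiles'])
print(result['exit_code'], result['content'])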
Example #22
    ## Make directories for each transformation ##
        for pair in map_list:
            print('%s -> %s \n' % (pair[0], pair[1]))

            pair_dir = pair[0] + '_' + pair[1]
            if not os.path.isdir(pair_dir):
                os.mkdir(pair_dir)
                os.chdir(pair_dir)

                os.mkdir(dir_1_name)
                os.mkdir(dir_2_name)

                ## Write start ligand file, parameters ##
                os.chdir(dir_1_name)
                writer = SDWriter('for_parm.sdf')
                if ligands_name.count(pair[0]) > 0:
                    writer.write(ligands[ligands_name.index(pair[0])])
                    writer.flush()
                else:
                    print('Error: cannot map ligand %s.\n' % (pair[0]))
                    sys.exit()
                run_antechamber('for_parm.sdf',
                                'UNL',
                                ff,
                                int(
                                    rdmolops.GetFormalCharge(
                                        ligands[ligands_name.index(pair[0])])),
                                clean_sdf=True)

                # setup Molecule_ff
Example #23
def write_sdf(mol, path):
    writer = SDWriter(path)
    writer.write(mol)
    writer.close()
Example #24
"""
This script combines multiple SDF files specified via command-line arguments
into one SDF file. It will be saved as "combined_training_datasets.sdf". The
script tries to preserve the information about the original dataset name by
splitting the file name at "_" and saving the first element of this split as
SDF file tag named "original_dataset".
"""

from sys import argv

from rdkit.Chem import SDWriter, SDMolSupplier

__author__ = 'Marcel Baltruschat'
__copyright__ = 'Copyright © 2020'
__license__ = 'MIT'
__version__ = '1.0.0'

sdw = SDWriter('combined_training_datasets.sdf')
for f in argv[1:]:
    dsname = f.split('_')[0]
    sdm = SDMolSupplier(f)
    for mol in sdm:
        mol.SetProp('original_dataset', dsname)
        sdw.write(mol)
sdw.close()
Example #25
    def process(
            self,
            input_file: str,
            output_file: str = "",
            output_file_sdf: str = "",
            sdf_append: bool = False,
            #images_prefix: str = "",
            format_output: bool = True,
            write_header: bool = True,
            osra_output_format: str = "",
            output_formats: list = None,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            use_gm: bool = True,
            gm_dpi: int = 300,
            gm_trim: bool = True,
            n_jobs: int = -1,
            input_type: str = "",
            standardize_mols: bool = True,
            annotate: bool = True,
            chemspider_token: str = "",
            custom_page: int = 0,
            continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OSRA.

        Parameters
        ----------
        input_file : str
            Path to file to be processed by OSRA.
        output_file : str
            File to write output in.
        output_file_sdf : str
            | File to write SDF output in. The "sdf" output format doesn't have to be in `output_formats` for SDF output to be written.
            | If "sdf_osra" output format is requested, suffix "-osra.sdf" will be added.
        sdf_append : bool
            If True, append new molecules to an existing SDF file, or create a new one if it doesn't exist.
        NOT IMPLEMENTED | images_prefix : str
            Prefix for images of extracted compounds which will be written.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts.
            | If True and `output_file` is set, the CSV file will be written.
            | If False, the value of "content" key of returned dict will be None.
        write_header : bool
            If True and if `output_file` is set and `format_output` is True, write a CSV header.
        osra_output_format : str
            | Output format from OSRA. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "smi", "can", "sdf"
            | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet).
        output_formats : list
            | If True and `format_output` is also True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OSRA. This format must be also set with `output_format` in __init__
              or with `osra_output_format` here.
            | When the output produced by OSRA is unreadable by RDKit, you can at least keep the raw output from OSRA.
            | Default value: ["smiles"]

            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |      Value      |    Source    |                                            Note                                            |
            +=================+==============+============================================================================================+
            |      smiles     |     RDKit    |                                          canonical                                         |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |   smiles_osra   | OSRA ("smi") |                                           SMILES                                           |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            | smiles_can_osra | OSRA ("can") |                                      canonical SMILES                                      |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |      inchi      |     RDKit    | Not every molecule can be converted to InChI (it doesn't support wildcard characters etc.) |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |     inchikey    |     RDKit    |                              The same applies as for "inchi".                              |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |       sdf       |     RDKit    |                     If present, an additional SDF file will be created.                    |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |     sdf_osra    | OSRA ("sdf") |                     If present, an additional SDF file will be created.                    |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+

        dry_run : bool
            If True, only return list of commands to be called by subprocess.
        csv_delimiter : str
            Delimiter for output CSV file.
        use_gm : bool
            | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing.
            | If False, OSRA will use its own conversion of PDF to image.
            | Using gm is more reliable, since OSRA (v2.1.0) reports wrong information when converting directly from PDF
              (namely coordinates, bond length and possibly more) and sometimes produces incorrectly recognised structures.
        gm_dpi : int
            How many DPI will temporary PNG images have.
        gm_trim : bool
            If True, gm will trim the temporary PNG images.
        n_jobs : int
            | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images.
            | If -1 all CPUs are used.
            | If 1 is given, no parallel computing code is used at all, which is useful for debugging.
            | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
        input_type : str
            | When empty, input (MIME) type will be determined from magic bytes.
            | Or you can specify "pdf" or "image" and magic bytes check will be skipped.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        annotate : bool
            | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
              each identifier, separately for SMILES, InChI etc.
            | If the entity already has an InChI key, prefer it when searching.
            | If "*" is present in SMILES, skip annotation.
        chemspider_token : str
            Your personal token for accessing the ChemSpider API. Make account there to obtain it.
        custom_page : int
            When `use_gm` is False, this will set the page for all extracted compounds.
        continue_on_failure : bool
            | If True, continue running even if OSRA returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from OSRA
            - stderr: str ... standard error output from OSRA
            - exit_code: int ... exit code from OSRA
            - content:

                - list of OrderedDicts ... when `format_output` is True.
                - None ... when `format_output` is False

            | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved.
            | If `use_gm` is True then stdout, stderr and exit_code will be lists containing items from each temporary image
              extracted by OSRA.

        Notes
        -----
        Only with `format_output` set to True you can use molecule standardization and more molecule formats. Otherwise
        you will only get raw stdout from OSRA (which can also be written to file if `output_file` is set).
        """

        options_internal = self.options_internal.copy()
        osra_smiles_outputs = ["smi", "can"]

        # OSRA output format check
        if osra_output_format:
            options_internal["output_format"] = osra_output_format
        else:
            osra_output_format = options_internal["output_format"]

        osra_valid_output_formats = {
            "can": "smiles_can_osra",
            "smi": "smiles_osra",
            "sdf": "sdf_osra"
        }
        if osra_output_format not in osra_valid_output_formats:
            raise ValueError(
                "Unknown OSRA output format. Possible values: {}".format(
                    list(osra_valid_output_formats.keys())))

        if osra_output_format == "sdf":
            self.logger.warning(
                "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved."
            )

        # output formats check
        is_output_sdf = False
        is_output_sdf_osra = False
        if not output_formats:
            output_formats = ["smiles"]
        else:
            output_formats = sorted(list(set(output_formats)))
            possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
            output_formats = [
                x for x in output_formats if x in possible_output_formats
                or x == osra_valid_output_formats[osra_output_format]
            ]

            if ("sdf" in output_formats
                    or "sdf_osra" in output_formats) and not output_file_sdf:
                self.logger.warning(
                    "Cannot write SDF output: 'output_file_sdf' is not set.")
            if output_file_sdf:
                is_output_sdf = True
            if "sdf_osra" in output_formats and osra_output_format == "sdf" and output_file_sdf:
                is_output_sdf_osra = True
            if ("smiles_osra" in output_formats or "smiles_can_osra"
                    in output_formats) and osra_output_format == "sdf":
                try:
                    output_formats.remove("smiles_osra")
                except ValueError:
                    pass
                try:
                    output_formats.remove("smiles_can_osra")
                except ValueError:
                    pass
                self.logger.warning(
                    "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"."
                    .format(osra_output_format))

        # input file type check
        possible_input_types = ["pdf", "image"]
        if not input_type:
            input_type = get_input_file_type(input_file)
            if input_type not in possible_input_types:
                use_gm = False
                self.logger.warning(
                    "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)."
                    .format(input_type, possible_input_types))
        elif input_type not in possible_input_types:
            raise ValueError("Possible 'input_type' values are {}".format(
                possible_input_types))

        #options = ChainMap({k: v for k, v in {"images_prefix": images_prefix}.items() if v},
        #                   options_internal)

        if annotate:
            if not chemspider_token:
                self.logger.warning(
                    "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty."
                )
            for x in ["smiles", "inchi", "inchikey"]:
                if x not in output_formats:
                    output_formats.append(x)
            output_formats = sorted(output_formats)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)
        commands.extend(
            ["--bond", "--coordinates", "--page", "--guess", "--print"])

        if dry_run:
            return " ".join(commands)

        osra_output_list = []
        if input_type == "image" or not use_gm:
            osra_output_list.append(
                self._process(input_file,
                              commands,
                              page=custom_page if custom_page else 1))
        elif input_type == "pdf":
            with tempfile.TemporaryDirectory() as temp_dir:
                stdout, stderr, exit_code = pdf_to_images(input_file,
                                                          temp_dir,
                                                          dpi=gm_dpi,
                                                          trim=gm_trim)
                osra_output_list = Parallel(n_jobs=n_jobs)(
                    delayed(self._process)(
                        temp_image_file, commands, page=page)
                    for temp_image_file, page in get_temp_images(temp_dir))

        # summarize OSRA results
        to_return = {
            "stdout": [],
            "stderr": [],
            "exit_code": [],
            "content": None,
            "pages": []
        }
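        # keep only the results for pages where OSRA actually produced output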
        for result in osra_output_list:
            if result["stdout"]:
                to_return["stdout"].append(result["stdout"])
                to_return["stderr"].append(result["stderr"])
                to_return["exit_code"].append(result["exit_code"])
                to_return["pages"].append(result["page"])

        if not continue_on_failure:
            errors = [(page + 1, error)
                      for page, (exit_code, error) in enumerate(
                          zip(to_return["exit_code"], to_return["stderr"]))
                      if exit_code > 0]
            if errors:
                self.logger.warning("OSRA errors:")
                for page, error in errors:
                    eprint("\tError on page {}:".format(page))
                    eprint("\n\t\t".join("\n{}".format(error).splitlines()))
                return to_return

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write("\n".join(to_return["stdout"]))
            return to_return

        output_cols = OrderedDict([("bond_length", 1), ("resolution", 2),
                                   ("confidence", 3), ("page", 4),
                                   ("coordinates", 5)])

        if osra_output_format in osra_smiles_outputs:
            compound_template_dict = OrderedDict.fromkeys(
                output_formats + list(output_cols.keys()))
        else:
            compound_template_dict = OrderedDict.fromkeys(["page"] +
                                                          output_formats)

        if any(to_return["stdout"]):
            if standardize_mols:
                standardizer = Standardizer()

            compounds = []

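            # open the SD writer up front; in append mode the target file is created first if missing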
            if is_output_sdf:
                if sdf_append:
                    if not os.path.isfile(output_file_sdf):
                        open(output_file_sdf, mode="w",
                             encoding="utf-8").close()
                    writer = SDWriter(
                        open(output_file_sdf, mode="a", encoding="utf-8"))
                else:
                    writer = SDWriter(output_file_sdf)

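            # OSRA's SMILES-like formats are one structure per line;
            # SDF output is split on the "$$$$" record separator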
            for output, page in zip(to_return["stdout"], to_return["pages"]):
                if osra_output_format in osra_smiles_outputs:
                    lines = [x.strip() for x in output.split("\n") if x]
                else:
                    lines = [x for x in output.split("$$$$") if x.strip()]

                for line in lines:
                    """
                    # --learn causes problems: we can't simply split the output on " " when
                    # it is present, because its column looks like "1,2,2,2 1"
                    if "learn" in filtered_cols:
                        learn_start = filtered_cols.index("learn") + 1  # "smiles" col isn't in output_cols
                        learn_end = filtered_cols.index("learn") + 1 + 3
                        line[learn_start:learn_end] = [" ".join(line[learn_start:learn_end])]
                    """

                    if not line:
                        continue

                    if osra_output_format in osra_smiles_outputs:
                        line = [x.strip() for x in line.split()]
                        if custom_page:
                            line[output_cols["page"]] = custom_page
                        elif use_gm:
                            line[output_cols["page"]] = page
                        mol = MolFromSmiles(
                            line[0], sanitize=not standardize_mols)
                    elif osra_output_format == "sdf":
                        line = "\n" + line.strip()
                        mol = MolFromMolBlock(
                            line,
                            strictParsing=False,
                            sanitize=not standardize_mols,
                            removeHs=not standardize_mols)

                    if mol:
                        compound = compound_template_dict.copy()

                        if standardize_mols:
                            try:
                                mol = standardizer.standardize(mol)
                            except ValueError as e:
                                self.logger.warning(
                                    "Cannot standardize '{}': {}".format(
                                        MolToSmiles(mol), str(e)))

                        for f in output_formats:
                            if f == "smiles":
                                compound["smiles"] = MolToSmiles(
                                    mol, isomericSmiles=True)
                            elif f == "smiles_osra" and osra_output_format == "smi":
                                compound["smiles_osra"] = line[0]
                            elif f == "smiles_can_osra" and osra_output_format == "can":
                                compound["smiles_can_osra"] = line[0]
                            elif f == "inchi":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    compound["inchi"] = inchi
                                else:
                                    compound["inchi"] = ""
                                    self.logger.warning(
                                        "Cannot convert to InChI: {}".format(
                                            MolToSmiles(mol)))
                            elif f == "inchikey":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    compound["inchikey"] = InchiToInchiKey(
                                        inchi)
                                else:
                                    compound["inchikey"] = ""
                                    self.logger.warning(
                                        "Cannot create InChI-key from InChI: {}"
                                        .format(MolToSmiles(mol)))
                            elif f == "sdf":
                                compound["sdf"] = MolToMolBlock(
                                    mol, includeStereo=True)
                            elif f == "sdf_osra":
                                compound["sdf_osra"] = line

                        if is_output_sdf:
                            writer.write(mol)

                        if osra_output_format in osra_smiles_outputs:
                            compound.update(zip(output_cols.keys(), line[1:]))
                        else:
                            compound["page"] = (
                                page if use_gm else (custom_page or 1))

                        compounds.append(compound)
                    else:
                        self.logger.warning(
                            "Cannot convert to RDKit mol: " + (
                                line[0] if osra_output_format in osra_smiles_outputs
                                else "MOL block from OSRA"))

            if is_output_sdf_osra:
                with open(output_file_sdf + "-osra.sdf",
                          mode="w",
                          encoding="utf-8") as f:
                    f.write("".join(to_return["stdout"]))

            to_return["content"] = sorted(compounds, key=lambda x: x["page"])

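            # optionally look up each compound in PubChem (via PubChemPy) and ChemSpider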
            if annotate:
                chemspider = ChemSpider(
                    chemspider_token) if chemspider_token else None

                for i, ent in enumerate(to_return["content"]):
                    self.logger.info("Annotating entity {}/{}...".format(
                        i + 1, len(to_return["content"])))
                    ent.update(
                        OrderedDict([("pch_cids_by_inchikey", ""),
                                     ("chs_cids_by_inchikey", ""),
                                     ("pch_cids_by_smiles", ""),
                                     ("chs_cids_by_smiles", ""),
                                     ("pch_cids_by_inchi", ""),
                                     ("chs_cids_by_inchi", ""),
                                     ("pch_iupac_name", ""),
                                     ("chs_common_name", ""),
                                     ("pch_synonyms", "")]))

                    results = []

                    # prefer InChI key
                    if "inchikey" in ent and ent["inchikey"]:
                        try:
                            results = get_compounds(ent["inchikey"],
                                                    "inchikey")
                            if results:
                                if len(results) == 1:
                                    result = results[0]
                                    synonyms = result.synonyms
                                    if synonyms:
                                        ent["pch_synonyms"] = "\"{}\"".format(
                                            "\",\"".join(synonyms))
                                    ent["pch_iupac_name"] = result.iupac_name
                                ent["pch_cids_by_inchikey"] = "\"{}\"".format(
                                    ",".join([str(c.cid) for c in results]))
                        except (BadRequestError, NotFoundError,
                                PubChemHTTPError, ResponseParseError,
                                ServerError, TimeoutError, PubChemPyError):
                            pass

                        results = chemspider.search(
                            ent["inchikey"]) if chemspider_token else []
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                ent["chs_common_name"] = result.common_name
                            ent["chs_cids_by_inchikey"] = "\"{}\"".format(
                                ",".join([str(c.csid) for c in results]))
                    else:
                        for search_field, col_pch, col_chs in [
                            ("smiles", "pch_cids_by_smiles",
                             "chs_cids_by_smiles"),
                            ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi")
                        ]:
                            results_pch = []
                            results_chs = []

                            if (search_field == "smiles" and ent.get("smiles")
                                    and "*" not in ent["smiles"]):
                                try:
                                    results_pch = get_compounds(
                                        ent["smiles"], "smiles")
                                except (BadRequestError, NotFoundError,
                                        PubChemHTTPError, ResponseParseError,
                                        ServerError, TimeoutError,
                                        PubChemPyError):
                                    pass
                                results_chs = chemspider.search(
                                    ent["smiles"]) if chemspider_token else []
                            elif search_field == "inchi" and ent.get("inchi"):
                                try:
                                    results_pch = get_compounds(
                                        ent["inchi"], "inchi")
                                except (BadRequestError, NotFoundError,
                                        PubChemHTTPError, ResponseParseError,
                                        ServerError, TimeoutError,
                                        PubChemPyError):
                                    pass
                                results_chs = chemspider.search(
                                    ent["inchi"]) if chemspider_token else []

                            if results_pch:
                                ent[col_pch] = "\"{}\"".format(",".join(
                                    [str(c.cid) for c in results_pch]))
                            if results_chs:
                                ent[col_chs] = "\"{}\"".format(",".join(
                                    [str(c.csid) for c in results_chs]))

                            sleep(0.5)

            if output_file:
                dict_to_csv(to_return["content"],
                            output_file=output_file,
                            csv_delimiter=csv_delimiter,
                            write_header=write_header)

            if is_output_sdf:
                writer.close()
        elif not any(to_return["stdout"]) and output_file:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(compound_template_dict.keys()),
                             write_header=write_header)

        return to_return
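
A side note on the SD-file handling above: SDWriter accepts either a file path or an already-open file object, which is what makes the append mode possible. A minimal standalone sketch of that pattern (the file name and SMILES below are purely illustrative):

# Minimal sketch: append records to an existing SD file by handing SDWriter an
# open file handle; "existing.sdf" and the SMILES strings are illustrative.
import os
from rdkit.Chem import MolFromSmiles, SDWriter

path = "existing.sdf"
if not os.path.isfile(path):
    # create an empty file so append mode has something to append to
    open(path, mode="w", encoding="utf-8").close()

with open(path, mode="a", encoding="utf-8") as fh:
    writer = SDWriter(fh)
    for smi in ("c1ccccc1", "CCO"):
        mol = MolFromSmiles(smi)
        if mol is not None:
            writer.write(mol)
    writer.close()  # flushes the last record; the handle is closed by the with block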
Beispiel #26
0
class AggregateCLI(object):
    """Aggregate output TSV files (CLI)."""

    def __init__(self, args):

        self.args = args
        self.inputs = args.input

        if args.sdf:
            rdlogger.setLevel(4)
            self.output = SDWriter(args.output)
        else:
            self.output = open(args.output, 'w')

        self.mol_map = open(args.map_mols, 'w') if args.map_mols else None
        if self.mol_map:
            self.mol_map.write('MOLECULE_ID\tSCAFFOLD_ID\n')

        self.ann_map = open(args.map_annotations, 'w') if args.map_annotations else None
        if self.ann_map:
            self.ann_map.write('SCAFFOLD_ID\tANNOTATIONS\n')

        self.current_id = 0
        self.duplicates = 0
        self.table = {}

    def aggregate(self):
        if not self.args.sdf:
            self.output.write('ID\tHIERARCHY\tSMILES\tSUBSCAFFOLDS\n')
        for file in self.inputs:
            logger.info(f'Processing file: {file}...')
            with open(file, 'r') as fh:
                self.process_file(fh)

    def process_file(self, file):
        reader = ScaffoldFileIterator(file)
        for scaffold in reader:
            s_smiles = scaffold.smiles
            write = False
            if s_smiles in self.table:
                scaffold.id = self.table[s_smiles]['ID']
            else:
                scaffold.id = self.current_id
                self.table[s_smiles] = dict(ID=self.current_id, PARENTS=[])
                self.current_id += 1
                write = True
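            # subscaffold references whose SMILES was never registered are collected and dropped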
            missing = []
            for idx, parent in enumerate(scaffold.subscaffolds):
                p_smiles = parent.smiles
                if p_smiles in self.table:
                    parent.id = self.table[p_smiles]['ID']
                else:
                    missing.append(idx)
            for m in sorted(missing, reverse=True):
                del scaffold.subscaffolds[m]
            if write:
                self.write_scaffold(scaffold)
            else:
                self.duplicates += 1
            self.write_extra_outputs(scaffold)

    def write_scaffold(self, scaffold):
        subscaffolds = ', '.join([str(s.id) for s in scaffold.subscaffolds])
        if self.args.sdf:
            molecule = MolFromSmiles(scaffold.smiles)
            if molecule is not None:
                molecule.SetProp('_Name', str(scaffold.id))
                molecule.SetIntProp('HIERARCHY', scaffold.hierarchy)
                molecule.SetProp('SMILES', scaffold.smiles)
                molecule.SetProp('SUBSCAFFOLDS', subscaffolds)
                self.output.write(molecule)
            else:
                logger.warning(f'Failed to parse scaffold: {scaffold.smiles}')
        else:
            self.output.write('{0}\t{1}\t{2}\t{3}\n'.format(
                scaffold.id,
                scaffold.hierarchy,
                scaffold.smiles,
                subscaffolds))

    def write_extra_outputs(self, scaffold):
        # Write molecule --> scaffold ID file
        if self.mol_map is not None:
            for molecule in scaffold.molecules:
                self.mol_map.write('{0}\t{1}\n'.format(
                    molecule, scaffold.id))
        # Write scaffold ID --> annotation file
        if self.ann_map is not None:
            for annotation in scaffold.annotations:
                self.ann_map.write('{0}\t{1}\n'.format(scaffold.id, annotation))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.output.close()
        if self.mol_map is not None:
            self.mol_map.close()
        if self.ann_map is not None:
            self.ann_map.close()
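
For reference, a minimal way to drive this class is to pass it any object exposing the attributes the constructor reads (input, output, sdf, map_mols, map_annotations). The sketch below uses types.SimpleNamespace with hypothetical file names and assumes AggregateCLI, ScaffoldFileIterator and rdlogger are importable from the surrounding module:

# Hedged usage sketch for AggregateCLI; the attribute names mirror what
# __init__() and aggregate() read, the file names are purely illustrative.
from types import SimpleNamespace

args = SimpleNamespace(
    input=["scaffolds_part1.tsv", "scaffolds_part2.tsv"],  # per-run TSVs to merge
    output="aggregated.tsv",         # merged scaffold table
    sdf=False,                       # True would write an SD file via SDWriter instead
    map_mols="mol_to_scaffold.tsv",  # optional molecule -> scaffold ID map
    map_annotations=None,            # no scaffold -> annotation map
)

with AggregateCLI(args) as cli:  # __exit__ closes the output and map files
    cli.aggregate()
print(f"{cli.duplicates} duplicate scaffolds were merged")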
Beispiel #27
0
class SelectCLI(object):
    """Select scaffolds using a molecular query on an aggregated TSV (CLI)."""

    def __init__(self, args):

        self.args = args
        self.q_input = args.input_query
        self.g_input = open(args.input_graph, 'r')

        if args.sdf:
            rdlogger.setLevel(4)
            self.output = SDWriter(args.output)
        else:
            self.output = open(args.output, 'w')

        self.query = set()
        self.matching_parents = set()
        self.count = 0

    def select(self):
        if not self.args.sdf:
            self.output.write('ID\tHIERARCHY\tSMILES\tSUBSCAFFOLDS\n')
        self.load_query()
        logger.info('Processing query...')
        reader = ScaffoldFileIterator(self.g_input, reverse=True)
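        # the aggregated file is read in reverse so that a matching scaffold's
        # subscaffolds appear later and are then selected via matching_parents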
        for scaffold in reader:
            match = (scaffold.smiles in self.query
                     or scaffold.id in self.matching_parents)
            if match:
                self.count += 1
                self.write_scaffold(scaffold)
                for s in scaffold.subscaffolds:
                    self.matching_parents.add(s.id)

    def load_query(self):
        logger.info('Reading molecular query...')
        file = None
        fmt = file_format(self.q_input)
        if fmt[0] == 'SMI':
            supplier = smiles.read_smiles_file(self.q_input)
        elif fmt[0] == 'SDF':
            rdlogger.setLevel(4)
            file = open(self.q_input, 'rb')
            supplier = sdf.read_sdf(file)
        else:
            raise ValueError('input file format not currently supported')
        for molecule in supplier:
            if molecule is not None:
                s = get_murcko_scaffold(molecule)
                self.query.add(MolToSmiles(s))
        if file is not None:
            file.close()
        logger.info(f'Read {len(self.query)} query scaffolds')

    def write_scaffold(self, scaffold):
        subscaffolds = ', '.join([str(s.id) for s in scaffold.subscaffolds])
        if self.args.sdf:
            molecule = MolFromSmiles(scaffold.smiles)
            if molecule is not None:
                molecule.SetProp('_Name', str(scaffold.id))
                molecule.SetIntProp('HIERARCHY', scaffold.hierarchy)
                molecule.SetProp('SMILES', scaffold.smiles)
                molecule.SetProp('SUBSCAFFOLDS', subscaffolds)
                self.output.write(molecule)
            else:
                logger.warning(f'Failed to parse scaffold: {scaffold.smiles}')
        else:
            self.output.write('{0}\t{1}\t{2}\t{3}\n'.format(
                scaffold.id,
                scaffold.hierarchy,
                scaffold.smiles,
                subscaffolds))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.g_input.close()
        self.output.close()
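
SelectCLI can be exercised the same way; the attributes below mirror what __init__(), select() and load_query() read, and the file names are again hypothetical. The query file may be SMILES or SDF, as dispatched by load_query():

# Hedged usage sketch for SelectCLI; file names are illustrative.
from types import SimpleNamespace

args = SimpleNamespace(
    input_query="actives.smi",     # molecules whose Murcko scaffolds form the query
    input_graph="aggregated.tsv",  # aggregated scaffold TSV (e.g. from AggregateCLI)
    output="selected.tsv",
    sdf=False,
)

with SelectCLI(args) as cli:
    cli.select()
print(f"{cli.count} scaffolds matched the query")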
Beispiel #28
0
            ligands_name.append(mol.GetProp('_Name'))

    ## Make directories for each transformation ##
        for pair in map_list:
            print('%s -> %s \n' % (pair[0], pair[1]))

            pair_dir = pair[0] + '_' + pair[1]
            os.mkdir(pair_dir)
            os.chdir(pair_dir)

            os.mkdir(dir_1_name)
            os.mkdir(dir_2_name)

            ## Write start ligand file, parameters ##
            os.chdir(dir_1_name)
            writer = SDWriter('for_parm.sdf')
            writer.write(ligands[ligands_name.index(pair[0])])
            writer.flush()
            run_antechamber(ligands[ligands_name.index(pair[0])],
                            'for_parm.sdf', ff)
            os.chdir('../')

            ## Check and fix XYZ coords of transform ligand ##
            fix_mol = update_atom_position(
                ligands[ligands_name.index(pair[0])],
                ligands[ligands_name.index(pair[1])])

            ## Write endpoint ligand file, parameters ##
            os.chdir(dir_2_name)
            writer = SDWriter('for_parm.sdf')
            writer.write(fix_mol)
    X, y = make_dataset(f'{prediction_set}.sdf',
                        data_dir=env_var,
                        features=features,
                        name_col=name_col,
                        endpoint=endpoint,
                        threshold=threshold,
                        cache=False)
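    # align the known activities with the prediction set and fill the gaps with the model's predictions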
    y = y.reindex(X_pred.index)
    y[y.isnull()] = final_preds

    y.to_csv(os.path.join(
        data_dir, 'predictions',
        f'{prediction_set}_{features}_{endpoint}_{threshold}_no_gaps.csv'),
             header=['Activities'])

    for molecule in molecules:
        if not molecule.HasProp(endpoint):
            molecule.SetProp(endpoint, str(y.loc[molecule.GetProp(name_col)]))

else:
    for molecule in molecules:
        molecule.SetProp(endpoint,
                         str(final_preds.loc[molecule.GetProp(name_col)]))

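# write all molecules, now carrying the endpoint property, to a single SD file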
w = SDWriter(os.path.join(data_dir, f'{prediction_set}_with_predictions.sdf'))

for molecule in molecules:
    w.write(molecule)

w.close()