def transform(self, data):
    """Take a string list in the extended gSpan format and yields NetworkX graphs.

    Parameters
    ----------
    data : string or list
        data source, can be a list of strings, a file name or a url

    Returns
    -------
    iterator over networkx graphs

    Raises
    ------
    exception: when a graph is empty
    """
    try:
        current_header = ''
        body_lines = []
        for row in util.read(data):
            if not row.strip():
                continue
            if row[0] in ('g', 't'):
                # a new graph record starts: flush the previous one
                if body_lines:
                    yield self._gspan_to_networkx(current_header, body_lines)
                body_lines = []
                current_header = row
            else:
                body_lines.append(row)
        # flush the trailing record
        if body_lines:
            yield self._gspan_to_networkx(current_header, body_lines)
    except Exception as e:
        logger.debug('Failed iteration. Reason: %s' % e)
        logger.debug('Exception', exc_info=True)
def _load_abalone():
    """Load a 700-row prefix of the UCI abalone dataset as (X, y)."""
    print('abalone')
    uri = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
    n_max = 700
    features = []
    raw_labels = []
    for row_num, line in enumerate(read(uri), start=1):
        if row_num > n_max:
            break
        line = line.strip()
        if not line:
            continue
        fields = line.split(',')
        # bucket the ring count (last column) into coarse classes
        raw_labels.append(int(fields[-1]) // 7)
        # drop the first column (categorical sex) and the last (label)
        features.append([float(v) for v in fields[1:-1]])
    X = np.array(features)
    y = np.array(LabelEncoder().fit_transform(raw_labels))
    # keep only classes with enough support
    y_sel = _select_targets(y, min_threshold=5)
    X, y = _filter_dataset(X, y, y_sel)
    return X, y
def _fasta_to_fasta(input):
    # Yield header, sequence and constraint strings from a FASTA-like stream
    # in which a blank line separates a record's sequence from a trailing
    # constraint block (e.g. RNA structure constraints).
    header = ""
    seq = ""
    const = ""
    for line in util.read(input):
        line = str(line).strip()
        if line == "":
            # assume the empty line indicates that next line describes the constraints
            if seq:
                yield seq
            # seq=None flags "constraint mode" for the following lines
            seq = None
        elif line[0] == '>':
            # new record: flush any pending constraint block, then reset state
            if const:
                yield const
            header = ""
            seq = ""
            const = ""
            header = line
            yield header
        else:
            # remove trailing chars, split and take only first part, removing comments
            line_str = line.split()[0]
            if line_str:
                if seq is None:
                    const += line_str
                else:
                    seq += line_str
    # NOTE(review): a trailing sequence that is never followed by a blank
    # line is not yielded here — presumably every record carries a
    # constraint block; confirm against the expected input format.
    if const:
        yield const
def _sdf_to_eden(self, iterable):
    """Yield a NetworkX graph for every SDF record in *iterable*."""
    for record in read(iterable):
        molecule = pybel.readstring("sdf", record.strip())
        # drop explicit hydrogens before building the graph
        molecule.removeh()
        graph = self._obabel_to_networkx(molecule)
        if len(graph):
            yield graph
def _sdf_to_eden(iterable):
    """Yield a NetworkX graph for each SDF record in *iterable*."""
    for sdf_record in read(iterable):
        parsed = pybel.readstring("sdf", sdf_record.strip())
        # hydrogens are implicit in the resulting graph
        parsed.removeh()
        g = _obabel_to_networkx(parsed)
        if len(g):
            yield g
def _smi_to_eden(self, iterable, cache={}):
    """Yield NetworkX graphs built from SMILES strings.

    Converts each SMILES string to SDF with the ``obabel`` command line
    tool, generates conformers, and yields one graph per conformer when
    ``self.split_components`` is true; otherwise accumulates every graph
    into a single disjoint-union graph and yields that once.

    Parameters
    ----------
    iterable : iterable
        data source of SMILES strings (passed to ``read``).
    cache : dict
        memoization of SMILES -> SDF conversions; the mutable default is
        deliberate so conversions are shared across calls.
    """
    def to_sdf(mol_smi):
        # Convert from SMILES to SDF once per distinct string; obabel is
        # expensive, so results are memoized in `cache`.
        if mol_smi not in cache:
            command_string = 'obabel -:"' + mol_smi + '" -osdf --gen3d'
            args = shlex.split(command_string)
            sdf = subprocess.check_output(args)
            # Assume the incoming string contains only one molecule.
            # Remove warning messages generated by openbabel.
            # NOTE(review): check_output returns bytes on Python 3 — this
            # str-splitting assumes Python 2 behavior; confirm runtime.
            sdf = '\n'.join(
                [x for x in sdf.split('\n') if 'WARNING' not in x])
            cache[mol_smi] = sdf
        return cache[mol_smi]

    if self.split_components:
        # yield every graph separately
        for mol_smi in read(iterable):
            mols = self._generate_conformers(to_sdf(mol_smi), self.n_conf)
            for molecule in mols:
                graph = self._obabel_to_networkx3d(molecule)
                if len(graph):
                    yield graph
    else:
        # construct global graph and accumulate everything there
        global_graph = nx.Graph()
        for mol_smi in read(iterable):
            mols = self._generate_conformers(to_sdf(mol_smi), self.n_conf)
            for molecule in mols:
                g = self._obabel_to_networkx3d(molecule)
                if len(g):
                    global_graph = nx.disjoint_union(global_graph, g)
        yield global_graph
def transform(self, data):
    """Transform serialized node-link JSON records into NetworkX graphs."""
    try:
        for serialized in util.read(data):
            # one JSON document per record, in node_link_data layout
            yield json_graph.node_link_graph(json.loads(serialized))
    except Exception as e:
        logger.debug('Failed iteration. Reason: %s' % e)
        logger.debug('Exception', exc_info=True)
def _smi_to_eden(self, iterable):
    """Yield a NetworkX graph per well-formed SMILES string in *iterable*."""
    for smi in read(iterable):
        # skip strings flagged as malformed
        if self.smi_has_error(smi) is not False:
            continue
        mol = pybel.readstring("smi", smi.strip())
        # remove hydrogens
        mol.removeh()
        g = self._obabel_to_networkx(mol)
        if len(g):
            g.graph['id'] = smi.strip()
            yield g
def _smi_to_eden(iterable):
    """Yield a NetworkX graph for each well-formed SMILES string."""
    for smiles in read(iterable):
        # skip malformed SMILES strings
        if _smi_has_error(smiles) is not False:
            continue
        molecule = pybel.readstring("smi", smiles.strip())
        # hydrogens are implicit in the graph
        molecule.removeh()
        graph = _obabel_to_networkx(molecule)
        if len(graph):
            graph.graph['id'] = smiles.strip()
            yield graph
def load_target(name):
    """Return a numpy array of integers to be used as target vector.

    Parameters
    ----------
    name : string
        A pointer to the data source.
    """
    # Strip first, then filter: a whitespace-only line is truthy before
    # stripping and would otherwise produce an empty string that crashes
    # the integer conversion below.
    target = [y.strip() for y in read(name)]
    target = [y for y in target if y]
    return np.array(target).astype(int)
def _sdf_to_eden(self, iterable):
    """Yield one graph per conformer, or a single disjoint-union graph.

    When ``self.split_components`` is true every non-empty conformer graph
    is yielded on its own; otherwise all of them are merged into one
    disjoint-union graph that is yielded once.
    """
    def conformer_graphs():
        # shared pipeline: SDF record -> conformers -> non-empty graphs
        for record in read(iterable):
            mol = pybel.readstring("sdf", record)
            for conformer in self._generate_conformers(mol.write("sdf"),
                                                       self.n_conf):
                conformer.removeh()
                g = self._obabel_to_networkx3d(conformer)
                if len(g):
                    yield g

    if self.split_components:
        for g in conformer_graphs():
            yield g
    else:
        union = nx.Graph()
        for g in conformer_graphs():
            union = nx.disjoint_union(union, g)
        yield union
def node_link_data_to_eden(input=None, options=dict()):
    """
    Takes a string list in the serialised node_link_data JSON format and yields networkx graphs.

    Parameters
    ----------
    input : string
        A pointer to the data source.
    """
    lines = util.read(input)
    return _node_link_data_to_eden(lines)
def word_sequence_to_eden(input=None, options=dict()):
    """
    Takes a list of strings, splits each string in words and yields networkx graphs.

    Parameters
    ----------
    input : string
        A pointer to the data source.
    """
    for text in util.read(input):
        yield word_sequence_to_networkx(text)
def sequence_to_eden(input=None, options=dict()):
    """
    Takes a list of strings and yields networkx graphs.

    Parameters
    ----------
    input : string
        A pointer to the data source.
    """
    for item in read(input):
        yield sequence_to_networkx(item)
def obabel_to_eden(iterable, file_format='sdf', **options):
    """
    Takes a string list in sdf format format and yields networkx graphs.

    Parameters
    ----------
    iterable : SMILES strings containing molecular structures.
    """
    def smi_has_error(smi):
        # unbalanced '()' or '[]' pairs indicate a truncated/garbled SMILES
        s = smi.strip()
        return s.count('(') != s.count(')') or s.count('[') != s.count(']')

    if file_format == 'sdf':
        for record in read(iterable):
            mol = pybel.readstring("sdf", record.strip())
            # remove hydrogens
            mol.removeh()
            graph = obabel_to_networkx(mol)
            if len(graph):
                yield graph
    elif file_format == 'smi':
        for smi in read(iterable):
            if smi_has_error(smi) is False:
                mol = pybel.readstring("smi", smi.strip())
                # remove hydrogens
                mol.removeh()
                graph = obabel_to_networkx(mol)
                if len(graph):
                    graph.graph['info'] = smi.strip()
                    yield graph
    else:
        raise Exception('ERROR: unrecognized file format: %s' % file_format)
def _fasta_to_fasta(input):
    """Yield alternating FASTA header lines and concatenated sequences."""
    seq = ""
    for raw in util.read(input):
        if not raw:
            continue
        if raw[0] == '>':
            # flush the sequence collected for the previous record
            if seq:
                yield seq
                seq = ""
            yield str(raw).strip()
        else:
            # keep only the first whitespace-separated token of the line
            tokens = raw.split()
            if tokens:
                seq += str(tokens[0]).strip()
    if seq:
        yield seq
def _fasta_to_fasta(self, data):
    """Yield alternating headers (without the '>' prefix) and sequences."""
    seq = ""
    for raw in util.read(data):
        if not raw:
            continue
        if raw[0] == '>':
            # flush the previous record's sequence before the new header
            if seq:
                yield seq
                seq = ""
            # drop the leading '>' from the header
            yield str(raw[1:]).strip()
        else:
            # keep only the first whitespace-separated token
            parts = raw.split()
            if parts:
                seq += str(parts[0]).strip()
    if seq:
        yield seq
def _load_data(uri):
    """Load up to n_max space-separated rows from *uri* as (X, y).

    NOTE(review): ``n_max`` is a free variable resolved at module level —
    confirm it is defined where this function lives.
    """
    features = []
    labels = []
    seen = 0
    for line in read(uri):
        seen += 1
        if seen > n_max:
            break
        line = line.strip()
        if not line:
            continue
        fields = line.split(' ')
        # NOTE(review): '& 13' is a bitmask, not modulo — presumably
        # intentional label hashing; verify against the caller.
        labels.append(hash(fields[-1]) & 13)
        features.append([float(v) for v in fields[:-1]])
    return np.array(features), np.array(labels)
def _load_wine():
    """Load the UCI wine dataset as scaled features and encoded labels."""
    print('wine')
    uri = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
    rows = []
    raw_labels = []
    for line in read(uri):
        line = line.strip()
        if not line:
            continue
        fields = line.split(',')
        # first column is the class label, the rest are features
        raw_labels.append(int(fields[0]))
        rows.append([float(v) for v in fields[1:]])
    X = scale(np.array(rows))
    y = np.array(LabelEncoder().fit_transform(raw_labels))
    return X, y
def _load_wdbc():
    """Load the UCI breast-cancer-wisconsin (WDBC) dataset.

    Returns
    -------
    X : numpy array of scaled feature rows (columns 2..end of each record)
    y : numpy array of integer labels encoded from the diagnosis column
    """
    # Consolidate the previously scattered mid-function imports here and
    # drop the unused `normalize` import; kept function-local because the
    # enclosing module's top-level imports are not visible from this block.
    import numpy as np
    from sklearn.preprocessing import scale
    from sklearn.preprocessing import LabelEncoder
    from eden.util import read

    print('breast-cancer-wisconsin')
    uri = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
    M = []
    labels = []
    for line in read(uri):
        line = line.strip()
        if line:
            items = line.split(',')
            # column 1 is the diagnosis label; columns 2+ are features
            labels.append(str(items[1]))
            M.append([float(x) for x in items[2:]])
    X = scale(np.array(M))
    y = np.array(LabelEncoder().fit_transform(labels))
    return X, y
def gspan_to_eden(input=None, options=dict()):
    """
    Takes a string list in the extended gSpan format and yields networkx graphs.

    Parameters
    ----------
    input : string
        A pointer to the data source.
    """
    header = ''
    string_list = []
    for line in util.read(input):
        if line.strip():
            if line[0] in ['g', 't']:
                if string_list:
                    yield gspan_to_networkx(header, string_list)
                string_list = []
                header = line
            else:
                # BUGFIX: only non-header lines belong to the record body.
                # Previously the header line was also appended to
                # string_list, unlike the sibling gSpan parsers, which pass
                # the header separately and exclude it from the body.
                string_list += [line]
    if string_list:
        yield gspan_to_networkx(header, string_list)
def gspan_to_eden(input, options=dict()):
    """Take a string list in the extended gSpan format and yields NetworkX graphs.

    Args:
        input: data source, can be a list of strings, a file name or a url

    Returns:
        NetworkX graph generator

    Raises:
        Exception: if a graph is empty
    """
    header = ''
    block = []
    for row in util.read(input):
        if not row.strip():
            continue
        if row[0] in ('g', 't'):
            # new record marker: emit the accumulated one first
            if block:
                yield gspan_to_networkx(header, block)
            header, block = row, []
        else:
            block.append(row)
    if block:
        yield gspan_to_networkx(header, block)
def _load_ionosphere():
    """Load a 700-row prefix of the UCI ionosphere dataset as (X, y)."""
    print('ionosphere')
    uri = 'http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data'
    n_max = 700
    features = []
    raw_labels = []
    for row_num, line in enumerate(read(uri), start=1):
        if row_num > n_max:
            break
        line = line.strip()
        if not line:
            continue
        fields = line.split(',')
        # last column is the class tag; hash it, LabelEncoder re-maps below
        raw_labels.append(hash(fields[-1]))
        features.append([float(v) for v in fields[:-1]])
    X = (np.array(features))
    y = np.array(LabelEncoder().fit_transform(raw_labels))
    return X, y
def _smiles_to_sdf_cached(mol_smi, cache):
    """Convert one SMILES string to SDF via the obabel CLI, memoized in *cache*."""
    if mol_smi not in cache:
        # convert from SMILES to SDF; obabel is expensive, so cache it
        command_string = 'obabel -:"' + mol_smi + '" -osdf --gen3d'
        args = shlex.split(command_string)
        sdf = subprocess.check_output(args)
        # Assume the incoming string contains only one molecule.
        # Remove warning messages generated by openbabel.
        sdf = '\n'.join([x for x in sdf.split('\n') if 'WARNING' not in x])
        cache[mol_smi] = sdf
    return cache[mol_smi]


def obabel_to_eden3d(iterable, file_format='sdf', cache={}, split_components=True, **kwargs):
    """Take an iterable file and yields the corresponding networkx graphs.

    Parameters
    ----------
    iterable : iterable
        data source of SDF records or SMILES strings (passed to ``read``).
    file_format : string
        'sdf' or 'smi'; anything else raises.
    cache : dict
        SMILES -> SDF memoization; the mutable default is deliberate so
        conversions are shared across calls.
    split_components : bool
        yield each non-empty graph separately when true, otherwise yield a
        single disjoint-union of all graphs.

    **kwargs: arguments to be passed to other methods.
    """
    n_conf = kwargs.get('n_conf', 0)

    # Decomposition: both formats shared identical split/accumulate logic;
    # express each format as a graph generator, then apply the shared
    # split-vs-union policy once below.
    if file_format == 'sdf':
        def _graphs():
            for mol_sdf in read(iterable):
                mol = pybel.readstring("sdf", mol_sdf)
                for molecule in generate_conformers(mol.write("sdf"), n_conf):
                    molecule.removeh()
                    g = obabel_to_networkx3d(molecule, **kwargs)
                    if len(g):
                        yield g
    elif file_format == 'smi':
        def _graphs():
            for mol_smi in read(iterable):
                sdf = _smiles_to_sdf_cached(mol_smi, cache)
                for molecule in generate_conformers(sdf, n_conf):
                    g = obabel_to_networkx3d(molecule, **kwargs)
                    if len(g):
                        yield g
    else:
        raise Exception('ERROR: unrecognized file format: %s' % file_format)

    if split_components:
        # yield every graph separately
        for g in _graphs():
            yield g
    else:
        # construct a global graph and accumulate everything there
        global_graph = nx.Graph()
        for g in _graphs():
            global_graph = nx.disjoint_union(global_graph, g)
        yield global_graph
def obabel_to_eden3d(iterable, file_format='sdf', cache={}, split_components=True, **kwargs):
    """
    Takes an iterable file and yields the corresponding networkx graphs.

    **kwargs: arguments to be passed to other methods.
    """
    n_conf = kwargs.get('n_conf', 0)

    def to_sdf(mol_smi):
        # memoized SMILES -> SDF conversion through the obabel CLI
        if mol_smi not in cache:
            args = shlex.split('obabel -:"' + mol_smi + '" -osdf --gen3d')
            raw = subprocess.check_output(args)
            # drop warning lines emitted by openbabel
            cache[mol_smi] = '\n'.join(
                [x for x in raw.split('\n') if 'WARNING' not in x])
        return cache[mol_smi]

    if file_format == 'sdf':
        def molecule_graphs():
            for record in read(iterable):
                parsed = pybel.readstring("sdf", record)
                for conformer in generate_conformers(parsed.write("sdf"), n_conf):
                    conformer.removeh()
                    g = obabel_to_networkx3d(conformer, **kwargs)
                    if len(g):
                        yield g
    elif file_format == 'smi':
        def molecule_graphs():
            for smi in read(iterable):
                for conformer in generate_conformers(to_sdf(smi), n_conf):
                    g = obabel_to_networkx3d(conformer, **kwargs)
                    if len(g):
                        yield g
    else:
        raise Exception('ERROR: unrecognized file format: %s' % file_format)

    if split_components:
        # one graph per conformer
        for g in molecule_graphs():
            yield g
    else:
        # merge every conformer graph into one disjoint union
        union = nx.Graph()
        for g in molecule_graphs():
            union = nx.disjoint_union(union, g)
        yield union