def filter_and_write(mol2_files, ids, output_dir, includelist_filter, verbose):
    """Write a filtered copy of each multi-mol2 file into ``output_dir``.

    Every input file is scanned with ``split_multimol2`` and the selected
    molecules are written to a file of the same base name inside
    ``output_dir``. Output paths ending in ``.gz`` are written through
    ``gzip.open`` in binary mode; in that case molecule IDs are decoded from
    UTF-8 bytes before the membership test and the molecule lines are joined
    as bytes.

    Parameters
    ----------
    mol2_files : iterable of str
        Paths of the multi-mol2 files to filter.
    ids : container of str
        Molecule IDs to test against (only ``in`` is used).
    output_dir : str
        Directory receiving the filtered files; created if missing.
    includelist_filter : bool
        If true, keep molecules whose ID is in ``ids``; otherwise keep the
        molecules whose ID is *not* in ``ids``.
    verbose : bool
        If true, print progress and throughput to stdout for each file.
    """
    keep_listed = bool(includelist_filter)

    for mol2_file in mol2_files:
        if verbose:
            sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
            sys.stdout.flush()

        # exist_ok avoids the check-then-create race of exists() + mkdir()
        # and also creates missing parent directories.
        os.makedirs(output_dir, exist_ok=True)
        mol2_outpath = os.path.join(output_dir, os.path.basename(mol2_file))

        is_gzipped = mol2_outpath.endswith('.gz')
        if is_gzipped:
            open_file, write_mode, joiner = gzip.open, 'wb', b''
        else:
            open_file, write_mode, joiner = open, 'w', ''

        # Counted explicitly so that a file without molecules reports 0
        # instead of failing on an unbound loop variable.
        n_molecules = 0

        with open_file(mol2_outpath, write_mode) as f:
            start = time.time()
            for mol2 in split_multimol2(mol2_file):
                n_molecules += 1
                mol2_id = mol2[0]
                if is_gzipped:
                    mol2_id = mol2_id.decode('utf-8')
                if (mol2_id in ids) == keep_listed:
                    f.write(joiner.join(mol2[1]))

        if verbose:
            elapsed = time.time() - start
            # A coarse clock can report zero elapsed time for tiny files;
            # the rate is then not measurable, so report 0 rather than crash.
            rate = n_molecules / elapsed if elapsed > 0 else 0
            sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                             (n_molecules, rate))
            sys.stdout.flush()
def test_split_multimol2_gz():
    """Check that the gzipped 40-molecule fixture splits into 40 records.

    The molecule codes coming out of the gzipped file are bytes, so the
    second code is decoded before being compared.
    """
    gz_path = os.path.join(this_dir, 'data', '40_mol2_files.mol2.gz')
    codes = [record[0] for record in split_multimol2(gz_path)]

    assert codes[1].decode() == 'ZINC04084113'
    assert len(codes) == 40
def run_rocs(source_file, target_file, n_processes, settings):
    """Run ``EXECUTABLE`` with ``QUERY_FILE`` as reference on ``source_file``.

    Parameters
    ----------
    source_file : str
        Database file passed via ``-dbase``.
    target_file : str
        Output path; the text before its last ``.mol2`` becomes ``-prefix``.
    n_processes : int
        Value passed via ``-mpi_np``.
    settings : str or None
        Extra whitespace-separated command-line arguments appended verbatim.

    Raises
    ------
    ValueError
        If ``QUERY_FILE`` yields no molecules.
    """
    # Pieces between '.mol2' occurrences are concatenated and the part after
    # the last occurrence is dropped; a name without '.mol2' gives ''.
    prefix = ''.join(target_file.split('.mol2')[:-1])

    sys.stdout.write('Processing %s\n' % os.path.basename(source_file))
    sys.stdout.flush()

    # Only "one" versus "more than one" matters, so stop at the second
    # molecule instead of scanning the whole query file.
    n_query = 0
    for _ in split_multimol2(QUERY_FILE):
        n_query += 1
        if n_query > 1:
            break
    if not n_query:
        raise ValueError('Query file %s contains no molecules' % QUERY_FILE)
    mcquery = 'true' if n_query > 1 else 'false'

    cmd = [EXECUTABLE,
           '-ref', QUERY_FILE,
           '-dbase', source_file,
           '-outputquery', 'false',
           '-prefix', prefix,
           '-mcquery', mcquery,
           '-mpi_np', str(n_processes),
           '-oformat', 'mol2']

    if settings:
        # str.split() without arguments already strips whitespace and never
        # yields empty strings.
        cmd.extend(settings.split())

    # The output is never read, so send it to DEVNULL: an unread PIPE blocks
    # the child once the OS pipe buffer is full.
    subprocess.call(cmd, stdout=subprocess.DEVNULL)
def test_read_mol2_from_list():
    """Build a PandasMol2 from the first record of the 40-molecule fixture.

    The record's lines and code are passed to ``read_mol2_from_list`` and
    the resulting frame shape and molecule code are checked.
    """
    fixture = os.path.join(this_dir, 'data', '40_mol2_files.mol2')
    first_record = next(split_multimol2(fixture))
    code = first_record[0]
    lines = first_record[1]

    parsed = PandasMol2().read_mol2_from_list(mol2_lines=lines,
                                              mol2_code=code)

    assert parsed.df.shape == (65, 9)
    assert parsed.code == 'ZINC38611810'
def check_query(query_path):
    """Validate that a multi-record query file carries a single molecule ID.

    A query with zero or one record is accepted as is. With several records,
    all of them must share the same ID label.

    Raises
    ------
    ValueError
        If the query holds more than one record and more than one distinct
        ID label.
    """
    labels = []
    for record in split_multimol2(query_path):
        labels.append(record[0])

    total = len(labels)
    if total <= 1:
        return

    distinct = len(set(labels))
    if distinct <= 1:
        return

    raise ValueError('Please Make sure that you only submit one'
                     ' molecule or, if you submit a multi-conformer'
                     ' query, that conformers of the molecule'
                     ' have all the same molecule ID labels.'
                     ' Found %d molecules and %d unique labels'
                     % (total, distinct))
def mol2_to_idfile(mol2_files, id_file_path, verbose=0):
    """Write the molecule ID of every record in ``mol2_files`` to a text file.

    One ID is written per line, in the order the records are produced by
    ``split_multimol2``. IDs that arrive as bytes (as they do for gzipped
    inputs) are decoded as UTF-8 before being written.

    Parameters
    ----------
    mol2_files : iterable of str
        Paths of the multi-mol2 files to scan.
    id_file_path : str
        Path of the ID file to create (overwritten if it exists).
    verbose : int or bool, default 0
        If true, print progress and throughput to stdout for each file.
    """
    with open(id_file_path, 'w') as f:
        for mol2_file in mol2_files:
            if verbose:
                sys.stdout.write('Processing %s' %
                                 os.path.basename(mol2_file))
                sys.stdout.flush()
            start = time.time()

            # Counted per file so that a file without molecules reports 0
            # instead of reusing the previous file's loop index.
            n_molecules = 0
            for mol2 in split_multimol2(mol2_file):
                mol2_id = mol2[0]
                if isinstance(mol2_id, bytes):
                    # The ID file is opened in text mode.
                    mol2_id = mol2_id.decode('utf-8')
                f.write(mol2_id + '\n')
                n_molecules += 1

            if verbose:
                elapsed = time.time() - start
                # A coarse clock can report zero elapsed time; the rate is
                # then not measurable, so report 0 rather than crash.
                rate = n_molecules / elapsed if elapsed > 0 else 0
                sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                                 (n_molecules, rate))
                sys.stdout.flush()
def read_mol2(self, path, columns=None):
    """Load the first record of the mol2 file at ``path`` into this object.

    Only the first (code, lines) pair produced by ``split_multimol2`` is
    used. After loading, ``path`` is stored in ``self.mol2_path``.

    Parameters
    ----------
    path : str
        Location of the mol2 file.
    columns : optional
        Forwarded unchanged to ``self._load_mol2``.

    Returns
    -------
    self
    """
    first_record = next(split_multimol2(path))
    code, lines = first_record
    self._load_mol2(lines, code, columns)
    self.mol2_path = path
    return self
def run_code():
    """Request the first record of ``'40_mol2_files.pdb'``.

    NOTE(review): presumably used to trigger split_multimol2's handling of
    a non-mol2 file name -- confirm against the caller.
    """
    records = split_multimol2('40_mol2_files.pdb')
    next(records)