def smart_load_dictionary(name=None, file_path=None, url=None,
                          registry_location=cifdic_register_url,
                          save_local=False, store_dir=None):
  """
  Load a CIF dictionary, trying (in order) an explicit path, a locally
  stored copy (optionally gzipped), and finally an HTTP download.

  :param name: dictionary name, or a path to an existing file
  :param file_path: explicit path to the dictionary file
  :param url: explicit download URL
  :param registry_location: registry used to locate the dictionary by name
  :param save_local: if True, keep a decompressed/downloaded copy in store_dir
  :param store_dir: local cache directory (defaults to iotbx cif/dictionaries)
  :returns: a dictionary object built from the parsed CIF model
  """
  from iotbx import cif
  # at least one of name/file_path/url must be given
  assert [name, file_path, url].count(None) < 3
  cif_dic = None
  if store_dir is None:
    store_dir = libtbx.env.under_dist(
      module_name='iotbx', path='cif/dictionaries')
  if name is not None and [file_path, url].count(None) == 2:
    # only a name was given: resolve it to a local file if possible
    if file_path is None:
      if os.path.isfile(name):
        file_path = name
      else:
        file_path = os.path.join(store_dir, name)
        if not os.path.isfile(file_path):
          # fall back to a gzipped copy in the store directory
          gzip_path = file_path + '.gz'
          if os.path.isfile(gzip_path):
            if save_local:
              # decompress once and keep the plain copy for next time
              gz = smart_open.for_reading(gzip_path)
              f = smart_open.for_writing(file_path)
              shutil.copyfileobj(gz, f)
              gz.close()
              f.close()
            else:
              file_path = gzip_path
  if file_path is not None and os.path.isfile(file_path):
    file_object = smart_open.for_reading(file_path)
    cif_dic = dictionary(cif.reader(file_object=file_object).model())
    file_object.close()
  else:
    # no usable local file: download from the registry or the given URL
    if url is None:
      url = locate_dictionary(name, registry_location=registry_location)
    file_object = urlopen(url)
    if save_local:
      if name is None:
        name = os.path.basename(url)
      f = open(os.path.join(store_dir, name), 'wb')
      shutil.copyfileobj(file_object, f)
      f.close()
      cif_dic = dictionary(
        cif.reader(file_path=os.path.join(store_dir, name)).model())
    else:
      cif_dic = dictionary(cif.reader(file_object=file_object).model())
  assert cif_dic is not None
  return cif_dic
def __init__(self, file_path=None, file_object=None, input_string=None,
             cif_object=None, builder=None, raise_if_errors=True,
             strict=True):
  """
  Parse CIF input from exactly one of file_path / file_object /
  input_string, feeding the result into the given (or a default) builder.

  :param cif_object: optional existing model passed to the default builder;
      must be None when a custom builder is supplied
  :param raise_if_errors: raise CifParserError on the first lexer or
      parser error instead of leaving them on the parser object
  :param strict: passed through to the C++ fast_reader
  :raises CifParserError: on binary input or (optionally) parse errors
  """
  # exactly one input source must be provided
  assert [file_path, file_object, input_string].count(None) == 2
  self.file_path = file_path
  if builder is None:
    builder = builders.cif_model_builder(cif_object)
  else:
    assert cif_object is None
  self.builder = builder
  if file_path is not None:
    file_object = smart_open.for_reading(file_path)
  else:
    # label used by the parser for error messages when no file is involved
    file_path = "memory"
  if file_object is not None:
    input_string = file_object.read()
  # check input_string for binary, and abort if necessary
  binary_detector = detect_binary_file()
  binary_detector.monitor_initial = min(
    len(input_string), binary_detector.monitor_initial)
  if binary_detector.is_binary_file(block=input_string):
    raise CifParserError("Binary file detected, aborting parsing.")
  self.parser = ext.fast_reader(builder, input_string, file_path, strict)
  if raise_if_errors and len(self.parser.lexer_errors()):
    raise CifParserError(self.parser.lexer_errors()[0])
  if raise_if_errors and len(self.parser.parser_errors()):
    raise CifParserError(self.parser.parser_errors()[0])
def __init__(self, file_path=None, file_object=None, input_string=None,
             cif_object=None, builder=None, raise_if_errors=True,
             strict=True):
  """
  Parse CIF input from exactly one of file_path / file_object /
  input_string, feeding the result into the given (or a default) builder.

  :param cif_object: optional existing model passed to the default builder;
      must be None when a custom builder is supplied
  :param raise_if_errors: raise CifParserError on the first lexer or
      parser error instead of leaving them on the parser object
  :param strict: passed through to the C++ fast_reader
  :raises CifParserError: on binary input or (optionally) parse errors
  """
  # exactly one input source must be provided
  assert [file_path, file_object, input_string].count(None) == 2
  self.file_path = file_path
  if builder is None:
    builder = builders.cif_model_builder(cif_object)
  else:
    assert cif_object is None
  self.builder = builder
  if file_path is not None:
    file_object = smart_open.for_reading(file_path)
  else:
    # label used by the parser for error messages when no file is involved
    file_path = "memory"
  if file_object is not None:
    input_string = file_object.read()
  # check input_string for binary, and abort if necessary
  binary_detector = detect_binary_file()
  binary_detector.monitor_initial = min(
    len(input_string), binary_detector.monitor_initial)
  if binary_detector.is_binary_file(block=input_string):
    raise CifParserError("Binary file detected, aborting parsing.")
  self.parser = ext.fast_reader(builder, input_string, file_path, strict)
  if raise_if_errors and len(self.parser.lexer_errors()):
    raise CifParserError(self.parser.lexer_errors()[0])
  if raise_if_errors and len(self.parser.parser_errors()):
    raise CifParserError(self.parser.parser_errors()[0])
def __init__(self, file_object=None, file_name=None):
  """
  Initialize the reader from exactly one of an open file-like object or a
  file name; the whole input is read and split into lines for the base
  class constructor.
  """
  # exactly one input source must be provided
  assert [file_object, file_name].count(None) == 1
  if (file_object is None):
    from libtbx import smart_open
    file_object = smart_open.for_reading(file_name=file_name)
  from cctbx.array_family import flex
  super(reader, self).__init__(lines=flex.split_lines(file_object.read()))
def smart_load_dictionary(name=None, file_path=None, url=None,
                          registry_location=cifdic_register_url,
                          save_local=False, store_dir=None):
  """
  Load a CIF dictionary, trying (in order) an explicit path, a locally
  stored copy (optionally gzipped), and finally an HTTP download.

  :param name: dictionary name, or a path to an existing file
  :param file_path: explicit path to the dictionary file
  :param url: explicit download URL
  :param registry_location: registry used to locate the dictionary by name
  :param save_local: if True, keep a decompressed/downloaded copy in store_dir
  :param store_dir: local cache directory (defaults to iotbx cif/dictionaries)
  :returns: a dictionary object built from the parsed CIF model
  """
  from iotbx import cif
  # at least one of name/file_path/url must be given
  assert [name, file_path, url].count(None) < 3
  cif_dic = None
  if store_dir is None:
    store_dir = libtbx.env.under_dist(
      module_name='iotbx', path='cif/dictionaries')
  if name is not None and [file_path, url].count(None) == 2:
    # only a name was given: resolve it to a local file if possible
    if file_path is None:
      if os.path.isfile(name):
        file_path = name
      else:
        file_path = os.path.join(store_dir, name)
        if not os.path.isfile(file_path):
          # fall back to a gzipped copy in the store directory
          gzip_path = file_path + '.gz'
          if os.path.isfile(gzip_path):
            if save_local:
              # decompress once and keep the plain copy for next time
              gz = smart_open.for_reading(gzip_path)
              f = smart_open.for_writing(file_path)
              shutil.copyfileobj(gz, f)
              gz.close()
              f.close()
            else:
              file_path = gzip_path
  if file_path is not None and os.path.isfile(file_path):
    file_object = smart_open.for_reading(file_path)
    cif_dic = dictionary(cif.reader(file_object=file_object).model())
    file_object.close()
  else:
    # no usable local file: download from the registry or the given URL
    if url is None:
      url = locate_dictionary(name, registry_location=registry_location)
    file_object = urlopen(url)
    if save_local:
      if name is None:
        name = os.path.basename(url)
      f = open(os.path.join(store_dir, name), 'wb')
      shutil.copyfileobj(file_object, f)
      f.close()
      cif_dic = dictionary(cif.reader(
        file_path=os.path.join(store_dir, name)).model())
    else:
      cif_dic = dictionary(cif.reader(
        file_object=file_object).model())
  assert cif_dic is not None
  return cif_dic
def from_msgpack_file(filename):
  """Deserialize a reflection table from a msgpack-encoded file.

  :param filename: path of the msgpack file to read
  :return: the decoded reflection table
  """
  from libtbx import smart_open
  with smart_open.for_reading(filename, 'rb') as source:
    payload = source.read()
    return reflection_table.from_msgpack(payload)
def __init__(self):
  """
  Loads pickle with data. Path is temporary in current work dir.
  Should be centralized somewhere else upon going to production.
  """
  db_dict = {}
  pdb_info_file = libtbx.env.find_in_repositories(
    relative_path="cctbx_project/iotbx/bioinformatics/pdb_info.csv.gz",
    test=os.path.isfile)
  csv_file = smart_open.for_reading(file_name=pdb_info_file)
  try:
    csv_reader = csv.reader(csv_file, delimiter=";")
    # first column is the key; the next five columns are stored as a tuple
    for row in csv_reader:
      db_dict[row[0]] = (row[1], row[2], row[3], row[4], row[5])
  finally:
    # fix: the file handle was previously never closed (resource leak)
    csv_file.close()
  self.db_dict = db_dict
def extract_remark_2_and_3_records(file_name, file_lines=None):
  """
  Collect the REMARK 2 and REMARK 3 records from a PDB file.

  :param file_name: path of the PDB file; must be None when file_lines is given
  :param file_lines: optional pre-read list of lines (then file_name is None)
  :returns: list of matching records in file order; scanning stops at the
      first ATOM/HETATM record
  """
  result = []
  if (file_lines is None):
    # read the file ourselves; close the handle instead of leaking it
    f = smart_open.for_reading(file_name=file_name)
    try:
      file_lines = f.read().splitlines()
    finally:
      f.close()
  else:
    assert (file_name is None)
  for rec in file_lines:
    # NOTE(review): prefix spacing kept exactly as found -- confirm it
    # matches the intended PDB record column layout.
    if (rec.startswith("REMARK 3 ") or rec.startswith("REMARK 2 ")):
      result.append(rec)
    else:
      if (rec.startswith("ATOM ") or rec.startswith("HETATM ")):
        break
  return result
def extract_remark_2_and_3_records(file_name, file_lines=None):
  """
  Collect the REMARK 2 and REMARK 3 records from a PDB file.

  :param file_name: path of the PDB file; must be None when file_lines is given
  :param file_lines: optional pre-read list of lines (then file_name is None)
  :returns: list of matching records in file order; scanning stops at the
      first ATOM/HETATM record
  """
  result = []
  if (file_lines is None):
    # read the file ourselves; close the handle instead of leaking it
    f = smart_open.for_reading(file_name=file_name)
    try:
      file_lines = f.read().splitlines()
    finally:
      f.close()
  else:
    assert (file_name is None)
  for rec in file_lines:
    # NOTE(review): prefix spacing kept exactly as found -- confirm it
    # matches the intended PDB record column layout.
    if (rec.startswith("REMARK 3 ") or rec.startswith("REMARK 2 ")):
      result.append(rec)
    else:
      if (rec.startswith("ATOM ") or rec.startswith("HETATM ")):
        break
  return result
def from_pickle(filename):
  '''
  Load a reflection table that was serialized with pickle.

  :param filename: path of the pickle file to read
  :return: the unpickled reflection table
  '''
  import six.moves.cPickle as pickle
  from libtbx import smart_open
  with smart_open.for_reading(filename, 'rb') as infile:
    table = pickle.load(infile)
    # guard against loading some other pickled object by mistake
    assert (isinstance(table, reflection_table))
    return table
def from_pickle(filename):
  '''
  Read the reflection table from pickle file.

  :param filename: The pickle filename
  :return: The reflection table
  '''
  # fix: `import cPickle` is Python-2-only; fall back to the merged
  # `pickle` module so this also works under Python 3
  try:
    import cPickle as pickle
  except ImportError:
    import pickle
  from libtbx import smart_open
  with smart_open.for_reading(filename, 'rb') as infile:
    result = pickle.load(infile)
    # guard against loading some other pickled object by mistake
    assert(isinstance(result, reflection_table))
    return result
def run(args):
  # For each CIF file on the command line: extract its Miller arrays, print
  # a summary of each, and dump them to <basename>_miller_arrays.pickle.
  # (Python 2 syntax throughout.)
  for f in args:
    try:
      file_object = smart_open.for_reading(file_name=f)
      miller_arrays = iotbx.cif.reader(file_object=file_object).as_miller_arrays()
    except KeyboardInterrupt:
      raise
    except Exception, e:
      # best-effort: report the failure and continue with the next file
      print "Error extracting miller arrays from file: %s:" % (
        show_string(f))
      print " ", str(e)
      continue
    for miller_array in miller_arrays:
      miller_array.show_comprehensive_summary()
      print
    r, _ = op.splitext(op.basename(f))
    easy_pickle.dump(file_name=r+'_miller_arrays.pickle', obj=miller_arrays)
def run(): data_dir = '/net/cci/youval/Work/work/MTRIX/Data' #data_dir = r'c:\Phenix\Dev\Work\work\MTRIX\Data' os.chdir(data_dir) file_to_year_dict = {} files_with_good_MTRIX = set(pickle.load(open(os.path.join(data_dir,'files_with_good_MTRIX'),'r'))) good_MTRIX_pdb_files = pickle.load(open(os.path.join(data_dir,'dict_good_MTRIX_pdb_files'),'r')) # find the file in LBL pdb mirror folder for fn in files_with_good_MTRIX: file_name_with_path = good_MTRIX_pdb_files[fn] file_lines = smart_open.for_reading( file_name = file_name_with_path).read().splitlines() year = get_year(file_lines) file_to_year_dict[fn] = year print len(file_to_year_dict)
def extract_from(file_name=None, file=None, monitor_initial=None):
  """
  Scan a PDB-format stream line by line and return the first crystal
  symmetry found, either from a CRYST1 record or from a CNS-style REMARK.

  :param file_name: path to open (mutually exclusive with file)
  :param file: already-open file-like object (mutually exclusive with
      file_name)
  :param monitor_initial: forwarded to the binary-content detector
  :raises RuntimeError: if the input looks binary or no symmetry is found
  """
  assert [file_name, file].count(None) == 1
  if (file is None):
    file = smart_open.for_reading(file_name=file_name)
  detect_binary = detect_binary_file(monitor_initial=monitor_initial)
  # fix: removed the dead `line_number` counter (incremented, never read)
  for line in file:
    if (detect_binary is not None):
      is_binary = detect_binary.is_binary_file(block=line)
      if (is_binary is not None):
        # detector reached a verdict: bail out on binary input, otherwise
        # skip further (now redundant) binary checks
        if (is_binary): break
        detect_binary = None
    if (line.startswith("CRYST1")):
      return cryst1_interpretation.crystal_symmetry(cryst1_record=line)
    crystal_symmetry = cns_pdb_remarks.extract_symmetry(pdb_record=line)
    if (crystal_symmetry is not None):
      return crystal_symmetry
  raise RuntimeError("No CRYST1 record.")
def fetch(id, data_type="pdb", format="pdb", mirror="rcsb", log=None,
    force_download=False, local_cache=None):
  """
  Locate and open a data file for the specified PDB ID and format, either in a
  local mirror or online.

  :param id: 4-character PDB ID (e.g. '1hbb')
  :param data_type: type of content to download: pdb, xray, or fasta
  :param format: format of data: cif, pdb, or xml
  :param mirror: remote site to use, either rcsb, pdbe, pdbj or pdb-redo
  :returns: a filehandle-like object (with read() method)
  """
  assert data_type in ["pdb", "xray", "fasta", "seq"]
  assert format in ["cif", "pdb", "xml"]
  assert mirror in ["rcsb", "pdbe", "pdbj", "pdb-redo"]
  validate_pdb_id(id)
  if (log is None): log = null_out()
  id = id.lower()
  if (not force_download):
    # 1) look for a matching file in the user-supplied cache directory
    if (local_cache is not None) and (data_type == "pdb"):
      from iotbx.file_reader import guess_file_type
      if (local_cache is Auto):
        local_cache = os.getcwd()
      cache_files = os.listdir(local_cache)
      for file_name in cache_files:
        if (len(file_name) > 4):
          # strip an optional "pdb" prefix before comparing the 4-char ID
          file_id = re.sub("^pdb", "", file_name)[0:4]
          if (file_id.lower() == id):
            if (guess_file_type(file_name) == "pdb"):
              file_name = os.path.join(local_cache, file_name)
              print >> log, "Reading from cache directory:"
              print >> log, " " + file_name
              f = smart_open.for_reading(file_name)
              return f
    # try local mirror for PDB and X-ray data files first, if it exists
    if (data_type == "pdb") and (format == "pdb") and \
        ("PDB_MIRROR_PDB" in os.environ):
      subdir = os.path.join(os.environ["PDB_MIRROR_PDB"], id[1:3])
      if (os.path.isdir(subdir)):
        file_name = os.path.join(subdir, "pdb%s.ent.gz" % id)
        if (os.path.isfile(file_name)):
          print >> log, "Reading from local mirror:"
          print >> log, " " + file_name
          f = smart_open.for_reading(file_name)
          return f
    if (data_type == "pdb") and (format == "cif") and \
        ("PDB_MIRROR_MMCIF" in os.environ):
      subdir = os.path.join(os.environ["PDB_MIRROR_MMCIF"], id[1:3])
      if (os.path.isdir(subdir)):
        file_name = os.path.join(subdir, "%s.cif.gz" % id)
        if (os.path.isfile(file_name)):
          print >> log, "Reading from local mirror:"
          print >> log, " " + file_name
          f = smart_open.for_reading(file_name)
          return f
    if ((data_type == "xray") and
        ("PDB_MIRROR_STRUCTURE_FACTORS" in os.environ)):
      sf_dir = os.environ["PDB_MIRROR_STRUCTURE_FACTORS"]
      subdir = os.path.join(sf_dir, id[1:3])
      if (os.path.isdir(subdir)):
        file_name = os.path.join(subdir, "r%ssf.ent.gz" % id)
        if (os.path.isfile(file_name)):
          print >> log, "Reading from local mirror:"
          print >> log, " " + file_name
          f = smart_open.for_reading(file_name)
          return f
  # No mirror found (or out of date), default to HTTP download
  url = None
  compressed = False
  # per-mirror URL components; pdbj/pdb-redo build the full URL directly
  if (mirror == "rcsb"):
    url_base = 'https://files.rcsb.org/download/'
    pdb_ext = ".pdb"
    sf_prefix = ""
    sf_ext = "-sf.cif"
  elif (mirror == "pdbe"):
    url_base = "https://www.ebi.ac.uk/pdbe-srv/view/files/"
    pdb_ext = ".ent"
    sf_prefix = "r"
    sf_ext = "sf.ent"
  elif (mirror == "pdbj"):
    url_base = "ftp://ftp.pdbj.org/pub/pdb/data/structures/divided/"
    if (data_type == "pdb"):
      compressed = True
      if (format == "pdb"):
        url = url_base + "pdb/%s/pdb%s.ent.gz" % (id[1:3], id)
      elif (format == "cif"):
        url = url_base + "mmCIF/%s/%s.cif.gz" % (id[1:3], id)
    elif (data_type == "xray"):
      compressed = True
      url = url_base + "structure_factors/%s/r%ssf.ent.gz" % (id[1:3], id)
    elif (data_type in ["fasta", "seq"]):
      url = "https://pdbj.org/rest/downloadPDBfile?format=fasta&id=%s" % id
    if (url is None) and (data_type != "fasta"):
      raise Sorry(
        "Can't determine PDBj download URL for this data/format " +
        "combination.")
  elif mirror == "pdb-redo":
    url_base = "https://pdb-redo.eu/db/"
    pdb_ext = "_final.pdb"
    cif_ext = "_final.cif"
    sf_prefix = ""
    sf_ext = "_final.mtz"
    if (data_type == 'pdb'):
      if (format == 'pdb'):
        url = url_base + "{id}/{id}{format}".format(id=id, format=pdb_ext)
      elif (format == 'cif'):
        url = url_base + "{id}/{id}{format}".format(id=id, format=cif_ext)
    elif (data_type == 'xray'):
      url = url_base + "{id}/{id}{format}".format(id=id, format=sf_ext)
  if (data_type in ["fasta", "seq"]):
    # XXX the RCSB doesn't appear to have a simple URL for FASTA files
    if (url is None): # TODO PDBe equivalent doesn't exist?
      url = "https://www.rcsb.org/pdb/download/downloadFastaFiles.do?structureIdList=%s&compressionType=uncompressed" % id
    try:
      data = libtbx.utils.urlopen(url)
    except urllib2.HTTPError, e:
      if e.getcode() == 404:
        raise RuntimeError("Couldn't download sequence for %s." % id)
      else:
        raise
def fetch (id, data_type="pdb", format="pdb", mirror="rcsb", log=None,
    force_download=False, local_cache=None) :
  """
  Locate and open a data file for the specified PDB ID and format, either in a
  local mirror or online.

  :param id: 4-character PDB ID (e.g. '1hbb')
  :param data_type: type of content to download: pdb, xray, or fasta
  :param format: format of data: cif, pdb, or xml
  :param mirror: remote site to use, either rcsb or pdbe
  :returns: a filehandle-like object (with read() method)
  """
  assert data_type in ["pdb", "xray", "fasta", "seq"]
  assert format in ["cif", "pdb", "xml"]
  assert mirror in ["rcsb", "pdbe", "pdbj"]
  validate_pdb_id(id)
  if (log is None) : log = null_out()
  id = id.lower()
  if (not force_download) :
    # 1) look for a matching file in the user-supplied cache directory
    if (local_cache is not None) and (data_type == "pdb") :
      from iotbx.file_reader import guess_file_type
      if (local_cache is Auto) :
        local_cache = os.getcwd()
      cache_files = os.listdir(local_cache)
      for file_name in cache_files :
        if (len(file_name) > 4) :
          # strip an optional "pdb" prefix before comparing the 4-char ID
          file_id = re.sub("^pdb", "", file_name)[0:4]
          if (file_id.lower() == id) :
            if (guess_file_type(file_name) == "pdb") :
              file_name = os.path.join(local_cache, file_name)
              print >> log, "Reading from cache directory:"
              print >> log, " " + file_name
              f = smart_open.for_reading(file_name)
              return f
    # try local mirror for PDB and X-ray data files first, if it exists
    if (data_type == "pdb") and ("PDB_MIRROR_PDB" in os.environ) :
      subdir = os.path.join(os.environ["PDB_MIRROR_PDB"], id[1:3])
      if (os.path.isdir(subdir)) :
        file_name = os.path.join(subdir, "pdb%s.ent.gz" % id)
        if (os.path.isfile(file_name)) :
          print >> log, "Reading from local mirror:"
          print >> log, " " + file_name
          f = smart_open.for_reading(file_name)
          return f
    if ((data_type == "xray") and
        ("PDB_MIRROR_STRUCTURE_FACTORS" in os.environ)) :
      sf_dir = os.environ["PDB_MIRROR_STRUCTURE_FACTORS"]
      subdir = os.path.join(sf_dir, id[1:3])
      if (os.path.isdir(subdir)) :
        file_name = os.path.join(subdir, "r%ssf.ent.gz" % id)
        if (os.path.isfile(file_name)) :
          print >> log, "Reading from local mirror:"
          print >> log, " " + file_name
          f = smart_open.for_reading(file_name)
          return f
  # No mirror found (or out of date), default to HTTP download
  url = None
  compressed = False
  # per-mirror URL components; pdbj builds the full URL directly
  if (mirror == "rcsb") :
    url_base = "http://www.rcsb.org/pdb/files/"
    pdb_ext = ".pdb"
    sf_prefix = ""
    sf_ext = "-sf.cif"
  elif (mirror == "pdbe") :
    url_base = "http://www.ebi.ac.uk/pdbe-srv/view/files/"
    pdb_ext = ".ent"
    sf_prefix = "r"
    sf_ext = "sf.ent"
  elif (mirror == "pdbj") :
    url_base = "ftp://ftp.pdbj.org/pub/pdb/data/structures/divided/"
    if (data_type == "pdb") :
      compressed = True
      if (format == "pdb") :
        url = url_base + "pdb/%s/pdb%s.ent.gz" % (id[1:3], id)
      elif (format == "cif") :
        url = url_base + "mmCIF/%s/%s.cif.gz" % (id[1:3], id)
    elif (data_type == "xray") :
      compressed = True
      url = url_base + "structure_factors/%s/r%ssf.ent.gz" % (id[1:3], id)
    elif (data_type in ["fasta", "seq"]) :
      url = "http://pdbj.org/app//downloadFasta4PDBID?pdbid=%s" % id
    if (url is None) and (data_type != "fasta") :
      raise Sorry("Can't determine PDBj download URL for this data/format "+
        "combination.")
  if (data_type in ["fasta", "seq"]) :
    # XXX the RCSB doesn't appear to have a simple URL for FASTA files
    if (url is None) : # TODO PDBe equivalent doesn't exist?
      url = "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=FASTA&compression=NO&structureId=%s" % id
    try :
      data = libtbx.utils.urlopen(url)
    except urllib2.HTTPError, e :
      if e.getcode() == 404 :
        raise RuntimeError("Couldn't download sequence for %s." % id)
      else :
        raise
def fetch(id, data_type="pdb", format="pdb", mirror="rcsb", log=None,
    force_download=False, local_cache=None):
  """
  Locate and open a data file for the specified PDB ID and format, either in a
  local mirror or online.

  :param id: 4-character PDB ID (e.g. '1hbb')
  :param data_type: type of content to download: pdb, xray, or fasta
  :param format: format of data: cif, pdb, or xml (or cif_or_pdb)
  :param mirror: remote site to use, either rcsb, pdbe, pdbj or pdb-redo
  :returns: a filehandle-like object (with read() method)
  """
  assert data_type in ["pdb", "xray", "fasta", "seq"]
  assert format in ["cif", "pdb", "xml", "cif_or_pdb"]
  assert mirror in ["rcsb", "pdbe", "pdbj", "pdb-redo"]
  validate_pdb_id(id)
  if (log is None): log = null_out()
  id = id.lower()
  if (not force_download):
    # 1) look for a matching file in the user-supplied cache directory
    if (local_cache is not None) and (data_type == "pdb"):
      from iotbx.file_reader import guess_file_type
      if (local_cache is Auto):
        local_cache = os.getcwd()
      cache_files = os.listdir(local_cache)
      for file_name in cache_files:
        if (len(file_name) > 4):
          # strip an optional "pdb" prefix before comparing the 4-char ID
          file_id = re.sub("^pdb", "", file_name)[0:4]
          if (file_id.lower() == id):
            if (guess_file_type(file_name) == "pdb"):
              file_name = os.path.join(local_cache, file_name)
              print("Reading from cache directory:", file=log)
              print(" " + file_name, file=log)
              f = smart_open.for_reading(file_name)
              return f
    # try local mirror for PDB and X-ray data files first, if it exists
    if (data_type == "pdb") and (format in ["pdb", "cif_or_pdb"]) and \
        ("PDB_MIRROR_PDB" in os.environ):
      subdir = os.path.join(os.environ["PDB_MIRROR_PDB"], id[1:3])
      if (os.path.isdir(subdir)):
        file_name = os.path.join(subdir, "pdb%s.ent.gz" % id)
        if (os.path.isfile(file_name)):
          print("Reading from local mirror:", file=log)
          print(" " + file_name, file=log)
          f = smart_open.for_reading(file_name)
          return f
    if (data_type == "pdb") and (format in ["cif", "cif_or_pdb"]) and \
        ("PDB_MIRROR_MMCIF" in os.environ):
      subdir = os.path.join(os.environ["PDB_MIRROR_MMCIF"], id[1:3])
      if (os.path.isdir(subdir)):
        file_name = os.path.join(subdir, "%s.cif.gz" % id)
        if (os.path.isfile(file_name)):
          print("Reading from local mirror:", file=log)
          print(" " + file_name, file=log)
          f = smart_open.for_reading(file_name)
          return f
    if ((data_type == "xray") and
        ("PDB_MIRROR_STRUCTURE_FACTORS" in os.environ)):
      sf_dir = os.environ["PDB_MIRROR_STRUCTURE_FACTORS"]
      subdir = os.path.join(sf_dir, id[1:3])
      if (os.path.isdir(subdir)):
        file_name = os.path.join(subdir, "r%ssf.ent.gz" % id)
        if (os.path.isfile(file_name)):
          print("Reading from local mirror:", file=log)
          print(" " + file_name, file=log)
          f = smart_open.for_reading(file_name)
          return f
  # No mirror found (or out of date), default to HTTP download
  url = None
  compressed = False
  # per-mirror URL components; pdbj/pdb-redo build the full URL directly
  if (mirror == "rcsb"):
    url_base = 'https://files.rcsb.org/download/'
    pdb_ext = ".pdb"
    sf_prefix = ""
    sf_ext = "-sf.cif"
  elif (mirror == "pdbe"):
    url_base = "https://www.ebi.ac.uk/pdbe-srv/view/files/"
    pdb_ext = ".ent"
    sf_prefix = "r"
    sf_ext = "sf.ent"
  elif (mirror == "pdbj"):
    url_base = "ftp://ftp.pdbj.org/pub/pdb/data/structures/divided/"
    if (data_type == "pdb"):
      compressed = True
      if (format == "pdb"):
        url = url_base + "pdb/%s/pdb%s.ent.gz" % (id[1:3], id)
      elif (format in ["cif", "cif_or_pdb"]):
        url = url_base + "mmCIF/%s/%s.cif.gz" % (id[1:3], id)
    elif (data_type == "xray"):
      compressed = True
      url = url_base + "structure_factors/%s/r%ssf.ent.gz" % (id[1:3], id)
    elif (data_type in ["fasta", "seq"]):
      url = "https://pdbj.org/rest/downloadPDBfile?format=fasta&id=%s" % id
    if (url is None) and (data_type != "fasta"):
      raise Sorry(
        "Can't determine PDBj download URL for this data/format " +
        "combination.")
  elif mirror == "pdb-redo":
    url_base = "https://pdb-redo.eu/db/"
    pdb_ext = "_final.pdb"
    cif_ext = "_final.cif"
    sf_prefix = ""
    sf_ext = "_final.mtz"
    if (data_type == 'pdb'):
      if (format == 'pdb'):
        url = url_base + "{id}/{id}{format}".format(id=id, format=pdb_ext)
      elif (format in ['cif', 'cif_or_pdb']):
        url = url_base + "{id}/{id}{format}".format(id=id, format=cif_ext)
    elif (data_type == 'xray'):
      url = url_base + "{id}/{id}{format}".format(id=id, format=sf_ext)
  if (data_type in ["fasta", "seq"]):
    if (url is None): # TODO PDBe equivalent doesn't exist?
      # Seems that this url should be working:
      url = "https://www.rcsb.org/fasta/entry/%s" % id
    try:
      data = libtbx.utils.urlopen(url)
    except HTTPError as e:
      if e.getcode() == 404:
        raise RuntimeError("Couldn't download sequence for %s." % id)
      else:
        raise
  elif data_type == "xray":
    if (url is None):
      url = url_base + sf_prefix + id + sf_ext
    try:
      data = libtbx.utils.urlopen(url)
    except HTTPError as e:
      if e.getcode() == 404:
        raise RuntimeError(
          "Couldn't download structure factors for %s." % id)
      else:
        raise
  else:
    # model download (pdb/cif/xml)
    if (url is None):
      if format == "pdb":
        url = url_base + id + pdb_ext
      elif format == "cif_or_pdb":
        url = url_base + id + "." + "cif"
      else:
        url = url_base + id + "." + format
    try:
      data = libtbx.utils.urlopen(url)
    except HTTPError as e:
      if e.getcode() == 404:
        raise RuntimeError("Couldn't download model for %s." % id)
      else:
        raise
  if (compressed):
    try:
      import gzip
    except ImportError:
      raise Sorry(
        "gzip module not available - please use an uncompressed " +
        "source of PDB data.")
    else:
      # XXX due to a bug in urllib2, we can't pass the supposedly file-like
      # object directly, so we read the data into a StringIO object instead
      return gzip.GzipFile(fileobj=StringIO(data.read()))
  return data