def __init__(self, file_path=None, compress=False, mode='w', preserve_order=False):
    """Create a CIF file writer.

    :param file_path: Output path (str), or None to defer opening a handle
        (a path can then be supplied when the CIF object is written).
    :param compress: If True, gzip the output; ".gz" is appended to
        file_path when not already present.
    :param mode: File mode; 'w' is promoted to 'wb' when compressing,
        since gzip streams are binary.
    :param preserve_order: Keep mmCIF token order when writing.
    :raises TypeError: If file_path is neither a non-empty string nor None.
    """
    self.compress = compress
    self.mode = 'wb' if (compress and mode == 'w') else mode
    self.preserve_token_order = preserve_order

    # Fix: the original did `from exceptions import TypeError`, which only
    # exists on Python 2 and raises ImportError on Python 3. TypeError is a
    # builtin, so raise it directly.
    if not (file_path is None or (isinstance(file_path, str) and file_path)):
        raise TypeError("file_path argument is not a string")

    # Ensure a ".gz" suffix when compression was requested.
    if file_path is not None and compress and not file_path.endswith(".gz"):
        file_path += ".gz"

    # NB: as in the original, the handle is opened with the caller-supplied
    # mode (not self.mode); openGzip selects gzip vs. plain by extension.
    self._handle = openGzip(file_path, mode) if file_path is not None else None
    self.verbose = False  # TODO: Not implemented
def _exportCifFile(self, file_path, token_ordering):
    """Build a CifFile object by driving the Keller StarTokeniser over the
    file at *file_path*.

    :param file_path: Path of the (optionally gzipped) mmCIF file, or None.
    :param token_ordering: Forwarded as preserve_token_order to CifFile.
    :return: A populated CifFile, or None when file_path is None.
    :raises BadStarTokenError: On the first BAD_TOKEN from the tokeniser.
    """
    cf = None
    if file_path is not None:
        cif_file = openGzip(file_path, 'r')
        tokeniser = StarTokeniser()
        tokeniser.start_matching(cif_file)
        if cif_file:
            cf = CifFile(file_path, preserve_token_order=token_ordering)
            # Current containers while walking the token stream.
            db = None  # current data block
            sf = None  # current save frame
            cc = None  # current category
            ci = None  # current item
            loopItems = []   # item names of the loop being accumulated
            loopValues = []  # (value, type_string) pairs for that loop
            loop_state = False        # inside a loop_ construct
            save_state = False        # inside a save frame
            loop_value_state = False  # loop values have started arriving
            # Keller tokenizer provides the following tokens:
            # "", "MULTILINE", "COMMENT", "GLOBAL", "SAVE_FRAME", "SAVE_FRAME_REF",
            # "LOOP_STOP", "DATA_BLOCK", "LOOP", "BAD_CONSTRUCT", "DATA_NAME", "SQUOTE_STRING",
            # "DQUOTE_STRING", "NULL", "UNKNOWN", "SQUARE_BRACKET", "STRING", "BAD_TOKEN"
            DATA_TOKENS = ["MULTILINE", "SQUOTE_STRING", "DQUOTE_STRING",
                           "NULL", "UNKNOWN", "STRING"]
            # NB: Square bracket types are not currently handled
            for tok in tokeniser:
                if tok.type_string == 'BAD_TOKEN':
                    raise BadStarTokenError(tok)
                if tok.type_string == 'DATA_BLOCK':
                    # 'data_XXX' -> block id 'XXX'
                    db = cf.setDataBlock(tok.value[tok.value.find('_')+1:])
                    loop_state = False
                    save_state = False
                elif tok.type_string == 'LOOP':
                    loop_value_state = False
                    if not loop_state:
                        loop_state = True
                    # Flush values buffered for a previous loop, if any.
                    if loopValues != []:
                        self._processLoop(cc, loopItems, loopValues)
                        loopItems = []
                        loopValues = []
                elif tok.type_string == 'SAVE_FRAME':
                    if save_state:
                        # A second 'save_' token closes the current frame.
                        save_state = False
                    else:
                        sf = db.setSaveFrame(tok.value[tok.value.find('_')+1:])
                        save_state = True
                    if loop_state:
                        loop_state = False
                        if loopValues != []:
                            self._processLoop(cc, loopItems, loopValues)
                            loopItems = []
                            loopValues = []
                elif tok.type_string == 'DATA_NAME':
                    # Token is of the form '<category>.<item>'.
                    [category_name, item_name] = tok.value.split('.')
                    if loop_value_state:
                        # A data name arriving after loop values ends the loop.
                        loop_state = False
                        loop_value_state = False
                        if loopValues != []:
                            self._processLoop(cc, loopItems, loopValues)
                            loopItems = []
                            loopValues = []
                    if not save_state:
                        cc = db.setCategory(category_name)
                    else:
                        cc = sf.setCategory(category_name)
                    if loop_state:
                        loopItems.append(item_name)
                    ci = cc.setItem(item_name)
                elif tok.type_string in DATA_TOKENS:
                    # It's a data containing token
                    token_value = tok.value
                    if loop_state:
                        loopValues.append((token_value, tok.type_string))
                        if not loop_value_state:
                            loop_value_state = True
                    else:
                        ci.setValue(token_value, tok.type_string)
            # Flush a loop still open at end of token stream.
            if loopValues != []:
                self._processLoop(cc, loopItems, loopValues)
                loopItems = []
                loopValues = []
    return cf
def _writeCifObj(self, cifObjIn, compress=False, mode='w'):
    """Serialise *cifObjIn* (a CifFile-like object) to the writer's handle.

    Opens the handle lazily from cifObjIn.file_path when none was given at
    construction time. Writes every data block, its categories and its
    save frames, then flushes.

    :param cifObjIn: CIF object exposing getDataBlocks()/file_path.
    :param compress: gzip the output when the handle is opened here.
    :param mode: file mode used when the handle is opened here.
    """
    if self._handle is None:
        try:
            if compress:
                self._handle = gzip.open(cifObjIn.file_path + ".gz", mode)
            else:
                self._handle = openGzip(cifObjIn.file_path, mode)
        except Exception as err:
            # Best-effort: report and give up rather than propagate,
            # matching the established error-handling style of this writer.
            print("CifFileWriter error: %s" % str(err))
            print("Could not write mmCIF file (No output path/filename specified)")
            return
    for datablock in cifObjIn.getDataBlocks():
        self._handle.write(self.DATABLOCK % datablock.getId())
        self._writeCategories(datablock)
        # HANDLE SAVEFRAMES
        for saveframe in datablock.getSaveFrames():
            self._handle.write(self.SAVEFRAMESTART % saveframe.getId())
            self._writeCategories(saveframe)
            self._handle.write(self.SAVEFRAMEEND)
    self._handle.flush()

def _writeCategories(self, container):
    """Write all categories of *container* (a data block or save frame).

    Non-table categories are written as aligned tag/value pairs; table
    categories as a loop_ header followed by a pretty-printed value table.
    (Extracted helper: this code was previously duplicated verbatim for
    data blocks and save frames.)
    """
    for category in container.getCategories():
        if not category.isTable:
            for item in category.getItems():
                tag = (self.CAT_ITM % (category.getId(), item.name))
                # Pad so all values start in the same column.
                tag = tag.ljust(category._maxTagLength + 8)
                self._handle.write(tag + item.getFormattedValue() + "\n")
        else:
            self._handle.write(self.LOOP)
            table = []
            for item in category.getItems():
                tag = (self.CAT_ITM % (category.getId(), item.name))
                tag = tag.ljust(category._maxTagLength + 8)
                self._handle.write(tag + "\n")
                table.append(item.getFormattedValue())
            # Columns were collected row-wise per item; transpose on output.
            self._handle.write(pretty_print(table, transpose=True))
            self._handle.write('\n' + self.NEWLINE)
def _parseFile(self, file_path, ignoreCategories, preserve_token_order, onlyCategories):
    """Private method that will do the work of parsing the mmCIF data file.

    Returns a (possibly ordered) dict of the form
    {data_heading: {category: {item: value-or-list}}}.

    :param file_path: Path of the (optionally gzipped) mmCIF file.
    :param ignoreCategories: Categories to skip (checked per data name).
    :param preserve_token_order: Use OrderedDict to keep file order.
    :param onlyCategories: If non-empty, parse only these categories.
    """
    if preserve_token_order:
        try:
            from collections import OrderedDict as _dict
        except ImportError:
            # fallback: try to use the ordereddict backport when using python 2.6
            try:
                from ordereddict import OrderedDict as _dict
            except ImportError:
                # backport not installed: use local OrderedDict
                from mmCif.ordereddict import OrderedDict as _dict
    else:
        _dict = dict

    mmcif_like_file = _dict()
    data_block = _dict()
    save_block = _dict()  # NOTE(review): appears unused below — confirm

    data_heading = ""
    line_num = 0  # tracked only for error reporting
    try:
        with openGzip(file_path, 'r') as f1:
            table_names = []         # item names of the current loop_
            table_values = []        # values tokenised from the current line
            table_values_array = []  # accumulated values for the whole loop_
            isLoop = False
            multiLineValue = False   # currently inside a ;-delimited value
            skipCategory = False     # fast-forward over an ignored category
            for line in f1:
                line_num += 1
                if skipCategory:
                    # Consume lines until the next tag/loop_/save_/data_
                    # after the ignored category's value block.
                    flag = False
                    while line:
                        check = (line.strip().startswith('_')
                                 or self.loopRE.match(line.strip()[:5])
                                 or self.saveRE.match(line.strip()[:5])
                                 or self.dataRE.match(line.strip()[:5]))
                        if flag:
                            if check:
                                isLoop = False
                                break
                        else:
                            if not check:
                                flag = True
                        if not (self.saveRE.match(line.strip()[:5])
                                or self.dataRE.match(line.strip()[:5])):
                            try:
                                line = next(f1)
                                line_num += 1
                            except StopIteration:
                                break
                        else:
                            break
                    skipCategory = False

                # A new tag or loop_ while loop values are pending: the loop
                # has ended, so distribute the flat value buffer column-wise.
                if isLoop is True and table_values_array != [] and (self.loopRE.match(line) is not None or (line.strip().startswith('_'))):
                    isLoop = False
                    num_item = len(table_names)
                    if len(table_values_array) % num_item != 0:
                        raise MMCIFWrapperSyntaxError(category)
                    for val_index, item in enumerate(table_names):
                        data_block[category][item] = table_values_array[val_index::num_item]
                    table_values_array = []

                if line.strip() == "":
                    continue
                if line.startswith('#'):
                    continue
                # Strip trailing comments (but not inside ;-delimited text).
                if '\t#' in line or ' #' in line and not line.startswith(';'):
                    new_line = ''
                    for tok in self.dataValueRE.findall(line):
                        if not tok.startswith('#'):
                            new_line += tok+" "
                        else:
                            break
                    # make sure to preserve the fact that ';' was not the first character
                    line = new_line if not new_line.startswith(';') else " "+new_line
                    # Fails for entries "3snv", "1kmm", "1ser", "2prg", "3oqd"
                    # line = re.sub(r'\s#.*$', '', line)
                if line.startswith(';'):
                    # Accumulate the whole ;-delimited multi-line value.
                    while '\n;' not in line:
                        try:
                            line += next(f1)
                            line_num += 1
                        except StopIteration:
                            break
                    multiLineValue = True
                if self.dataRE.match(line):
                    # New data_ heading: close off the previous block first.
                    if data_block != {}:
                        if table_values_array != []:
                            isLoop = False
                            num_item = len(table_names)
                            if len(table_values_array) % num_item != 0:
                                # NOTE(review): sibling code above raises
                                # MMCIFWrapperSyntaxError here — confirm which
                                # name is correct; left unchanged.
                                raise mmCifSyntaxError(category)
                            for val_index, item in enumerate(table_names):
                                data_block[category][item] = table_values_array[val_index::num_item]
                            table_names = []
                            table_values_array = []
                        mmcif_like_file[data_heading] = data_block
                        data_block = _dict()
                    data_heading = self.dataRE.match(line).group('data_heading')
                elif self.saveRE.match(line):
                    # Save frames are skipped entirely by this parser.
                    while line.strip() != 'save_':
                        try:
                            line = next(f1)
                            line_num += 1
                        except StopIteration:
                            break
                    continue
                elif self.loopRE.match(line):
                    # Save and clear the table_values_array buffer from the
                    # previous loop that was read
                    if table_values_array != []:
                        for itemIndex, name in enumerate(table_names):
                            data_block[category].update({name: [row[itemIndex] for row in table_values_array]})
                        table_values_array = []
                    isLoop = True
                    category, item, value = None, None, None
                    # Stores items of a category listed in loop blocks
                    table_names = []
                    # Stores values of items in a loop as a single row
                    table_values = []
                elif self.dataNameRE.match(line):
                    # Match category and item simultaneously
                    m = self.dataNameRE.match(line)
                    category = m.group('data_category')
                    item = m.group('category_item')
                    remainder = m.group('remainder')
                    value = None
                    if isLoop and remainder != '':
                        # Append any data values following the last loop
                        # category.item tag should any exist.
                        table_values += self._tokenizeData(remainder)
                        line = ''
                    else:
                        line = remainder + "\n"
                    if not isLoop:
                        if line.strip() != '':
                            value = self._tokenizeData(line)
                        else:
                            # For cases where values are on the following
                            # line
                            try:
                                line = next(f1)
                                line_num += 1
                            except StopIteration:
                                break
                        while value is None:
                            char_start = 1 if line.startswith(';') else 0
                            # Pull in the rest of a ;-delimited value.
                            while line.startswith(';') and not line.rstrip().endswith('\n;'):
                                try:
                                    line += next(f1)
                                    line_num += 1
                                except StopIteration:
                                    break
                            # NOTE(review): this assignment is immediately
                            # overwritten by both branches below — redundant
                            # but harmless; left unchanged.
                            value = (line[char_start:line.rfind('\n;')]).strip()
                            if char_start > 0:
                                value = (line[char_start:line.rfind('\n;')]).strip()
                            else:
                                value = self._tokenizeData(" "+line)
                        if (ignoreCategories and category in ignoreCategories) or (onlyCategories and category not in onlyCategories):
                            pass
                        else:
                            # Single value stays scalar; multiple values
                            # become a list.
                            if category in data_block:
                                data_block[category].update({item: value if len(value) > 1 else value[0]})
                            else:
                                data_block.setdefault(category, _dict({item: value if len(value) > 1 else value[0]}))  # OrderedDict here preserves item order
                    else:
                        if (ignoreCategories and category in ignoreCategories) or (onlyCategories and category not in onlyCategories):
                            skipCategory = True
                        else:
                            data_block.setdefault(category, _dict())  # OrderedDict here preserves item order
                            table_names.append(item)
                else:
                    # A plain value line belonging to the current loop_.
                    if multiLineValue is True:
                        table_values.append((line[1:line.rfind('\n;')]).strip())
                        multiLineValue = False
                        line = line[line.rfind('\n;') + 2:]
                        if line.strip() != '':
                            table_values += self._tokenizeData(line)
                    else:
                        table_values += self._tokenizeData(line)
                    if table_values != []:
                        table_values_array += table_values
                        table_values = []
            # Flush a loop left open at end of file.
            if isLoop is True and table_values_array != []:
                isLoop = False
                num_item = len(table_names)
                for val_index, item in enumerate(table_names):
                    data_block[category][item] = table_values_array[val_index::num_item]
                table_values_array = []
        if data_block != {}:
            mmcif_like_file[data_heading] = data_block
        return mmcif_like_file
    except KeyError as key_err:
        print("KeyError [line %i]: %s" % (line_num, str(key_err)))
    except IOError as io_err:
        print("IOException [line %i]: %s" % (line_num, str(io_err)))