def fields_from_notes(self):
    """Parse ``self.notes`` as ArchieML and return its top-level entries.

    Keys are slugified and have hyphens converted to underscores so they
    are usable as identifier-style field names; values pass through
    unchanged.
    """
    return {
        slugify(key).replace('-', '_'): value
        for key, value in archieml.loads(self.notes).items()
    }
def process_archieml(self):
    """Download this document's plain-text export and parse it as ArchieML.

    URL schemes are temporarily rewritten (``http:`` -> ``httpCOLON``) so
    ArchieML does not mistake a URL's colon for a key/value separator; the
    colons are restored in the parsed values afterwards.

    Side effect: sets ``self.details`` to the parsed mapping.
    """
    logging.info("Processing text content")
    text_url = f"https://docs.google.com/document/d/{self.code}/export?format=txt"
    raw = requests.get(text_url).text
    # shield URL schemes from ArchieML's key:value detection
    raw = raw.replace("http:", "httpCOLON").replace("https:", "httpsCOLON")
    self.details = archieml.loads(raw)
    # restore the colons in place
    # NOTE(review): assumes every top-level value is a string -- confirm
    # the parsed documents never contain nested objects/arrays here.
    for key in self.details:
        self.details[key] = self.details[key].replace("COLON", ":")
def parse_archieml(text):
    '''Abstract all archieml preprocessing and parsing to this function.

    Strips carriage returns and Google Docs comment annotations before
    parsing, then lower-cases keys and drops empty values.
    '''
    cleaned = text.replace('\r', '')
    # Drop whole lines that start with a Google Docs comment marker ([a], [b], ...)
    cleaned = re.sub(r'^\[[a-z]\].+$', '', cleaned, flags=re.M)
    # ...then strip any inline markers that remain
    cleaned = re.sub(r'\[[a-z]\]', '', cleaned)
    parsed = archieml.loads(cleaned)
    # Undo some of the auto-capitalization google docs inflicts,
    # and skip falsy (empty) values entirely
    result = {}
    for key, value in parsed.items():
        if value:
            result[key.lower()] = value
    return result
def from_string(data):
    """Build an MPFile from an ArchieML-formatted string.

    Post-processing nests shared (general) sections under the first
    level-01 title, nests bare root-level tables under 'table', parses
    embedded CSV sections with pandas, and registers a default plot for
    each parsed table.

    NOTE(review): ``mpfile.document.keys()[0]`` implies Python 2 (or a
    list-returning ``keys()``) -- confirm before porting to Python 3.
    """
    # use archieml-python parse to import data
    mpfile = MPFile.from_dict(RecursiveDict(archieml.loads(data)))
    # post-process internal representation of file contents
    # NOTE(review): entries are popped/inserted while looping over keys();
    # presumably RecursiveDict tolerates this -- verify.
    for key in mpfile.document.keys():
        is_general, root_key = normalize_root_level(key)
        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in mpfile.document:
                mpfile.document.insert_before(
                    mpfile.document.keys()[0],
                    (mp_level01_titles[0], RecursiveDict())
                )
            mpfile.document.rec_update(nest_dict(
                mpfile.document.pop(key),
                [mp_level01_titles[0], root_key]
            ))
        else:
            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            # root-level identifier) by nesting under 'data'
            value = mpfile.document.pop(key)
            keys = [root_key]
            if isinstance(value, list):
                keys.append('table')
            mpfile.document.rec_update(nest_dict(value, keys))
            # Note: CSV section is marked with 'data ' prefix during iterate()
            for k, v in mpfile.document[root_key].iterate():
                if isinstance(k, six.string_types) and \
                        k.startswith(mp_level01_titles[1]):
                    # k = table name (incl. data prefix)
                    # v = csv string from ArchieML free-form arrays
                    table_name = k[len(mp_level01_titles[1] + '_'):]
                    pd_obj = read_csv(v)
                    # replace the raw CSV (stored under the bare table name)
                    # with the parsed dict keyed by the prefixed name
                    mpfile.document[root_key].pop(table_name)
                    mpfile.document[root_key].rec_update(nest_dict(
                        pandas_to_dict(pd_obj), [k]
                    ))
                    # make default plot (add entry in 'plots') for each
                    # table, first column as x-column
                    plots_dict = nest_dict(
                        {'x': pd_obj.columns[0], 'table': table_name},
                        [mp_level01_titles[2], 'default_{}'.format(k)]
                    )
                    if mp_level01_titles[2] in mpfile.document[root_key]:
                        mpfile.document[root_key].rec_update(plots_dict)
                    else:
                        kv = (
                            mp_level01_titles[2],
                            plots_dict[mp_level01_titles[2]]
                        )
                        mpfile.document[root_key].insert_before(k, kv)
    return mpfile
def from_string(data):
    """Construct an MPFile from an ArchieML string and normalize it.

    General sections are nested under the first level-01 title at the top
    of the document; bare root-level lists are nested under 'table'; CSV
    payloads (keys carrying the 'data' prefix) are parsed via pandas and
    get a default plot entry using their first column as x-axis.

    NOTE(review): ``mpfile.document.keys()[0]`` implies Python 2 (or a
    list-returning ``keys()``) -- confirm before porting.
    """
    # use archieml-python parse to import data
    mpfile = MPFile.from_dict(RecursiveDict(archieml.loads(data)))
    # post-process internal representation of file contents
    # NOTE(review): the document is mutated (pop/insert) while iterating
    # keys(); presumably RecursiveDict supports this -- verify.
    for key in mpfile.document.keys():
        is_general, root_key = normalize_root_level(key)
        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in mpfile.document:
                mpfile.document.insert_before(
                    mpfile.document.keys()[0],
                    (mp_level01_titles[0], RecursiveDict()))
            mpfile.document.rec_update(
                nest_dict(mpfile.document.pop(key),
                          [mp_level01_titles[0], root_key]))
        else:
            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            # root-level identifier) by nesting under 'data'
            value = mpfile.document.pop(key)
            keys = [root_key]
            if isinstance(value, list):
                keys.append('table')
            mpfile.document.rec_update(nest_dict(value, keys))
            # Note: CSV section is marked with 'data ' prefix during iterate()
            for k, v in mpfile.document[root_key].iterate():
                if isinstance(k, six.string_types) and \
                        k.startswith(mp_level01_titles[1]):
                    # k = table name (incl. data prefix)
                    # v = csv string from ArchieML free-form arrays
                    table_name = k[len(mp_level01_titles[1] + '_'):]
                    pd_obj = read_csv(v)
                    # swap the raw CSV (bare table name) for the parsed
                    # dict stored under the prefixed key
                    mpfile.document[root_key].pop(table_name)
                    mpfile.document[root_key].rec_update(
                        nest_dict(pandas_to_dict(pd_obj), [k]))
                    # make default plot (add entry in 'plots') for each
                    # table, first column as x-column
                    plots_dict = nest_dict(
                        {
                            'x': pd_obj.columns[0],
                            'table': table_name
                        },
                        [mp_level01_titles[2], 'default_{}'.format(k)])
                    if mp_level01_titles[2] in mpfile.document[root_key]:
                        mpfile.document[root_key].rec_update(plots_dict)
                    else:
                        kv = (mp_level01_titles[2],
                              plots_dict[mp_level01_titles[2]])
                        mpfile.document[root_key].insert_before(k, kv)
    return mpfile
def from_string(data):
    """Parse an ArchieML string into a normalized RecursiveDict and wrap
    it in an MPFile.

    Compared to the sibling variants, this one also converts embedded CIF
    strings (under the fourth level-01 title) into pymatgen structure
    dicts, and delegates default-plot creation to
    ``insert_default_plot_options``.

    NOTE(review): ``rdct.keys()[0]`` implies Python 2 (or a
    list-returning ``keys()``) -- confirm before porting.
    """
    # use archieml-python parse to import data
    rdct = RecursiveDict(archieml.loads(data))
    rdct.rec_update()
    # post-process internal representation of file contents
    # NOTE(review): entries are popped/inserted while looping over keys();
    # presumably RecursiveDict tolerates this -- verify.
    for key in rdct.keys():
        is_general, root_key = normalize_root_level(key)
        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct.insert_before(rdct.keys()[0],
                                   (mp_level01_titles[0], RecursiveDict()))
            rdct.rec_update(
                nest_dict(rdct.pop(key), [mp_level01_titles[0], root_key]))
        else:
            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            # root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [root_key]
            if isinstance(value, list):
                keys.append('table')
            rdct.rec_update(nest_dict(value, keys))
            # Note: CSV section is marked with 'data ' prefix during iterate()
            for k, v in rdct[root_key].iterate():
                if isinstance(k, six.string_types) and \
                        k.startswith(mp_level01_titles[1]):
                    # k = table name (incl. data prefix)
                    # v = csv string from ArchieML free-form arrays
                    table_name = k[len(mp_level01_titles[1] + '_'):]
                    pd_obj = read_csv(v)
                    # replace raw CSV under the bare name with the parsed
                    # DataFrame dict under the prefixed key
                    rdct[root_key].pop(table_name)
                    rdct[root_key].rec_update(
                        nest_dict(pd_obj.to_dict(), [k]))
                    rdct[root_key].insert_default_plot_options(pd_obj, k)
            # convert CIF strings into pymatgen structures
            if mp_level01_titles[3] in rdct[root_key]:
                # local import keeps pymatgen optional for non-CIF files
                from pymatgen.io.cif import CifParser
                for name in rdct[root_key][mp_level01_titles[3]].keys():
                    cif = rdct[root_key][mp_level01_titles[3]].pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    rdct[root_key][mp_level01_titles[3]].rec_update(
                        nest_dict(structure.as_dict(), [name]))
    return MPFile.from_dict(rdct)
def _main(self): '''Downloads Google Doc contents and convert to JSON using ArchieML.''' print '\nDownloading docs...' total_json = {} stories_list = [] service = self._build_service() for doc_id in self.doc_ids: contents = self._get_file_contents(service, doc_id) contents_json = archieml.loads(contents) contents_json = self._parse_plain_text_for_html(contents_json) stories_list.append(contents_json) total_json['stories'] = stories_list self._write_json(total_json)
def _main(self): """Downloads Google Doc contents and convert to JSON using ArchieML.""" print "\nDownloading docs..." total_json = {} stories_list = [] service = self._build_service() for doc_id in self.doc_ids: contents = self._get_file_contents(service, doc_id) contents_json = archieml.loads(contents) contents_json = self._parse_plain_text_for_html(contents_json) stories_list.append(contents_json) total_json["stories"] = stories_list self._write_json(total_json)
def make_context(asset_depth=0):
    """
    Create a base-context for rendering views.
    Includes app_config and JS/CSS includers.

    `asset_depth` indicates how far into the url hierarchy the assets are
    hosted. If 0, then they are at the root. If 1 then at /foo/, etc.

    Loading the COPY document is best-effort: a missing, unreadable, or
    unparseable copy file must not prevent rendering.
    """
    context = flatten_app_config()
    try:
        with open(app_config.COPY_PATH) as f:
            # REMOVE the BOM. Google Docs downloaded as "text/plain" apparently save as UTF-8 with a BOM.
            # archieml-python will fail to parse first line of the document correctly if there's a BOM.
            data = f.read().decode('utf-8-sig').encode('utf-8')
        context['COPY'] = archieml.loads(data)
    except Exception:
        # best-effort, but narrowed from a bare `except:` so that
        # SystemExit/KeyboardInterrupt are no longer swallowed
        pass
    context['JS'] = JavascriptIncluder(asset_depth=asset_depth)
    context['CSS'] = CSSIncluder(asset_depth=asset_depth)
    return context
def from_string(data):
    """Parse an ArchieML string into a normalized RecursiveDict and wrap
    it in an MPFile (Python 3 variant).

    In addition to the nesting/CSV handling of the older variants, this
    one sanitizes key characters via the ``replacements`` map, tracks the
    current nesting scope while iterating, optionally parses propnet
    ``Quantity`` values, and converts embedded CIF strings to pymatgen
    structure dicts.
    """
    # use archieml-python parse to import data
    rdct = RecursiveDict(loads(data))
    rdct.rec_update()
    # post-process internal representation of file contents
    # list() snapshot: rdct is mutated (pop/insert) inside the loop
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)
        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct[mp_level01_titles[0]] = RecursiveDict()
                rdct.move_to_end(mp_level01_titles[0], last=False)
        # normalize identifier key (pop & insert)
        # using rec_update since we're looping over all entries
        # also: support data in bare tables (marked-up only by
        # root-level identifier) by nesting under 'data'
        value = rdct.pop(key)
        keys = [mp_level01_titles[0]] if is_general else []
        keys.append(root_key)
        if isinstance(value, list):
            keys.append("table")
        rdct.rec_update(nest_dict(value, keys))
        # reference to section to iterate or parse as CIF
        section = (
            rdct[mp_level01_titles[0]][root_key]
            if is_general else rdct[root_key]
        )
        # iterate to find CSV sections to parse
        # also parse propnet quantities
        if isinstance(section, dict):
            # scope tracks the key path to the current nesting level;
            # iterate() yields (level, key) tuples as its first element
            scope = []
            for k, v in section.iterate():
                level, key = k
                # sanitize key characters via the module's replacement map
                key = "".join([replacements.get(c, c) for c in key])
                level_reduction = bool(level < len(scope))
                if level_reduction:
                    # stepped back out of a nested subsection
                    del scope[level:]
                if v is None:
                    # subsection header: descend into it
                    scope.append(key)
                elif isinstance(v, list) and isinstance(v[0], dict):
                    # list of {'value': csv-row} dicts -> rebuild CSV text
                    table = ""
                    for row_dct in v:
                        table = "\n".join([table, row_dct["value"]])
                    pd_obj = read_csv(table)
                    d = nest_dict(pd_obj.to_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)
                    if not is_general and level == 0:
                        section.insert_default_plot_options(pd_obj, key)
                elif (Quantity is not None
                      and isinstance(v, six.string_types)
                      and " " in v):
                    # "value unit" string -> propnet Quantity (optional dep)
                    quantity = Quantity.from_key_value(key, v)
                    # TODO quantity.symbol.name
                    d = nest_dict(quantity.as_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)
        # convert CIF strings into pymatgen structures
        if mp_level01_titles[3] in section:
            # local import keeps pymatgen optional for non-CIF files
            from pymatgen.io.cif import CifParser
            for name in section[mp_level01_titles[3]].keys():
                cif = section[mp_level01_titles[3]].pop(name)
                parser = CifParser.from_string(cif)
                structure = parser.get_structures(primitive=False)[0]
                section[mp_level01_titles[3]].rec_update(
                    nest_dict(structure.as_dict(), [name]))
    return MPFile.from_dict(rdct)
self._copy[section] = Section(section, doc[section]) #c = Copy('data/copy.txt') #print dir(c) # for section in c: # print '---------------------' # print section # Open the file with open('data/copy.aml') as f: # REMOVE the BOM. Google Docs downloaded as "text/plain" apparently save as UTF-8 with a BOM. # Failing to remove the BOM will cause the first line of the document to be parsed incorrectly data = f.read().decode("utf-8-sig").encode("utf-8") a = archieml.loads(data) print a for section in a: print '---------------------' print section if type(a[section]) in [OrderedDict,list,dict]: for item in a[section]: print item