def _get_value(xml, rtype=unicode):
    """Extracts a cleaned text value from an xml node.

    :param xml: A string, an xml node, or a list of which only the first
        item is used.
    :param rtype: When ``unicode`` (the default) the value is passed through
        ``convert.str_to_unicode``; otherwise strings are passed through
        ``convert.unicode_to_str`` and nodes are returned as serialised by
        ``et.tostring``.

    :returns: The value with surrounding whitespace and trailing ``|``
        characters removed, or None when there is nothing to read.

    """
    # A list stands in for its first item; an empty list means no value.
    node = xml
    if isinstance(node, types.ListType):
        node = node[0] if len(node) else None
    if node is None:
        return None

    # Nodes are serialised, strings are used as they are.
    is_text = isinstance(node, types.StringTypes)
    if rtype is unicode:
        raw = node if is_text else et.tostring(node)
        text = convert.str_to_unicode(raw)
    elif is_text:
        text = convert.unicode_to_str(node)
    else:
        text = et.tostring(node)

    # Trim whitespace first, then any trailing pipe separators.
    return text.strip().rstrip('|')
def read(fpath, encoding=None, decode=True):
    """Reads a document from file system.

    :param str fpath: Path to previously saved file; a leading ``~`` is
        expanded to the user's home directory.
    :param str encoding: Encoding to use during deserialization. When None
        it is taken from the file extension (without the leading dot).
    :param bool decode: Flag indicating whether document will be decoded.

    :returns: The result of ``pyesdoc.decode`` when decode is True,
        otherwise the raw file content converted by ``str_to_unicode``.
    :rtype: object

    :raises IOError: If the expanded path does not point to an existing file.

    """
    # Expand '~' so that home-relative paths pass the existence check.
    fpath = os.path.expanduser(fpath)

    # Validate file path.
    if not os.path.isfile(fpath):
        raise IOError("Document file path does not exist")

    # Optionally derive encoding from file extension.
    if encoding is None:
        encoding = os.path.splitext(fpath)[1][1:]

    # Set raw content.
    with open(fpath, "r") as fstream:
        fcontent = str_to_unicode(fstream.read())

    # Decode upon request.
    return pyesdoc.decode(fcontent, encoding) if decode else fcontent
def read(fpath, encoding=None, decode=True):
    """Loads a previously saved document from the file system.

    :param str fpath: Location of the file; ``~`` is expanded.
    :param str encoding: Encoding handed to the decoder. Defaults to the
        file extension without its leading dot.
    :param bool decode: When False the raw content is returned undecoded.

    :returns: The result of ``pyesdoc.decode`` when decoding is requested,
        otherwise the file content converted by ``str_to_unicode``.
    :rtype: object

    :raises IOError: If the expanded path is not an existing file.

    """
    path = os.path.expanduser(fpath)
    if not os.path.isfile(path):
        raise IOError("Document file path does not exist")

    # No explicit encoding: fall back to the extension, e.g. ".json" -> "json".
    if encoding is None:
        _, extension = os.path.splitext(path)
        encoding = extension[1:]

    with open(path, 'r') as handle:
        text = str_to_unicode(handle.read())

    if not decode:
        return text
    return pyesdoc.decode(text, encoding)
def execute(ctx): """Creates document index. :param object ctx: Document processing context information. """ # Instantiate. instance = models.Document() instance.description = str_to_unicode(ctx.doc.ext.description) instance.institute = ctx.doc.meta.institute instance.name = unicode(ctx.doc.ext.display_name) instance.project = ctx.doc.meta.project.strip().lower() if ctx.doc.meta.sub_projects: instance.sub_projects = ",".join([u"<{}>".format(i.lower()) for i in sorted(ctx.doc.meta.sub_projects)]) instance.typeof = unicode(ctx.doc.meta.type) instance.uid = unicode(ctx.doc.meta.id) instance.version = ctx.doc.meta.version # Set alternative name. if hasattr(ctx.doc, "alternative_name"): if ctx.doc.alternative_name: instance.alternative_name = ctx.doc.alternative_name elif hasattr(ctx.doc, "alternative_names"): if ctx.doc.alternative_names: instance.alternative_name = ctx.doc.alternative_names[0] # Set short/long names. fields = [f for f in ctx.doc.ext.summary_fields if f is not None] try: instance.canonical_name = fields[0] except IndexError: pass try: instance.long_name = fields[1] except IndexError: pass # Set other fields. try: parser = _PARSERS[type(ctx.doc)] except KeyError: pass else: parser(instance, ctx.doc) # Persist. try: session.insert(instance) except sqlalchemy.exc.IntegrityError: session.rollback() print instance.uid, instance.version, instance.typeof raise StopIteration("Document already ingested") else: ctx.primary = instance
def _format(s):
    """Formats a value as stripped unicode text.

    ``output_formatter`` is a name resolved from an outer scope that is not
    visible in this block; when truthy it is applied to the converted value
    (including None).

    :param s: Value to format; None is passed through unconverted.

    :returns: The formatted value, or None.

    """
    if s is None:
        pass
    # TODO add support for time formatting.
    # The check must inspect the value being formatted (previously it
    # referenced an unrelated name, ``v``).
    elif isinstance(s, datetime.datetime):
        # Keep only the leading "YYYY-MM-DD" part of the string form.
        s = str(s)[:10]
    else:
        s = str(s)

    if s:
        s = convert.str_to_unicode(s)
    if output_formatter:
        s = output_formatter(s)
    if s:
        s = s.strip()

    return s
def decode(as_json):
    """Builds a document from its UTF-8 encoded json representation.

    :param as_json: Document json representation.
    :type as_json: unicode | str

    :returns: A pyesdoc document instance.
    :rtype: object

    """
    # Normalise to unicode, parse into a dictionary, then decode that.
    text = convert.str_to_unicode(as_json)
    return dict_decoder.decode(convert.json_to_dict(text))
def _encode_simple(xml, val):
    """Writes a simple value into the text of an xml node.

    :param xml: Node whose ``text`` attribute receives the encoded value.
    :param val: Value to encode. Datetimes are written in ISO format with a
        space instead of ``T``; dates and times in plain ISO format; anything
        else is converted with ``convert.str_to_unicode``.

    :returns: An empty unicode string when val is None or ``'None'`` (the
        node is left untouched in that case), otherwise None.

    """
    # Null values: nothing is written to the node.
    if val in (None, 'None'):
        return u''

    # datetime must be tested before date, as it is a subclass of date.
    if isinstance(val, datetime.datetime):
        text = val.isoformat().replace('T', ' ')
    elif isinstance(val, (datetime.date, datetime.time)):
        text = val.isoformat()
    else:
        text = convert.str_to_unicode(val)
        if text is None or not len(text):
            text = u''

    xml.text = text.strip()