Example #1
    def process_document(self, session, doc):
        cmd = "file -i -b %INDOC%"
        (qq, infn) = tempfile.mkstemp()
        os.close(qq)
        fh = open(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()
        cmd = cmd.replace("%INDOC%", infn)
        res = getShellResult(cmd)
        mt = res.strip()
        if mt.find(';') > -1:
            bits = mt.split(';')
            mt = bits[0]
            for b in bits[1:]:
                # just stuff them on doc for now
                (type, value) = b.split('=')
                setattr(doc, type, value)

        if mt == "text/plain":
            # Might be sgml, xml, text etc
            res = getShellResult("file -b {0}".format(infn))
            mt2 = res.strip()
            if mt2 == "exported SGML document text":
                mt = "text/sgml"
            elif mt2 == "XML document text":
                mt = "text/xml"
            # Others include java, etc. but not very useful to us
        doc.mimeType = mt
        doc.processHistory.append(self.id)
        return doc
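Every example on this page calls getShellResult, which is not itself shown. For reference, here is a minimal sketch of such a helper, assuming it simply runs the command through a shell and returns the combined stdout and stderr as text; the real Cheshire3 helper may differ in details such as whitespace stripping or error handling.

import subprocess


def getShellResult(cmd):
    # Hypothetical stand-in for the helper used throughout these examples:
    # run cmd via the shell and capture stdout and stderr together.
    pipe = subprocess.Popen(cmd, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    out, _ = pipe.communicate()
    if isinstance(out, bytes):
        # Decode so callers can do string comparisons such as
        # mt == "text/plain" in the example above.
        out = out.decode('utf-8', 'replace')
    return out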
Example #2
    def process_document(self, session, doc):

        cmd = "file -i -b %INDOC%"
        (qq, infn) = tempfile.mkstemp()
        os.close(qq)
        fh = open(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()
        cmd = cmd.replace("%INDOC%", infn)
        res = getShellResult(cmd)
        mt = res.strip()

        if mt.find(';') > -1:
            bits = mt.split(';')
            mt = bits[0]
            for b in bits[1:]:
                # just stuff them on doc for now
                (type, value) = b.split('=')
                setattr(doc, type, value)

        if mt == "text/plain":
            # we might be sgml, xml, text etc
            res = getShellResult("file -b {0}".format(infn))
            mt2 = res.strip()
            if mt2 == "exported SGML document text":
                mt = "text/sgml"
            elif mt2 == "XML document text":
                mt = "text/xml"
            # others include java, etc. but not very useful to us

        doc.mimeType = mt
        return doc
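For context, file -i -b prints a MIME type followed by optional parameters, e.g. "text/plain; charset=us-ascii". The split on ';' in the two examples above keeps the bare type and stores each parameter on the document. A small stand-alone sketch of that parsing (the sample string is typical file output, not taken from this codebase):

mt = "text/plain; charset=us-ascii"   # typical `file -i -b` output
bits = mt.split(';')
mt = bits[0]                          # -> "text/plain"
params = {}
for b in bits[1:]:
    (key, value) = b.split('=')       # -> (" charset", "us-ascii")
    # the examples call setattr(doc, key, value) without stripping
    # the leading space from key
    params[key.strip()] = value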
Example #3
 def find_documents(self, session, cache=0):
     fl = getShellResult("locate %s | grep %s$" %
                         (self.stream, self.stream))
     docs = fl.split('\n')
     while docs and docs[0][:8] == "warning:":
         docs.pop(0)
     self._processFiles("", docs, cache)
Example #4
 def __init__(self, session, node, parent):
     tp = self.get_path(session, 'executablePath', '')
     exe = self.get_path(session, 'executable', 'enju')
     if not tp:
         tp = getShellResult('which %s' % exe)
         tp = tp if not tp.startswith('which:') else exe
     else:
         tp = os.path.join(tp, exe)
     xml = self.get_setting(session, 'xml', 1)
     if xml:
         cmd = "%s -xml" % tp
     else:
         cmd = tp
     self.pipe = Popen(cmd,
                       shell=True,
                       bufsize=1,
                       stdin=PIPE,
                       stdout=PIPE,
                       stderr=PIPE)
     l = ""
     while l != 'Ready\n':
         # Check for errors with command
         if "command not found" in l:
             self.log_error(
                 session, "Error while initializing EnjuObject: "
                 "{0}".format(l.strip()))
             break
         l = self.pipe.stderr.readline()
Example #5
    def __init__(self, session, node, parent):
        self.unparsedOutput = self.get_setting(session, 'parseOutput', 0)
        tp = self.get_path(session, 'executablePath', '')
        exe = self.get_path(session, 'executable', 'geniatagger')
        if not tp:
            tp = getShellResult('which %s' % exe)
            tp = os.path.dirname(tp)
        tpe = os.path.join(tp, exe)
        if not tp:
            raise ConfigFileException("%s requires the path: "
                                      "executablePath" % self.id)
        o = os.getcwd()
        os.chdir(tp)
        if self.get_setting(session, 'tokenize', 0):
            cmd = exe
        else:
            cmd = "%s -nt" % exe
        self.pipe = Popen(cmd,
                          shell=True,
                          bufsize=1,
                          stdin=PIPE,
                          stdout=PIPE,
                          stderr=PIPE)

        l = ""
        while l != 'loading named_entity_models..done.\n':
            l = self.pipe.stderr.readline()
        os.chdir(o)
Example #6
 def __init__(self, session, node, parent):
     tp = self.get_path(session, 'executablePath', '')
     exe = self.get_path(session, 'executable', 'enju')
     if not tp:
         tp = getShellResult('which %s' % exe)
         tp = tp if not tp.startswith('which:') else exe
     else:
         tp = os.path.join(tp, exe)
     xml = self.get_setting(session, 'xml', 1)
     if xml:
         cmd = "%s -xml" % tp
     else:
         cmd = tp
     self.pipe = Popen(cmd, shell=True, bufsize=1,
                       stdin=PIPE, stdout=PIPE, stderr=PIPE)
     l = ""
     while l != 'Ready\n':
         # Check for errors with command
         if "command not found" in l:
             self.log_error(session,
                            "Error while initializing EnjuObject: "
                            "{0}".format(l.strip()))
             break
         l = self.pipe.stderr.readline()
Example #7
    def __init__(self, session, node, parent):
        self.unparsedOutput = self.get_setting(session, 'parseOutput', 0)
        tp = self.get_path(session, 'executablePath', '')
        exe = self.get_path(session, 'executable', 'geniatagger')
        if not tp:
            tp = getShellResult('which %s' % exe)
            tp = os.path.dirname(tp)
        tpe = os.path.join(tp, exe)
        if not tp:
            raise ConfigFileException("%s requires the path: "
                                      "executablePath" % self.id)
        o = os.getcwd()
        os.chdir(tp)
        if self.get_setting(session, 'tokenize', 0):
            cmd = exe
        else:
            cmd = "%s -nt" % exe
        self.pipe = Popen(cmd, shell=True, bufsize=1,
                          stdin=PIPE, stdout=PIPE, stderr=PIPE)

        l = ""
        while l != 'loading named_entity_models..done.\n':
            l = self.pipe.stderr.readline()
        os.chdir(o)
Example #8
    def process_document(self, session, doc):
        """Pass Document to executable, add results to document metadata."""
        cmd = self.cmd
        # Read/write via stdin/stdout when the %INDOC%/%OUTDOC%
        # placeholders are absent from the command template
        stdIn = cmd.find('%INDOC%') == -1
        stdOut = cmd.find('%OUTDOC%') == -1
        if not stdIn:
            if doc.mimeType or doc.filename:
                # guess our extension
                try:
                    suff = mimetypes.guess_extension(doc.mimeType)
                except:
                    suff = ''
                if not suff:
                    suff = mimetypes.guess_extension(doc.filename)
                if suff:
                    (qq, infn) = tempfile.mkstemp(suff)
                else:
                    (qq, infn) = tempfile.mkstemp()
            else:
                (qq, infn) = tempfile.mkstemp()
            os.close(qq)
            fh = open(infn, 'w')
            fh.write(doc.get_raw(session))
            fh.close()
            cmd = cmd.replace("%INDOC%", infn)

        if not stdOut:
            if self.outMimeType:
                # guess our extension
                suff = mimetypes.guess_extension(self.outMimeType)
                (qq, outfn) = tempfile.mkstemp(suff)
            else:
                (qq, outfn) = tempfile.mkstemp()
            cmd = cmd.replace("%OUTDOC%", outfn)
            os.close(qq)

        if self.working:
            old = os.getcwd()
            os.chdir(self.working)
        else:
            old = ''

        if stdIn:
            pipe = subprocess.Popen(cmd,
                                    bufsize=0,
                                    shell=True,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            pipe.stdin.write(doc.get_raw(session))
            pipe.stdin.close()
            result = pipe.stdout.read()
            pipe.stdout.close()
            pipe.stderr.close()
            del pipe
        else:
            # result will read stdout+err regardless
            result = getShellResult(cmd)
            os.remove(infn)
            if not stdOut:
                if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
                    ofh = open(outfn)
                else:
                    # command probably added something to the end
                    # annoying
                    matches = glob.glob(outfn + "*")
                    for m in matches:
                        if os.path.getsize(m) > 0:
                            ofh = open(m)
                            break
                result = ofh.read()
                ofh.close()
                os.remove(outfn)
            # strip input filename from result if present
            # (this is a tempfile so the name is useless)
            if result.startswith(infn):
                result = re.sub(r'^%s\s*[:-]?\s*' % (infn), '', result)
        if old:
            os.chdir(old)
        try:
            doc.metadata[self.metadataType].update(
                self._processResult(session, result))
        except:
            doc.metadata[self.metadataType] = self._processResult(
                session, result)

        if 'analysisDateTime' not in doc.metadata[self.metadataType]:
            doc.metadata[self.metadataType][
                'analysisDateTime'] = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
        return doc
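The two flags computed at the top of process_document decide the whole I/O strategy: a command template without placeholders gets the document on stdin and is read back from stdout, while %INDOC% and %OUTDOC% switch input and/or output to temporary files. A small illustrative helper mirroring that logic (the wc and pdftotext command lines below are only examples, not part of Cheshire3):

def io_strategy(cmd):
    # Mirror the stdIn/stdOut flags from process_document above.
    stdIn = cmd.find('%INDOC%') == -1
    stdOut = cmd.find('%OUTDOC%') == -1
    return ('stdin' if stdIn else 'temp input file',
            'stdout' if stdOut else 'temp output file')

# io_strategy("wc -w")                      -> ('stdin', 'stdout')
# io_strategy("file -i -b %INDOC%")         -> ('temp input file', 'stdout')
# io_strategy("pdftotext %INDOC% %OUTDOC%") -> ('temp input file', 'temp output file')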
Example #9
    def __init__(self, session, config, parent=None):
        """Constructor inherited by all configured Cheshire3 objects.

        The constructor for all Cheshire3 objects take the same arguments:
        session:  A Session object
        topNode:  The <config> or <subConfig> domNode for the configuration
        parent:   The object that provides the scope for this object.
        """

        self.docstring = ""
        self.parent = parent
        self.subConfigs = CaselessDictionary()
        self.paths = {}
        self.objects = CaselessDictionary()
        self.settings = {}
        self.defaults = {}
        self.permissionHandlers = {}
        self.unresolvedObjects = {}
        self.functionLogger = None
        self._objectRefs = []
        self._includeConfigStores = []
        self.logger = None
        self.checkSums = {}
        self.pathCheckSums = {}

        self.version = ""
        self.complexity = ""
        self.stability = ""

        self.initTime = time.time()

        pathObjects = {}

        # LXML
        if hasattr(config, 'attrib'):
            self.id = config.attrib.get('id', '')
            self.version = config.attrib.get('version', '')
            self.complexity = config.attrib.get('complexity', '')
            self.stability = config.attrib.get('stability', '')

            walker = config.iterchildren(tag=etree.Element)
            for e in walker:
                if e.tag in ['name', '{%s}name' % CONFIG_NS]:
                    self.name = e.text
                elif e.tag in ['objectType', '{%s}objectType' % CONFIG_NS]:
                    self.objectType = e.text
                elif e.tag in ['checkSums', '{%s}checkSums' % CONFIG_NS]:
                    for e2 in e.iterchildren(tag=etree.Element):
                        # Store checksum on self, and hash code against it
                        pt = e2.attrib.get('pathType', '__code__')
                        ct = e2.attrib.get('type', 'md5')
                        if pt != '__code__':
                            try:
                                self.pathCheckSums[pt].append((ct, e2.text))
                            except KeyError:
                                self.pathCheckSums[pt] = [(ct, e2.text)]
                        else:
                            self.checkSums[ct] = e2.text

                elif e.tag in ['paths', '{%s}paths' % CONFIG_NS]:
                    for e2 in e.iterchildren(tag=etree.Element):
                        try:
                            typ = e2.attrib['type']
                        except KeyError:
                            raise ConfigFileException("path must have type")
                        if e2.tag in ['path', '{%s}path' % CONFIG_NS]:
                            # Allow template strings in paths
                            # e.g. ${cheshire3Home}/foo/bar
                            pathTmpl = Template(e2.text)
                            sub = pathTmpl.safe_substitute
                            self.paths[typ] = sub(cheshire3Paths)
                        elif e2.tag in ['object', '{%s}object' % CONFIG_NS]:
                            try:
                                ref = e2.attrib['ref']
                            except KeyError:
                                msg = "object must have ref"
                                raise ConfigFileException(msg)
                            pathObjects[typ] = ref
                elif e.tag in ['subConfigs', '{%s}subConfigs' % CONFIG_NS]:
                    # Recurse
                    self._recurseLxmlSubConfigs(session, e)
                elif e.tag in ['options', '{%s}options' % CONFIG_NS]:
                    for e2 in e.iterchildren(tag=etree.Element):
                        try:
                            typ = e2.attrib['type']
                        except KeyError:
                            msg = "option (setting/default) must have type"
                            raise ConfigFileException(msg)
                        if e2.tag in ['setting', '{%s}setting' % CONFIG_NS]:
                            value = self._verifySetting(typ, e2.text)
                            self.settings[typ] = value
                        elif e2.tag in ['default', '{%s}default' % CONFIG_NS]:
                            value = self._verifyDefault(typ, e2.text)
                            self.defaults[typ] = value
                elif e.tag in ['actions', '{%s}actions' % CONFIG_NS]:
                    pass
                elif e.tag in ['docs', '{%s}docs' % CONFIG_NS]:
                    self.docstring = e.text
                else:
                    self._handleLxmlConfigNode(session, e)

            del walker

        else:
            if (config.hasAttributeNS(None, 'id')):
                self.id = config.getAttributeNS(None, 'id')

            for child in config.childNodes:
                if child.nodeType == elementType:
                    if child.localName == "name":
                        self.name = getFirstData(child)
                    elif (child.localName == "objectType"):
                        self.objectType = getFirstData(child)
                    elif (child.localName == "paths"):
                        # Configure self with paths
                        for child2 in child.childNodes:
                            if child2.nodeType == elementType:
                                type = child2.getAttributeNS(None, 'type')
                                if child2.localName == "path":
                                    value = getFirstData(child2)
                                    # Allow template strings in paths
                                    # e.g. ${cheshire3Home}/foo/bar
                                    pathTmpl = Template(value)
                                    sub = pathTmpl.safe_substitute
                                    self.paths[type] = sub(cheshire3Paths)
                                elif child2.localName == "object":
                                    value = child2.getAttributeNS(None, 'ref')
                                    pathObjects[type] = value
                    elif (child.localName == "subConfigs"):
                        # Pointers to dom nodes for config ids
                        self._recurseSubConfigs(session, child)

                    elif (child.localName == "objects"):
                        for obj in child.childNodes:
                            if (
                                obj.nodeType == elementType and
                                obj.localName == "path"
                            ):
                                type = obj.getAttributeNS(None, 'type')
                                id = obj.getAttributeNS(None, 'ref')
                                self._objectRefs.append((id, type))
                    elif (child.localName == "options"):
                        # See configInfo in ZeeRex
                        for child2 in child.childNodes:
                            if (child2.nodeType == elementType):
                                type = child2.getAttributeNS(None, 'type')
                                if (child2.localName == "setting"):
                                    dc = getFirstData(child2)
                                    if (dc):
                                        value = self._verifySetting(type, dc)
                                        self.settings[type] = value
                                elif (child2.localName == "default"):
                                    dc = getFirstData(child2)
                                    if (dc):
                                        value = self._verifyDefault(type, dc)
                                        self.defaults[type] = value
                    elif (child.localName == "actions"):
                        # Permission rqmts
                        for child2 in child.childNodes:
                            if child2.nodeType == elementType:
                                p = PermissionHandler(child2, self)
                                self.permissionHandlers[p.actionIdentifier] = p
                    elif (child.localName == "docs"):
                        # Add per configuration documentation to docs stack.
                        self.docstring = getFirstData(child)
                    else:
                        self._handleConfigNode(session, child)

        if ('pythonPath' in self.paths):
            sys.path.append(self.paths['pythonPath'][1])

        # Allow any object to be set to debug
        # Functionality of this dependent on object
        self.debug = self.get_setting(session, "debug", 0)

        for p in self.permissionHandlers.keys():
            if p[0:5] == 'c3fn:':
                self.add_auth(p[5:])

        # Dynamically Instantiate objects. This is mindbending :}
        # Mindbending2: JIT building!
        if self.parent:
            self.parent.objects[self.id] = self
        for o in (self._objectRefs):
            # Instantiate
            obj = self.get_object(session, o[0])

        # Add default Object types to paths
        for t in pathObjects.keys():
            self.unresolvedObjects[t] = pathObjects[t]

        # Built, maybe set function logging
        log = self.get_setting(session,
                               'log',
                               session.server.defaultFunctionLog)
        if log:
            fl = self.get_path(session, 'functionLogger')
            if fl != self:
                self.functionLogger = fl
                logList = log.strip().split()
                for l in logList:
                    self.add_logging(session, l)
                try:
                    del self.settings['log']
                except KeyError:
                    # from default
                    pass

        # now checksum self
        if self.checkSums:
            code = inspect.getsource(self.__class__)
            for (ct, val) in self.checkSums.items():
                m = hashlib.new(ct)
                m.update(code)
                digest = m.hexdigest()
                if digest != val:
                    raise IntegrityException(self.id + ": " + digest)

        if self.pathCheckSums:
            # step through each referenced file and check
            for (pt, chk) in self.pathCheckSums.items():
                for (ct, val) in chk:
                    m = hashlib.new(ct)
                    # read in file
                    fn = self.get_path(session, pt)
                    if not os.path.isabs(fn):
                        if pt == 'executable':
                            # search
                            dp = self.get_path(session, 'executablePath', '')
                            if not dp:
                                dp = getShellResult('which {0}'.format(fn))

                        else:
                            dp = self.get_path(session, 'defaultPath')
                        fn = os.path.join(dp, fn)
                    fh = open(fn)
                    data = fh.read()
                    fh.close()

                    m.update(data)
                    digest = m.hexdigest()
                    if digest != val:
                        msg = "%s/%s (%s): %s" % (self.id, pt, fn, digest)
                        raise IntegrityException(msg)
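The <path> handling above expands ${...} templates against a cheshire3Paths mapping, as the inline comment notes. In isolation the mechanism is just string.Template.safe_substitute; the values below are made up for illustration:

from string import Template

cheshire3Paths = {'cheshire3Home': '/home/cheshire/cheshire3'}   # illustrative
pathTmpl = Template('${cheshire3Home}/dbs/mydb/stores')
print(pathTmpl.safe_substitute(cheshire3Paths))
# -> /home/cheshire/cheshire3/dbs/mydb/stores
# safe_substitute leaves unknown ${...} placeholders untouched instead of
# raising KeyError, so a path with no template simply passes through.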
Example #10
 def process_document(self, session, doc):
     """Pass Document to executable, add results to document metadata."""
     cmd = self.cmd
     stdIn = cmd.find('%INDOC%') == -1
     stdOut = cmd.find('%OUTDOC%') == -1
     if not stdIn:
         if doc.mimeType or doc.filename:
             # guess our extension
             try:
                 suff = mimetypes.guess_extension(doc.mimeType)
             except:
                 suff = ''
             if not suff:
                 suff = mimetypes.guess_extension(doc.filename)
             if suff:
                 (qq, infn) = tempfile.mkstemp(suff)
             else:
                 (qq, infn) = tempfile.mkstemp()                    
         else:
             (qq, infn) = tempfile.mkstemp()
         os.close(qq)
         fh = open(infn, 'w')
         fh.write(doc.get_raw(session))
         fh.close()
         cmd = cmd.replace("%INDOC%", infn)
         
     if not stdOut:
         if self.outMimeType:
             # guess our extension
             suff = mimetypes.guess_extension(self.outMimeType)
             (qq, outfn) = tempfile.mkstemp(suff)
         else:
             (qq, outfn) = tempfile.mkstemp()
         cmd = cmd.replace("%OUTDOC%", outfn)               
         os.close(qq)
     
     if self.working:
         old = os.getcwd()
         os.chdir(self.working)            
     else:
         old = ''
         
     if stdIn:
         pipe = subprocess.Popen(cmd, bufsize=0, shell=True,
                      stdin=subprocess.PIPE, 
                      stdout=subprocess.PIPE, 
                      stderr=subprocess.PIPE)
         pipe.stdin.write(doc.get_raw(session))
         pipe.stdin.close()
         result = pipe.stdout.read()
         pipe.stdout.close()
         pipe.stderr.close()
         del pipe
     else:
         # result will read stdout+err regardless
         result = getShellResult(cmd)
         os.remove(infn)
         if not stdOut:
             if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
                 ofh = open(outfn)
             else:
                 # command probably added something to the end
                 # annoying
                 matches = glob.glob(outfn + "*")
                 for m in matches:
                     if os.path.getsize(m) > 0:
                         ofh = open(m)
                         break
             result = ofh.read()
             ofh.close()
             os.remove(outfn)
          # strip input filename from result if present
          # (this is a tempfile so the name is useless)
          if result.startswith(infn):
              result = re.sub(r'^%s\s*[:-]?\s*' % (infn), '', result)
     if old:
         os.chdir(old)
     try:
         doc.metadata[self.metadataType].update(self._processResult(session, result))
     except:
         doc.metadata[self.metadataType] = self._processResult(session, result)
         
     if 'analysisDateTime' not in doc.metadata[self.metadataType]:
         doc.metadata[self.metadataType]['analysisDateTime'] = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
     return doc
Example #11
    def process_document(self, session, doc):
        cmd = self.cmd
        # Read/write via stdin/stdout when the %INDOC%/%OUTDOC%
        # placeholders are absent from the command template
        stdIn = cmd.find('%INDOC%') == -1
        stdOut = cmd.find('%OUTDOC%') == -1
        if not stdIn:
            # Create temp file for incoming data
            if doc.mimeType or doc.filename:
                # Guess our extension
                try:
                    suff = mimetypes.guess_extension(doc.mimeType)
                except:
                    suff = ''
                if not suff:
                    suff = mimetypes.guess_extension(doc.filename)
                    if not suff:
                        (foofn, suff) = os.path.splitext(doc.filename)
                if suff:
                    (qq, infn) = tempfile.mkstemp(suff)
                else:
                    (qq, infn) = tempfile.mkstemp()
            else:
                (qq, infn) = tempfile.mkstemp()

            os.close(qq)
            fh = open(infn, 'w')
            fh.write(doc.get_raw(session))
            fh.close()
            cmd = cmd.replace("%INDOC%", infn)
        if not stdOut:
            # Create temp file to outgoing data
            if self.outMimeType:
                # Guess our extension
                suff = mimetypes.guess_extension(self.outMimeType)
                (qq, outfn) = tempfile.mkstemp(suff)
            else:
                (qq, outfn) = tempfile.mkstemp()
            cmd = cmd.replace("%OUTDOC%", outfn)
            os.close(qq)

        if self.working:
            old = os.getcwd()
            os.chdir(self.working)
        else:
            old = ''

        if stdIn:
            pipe = subprocess.Popen(cmd,
                                    bufsize=0,
                                    shell=True,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            pipe.stdin.write(doc.get_raw(session))
            pipe.stdin.close()
            result = pipe.stdout.read()
            pipe.stdout.close()
            pipe.stderr.close()
            del pipe
        else:
            # Result will read stdout+err regardless
            result = getShellResult(cmd)
            os.remove(infn)
            if not stdOut:
                if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
                    fh = open(outfn)
                else:
                    # Command probably appended something to the filename
                    # Annoying! Have to glob for it
                    matches = glob.glob(outfn + "*")
                    # Or maybe ignored absolute path and put it in pwd...
                    matches2 = glob.glob(os.path.split(outfn)[-1] + '*')
                    for m in matches + matches2:
                        if os.path.getsize(m) > 0:
                            fh = open(m)
                            break
                try:
                    try:
                        result = fh.read()
                    except:
                        msg = '{0}: {1}'.format(cmd, result)
                        raise ExternalSystemException(msg)
                    else:
                        fh.close()
                finally:
                    os.remove(outfn)
                    try:
                        # Clean up when data written elsewhere
                        os.remove(fh.name)
                    except OSError:
                        pass
        if old:
            os.chdir(old)
        mt = self.outMimeType
        if not mt:
            mt = doc.mimeType
        return StringDocument(result,
                              self.id,
                              doc.processHistory,
                              mimeType=mt,
                              parent=doc.parent,
                              filename=doc.filename)
Example #12
 def find_documents(self, session, cache=0):
     fl = getShellResult("locate %s | grep %s$" % (self.stream, self.stream))
     docs = fl.split('\n')
     while docs and docs[0][:8] == "warning:":
         docs.pop(0)
     self._processFiles("", docs, cache)
Example #13
    def find_documents(self, session, cache=0):
        if cache == 1:
            # Can't store offsets as there's no file to offset to.
            raise NotImplementedError

        data = self.streamLocation
        sortx = self.factory.get_path(session, 'sortPath', None)
        if sortx == None:
            sortx = getShellResult('which sort')
        sorted = data + "_SORT"
        os.spawnl(os.P_WAIT, sortx, sortx, data, '-o', sorted)

        # Now construct cluster documents.
        doc = ["<cluster>"]
        f = open(sorted)
        l = f.readline()
        # term docid recstore occs (line, posn)*
        currKey = ""
        while (l):
            docdata = {}
            ldata = l.split('\x00')
            key = ldata[0]
            if (not key):
                # Data from records with no key
                l = f.readline()
                l = l[:-1]
                continue

            doc.append("<key>%s</key>\n" % (key))
            ldata = ldata[1:-1]
            for bit in range(len(ldata) / 2):
                d = docdata.get(ldata[bit * 2], [])
                d.append(ldata[bit * 2 + 1])
                docdata[ldata[bit * 2]] = d
            l = f.readline()
            l = l[:-1]
            ldata2 = l.split('\x00')
            key2 = ldata2[0]
            while key == key2:
                ldata2 = ldata2[1:-1]
                for bit in range(len(ldata2) / 2):
                    d = docdata.get(ldata2[bit * 2], [])
                    d.append(ldata2[bit * 2 + 1])
                    docdata[ldata2[bit * 2]] = d
                l = f.readline()
                l = l[:-1]
                ldata2 = l.split('\x00')
                key2 = ldata2[0]
            for k in docdata.keys():
                doc.append("<%s>" % (k))
                for i in docdata[k]:
                    doc.append("%s" % i)
                doc.append("</%s>" % (k))
            doc.append("</cluster>")
            sdoc = StringDocument(" ".join(doc))
            if cache == 0:
                yield sdoc
            else:
                self.documents.append(sdoc)

            doc = ["<cluster>"]
            l = f.readline()
            l = l[:-1]
        f.close()
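The loop above folds consecutive sorted lines that share a key into one cluster document. A rough sketch of the data flow, with a hypothetical key and field name (the real null-delimited lines are produced elsewhere in Cheshire3):

# Two adjacent sorted lines sharing the key "shakespeare, william", with
# null-separated field/value pairs:
#
#   shakespeare, william\x00title\x00Hamlet\x00
#   shakespeare, william\x00title\x00Othello\x00
#
# find_documents() merges them into a single cluster document, roughly:
#
#   <cluster> <key>shakespeare, william</key>
#    <title> Hamlet Othello </title> </cluster>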
Example #14
def create_defaultConfig(identifier, args):
    """Create and return a generic database configuration.

    identifier := string
    args := argparse.Namespace
    """
    defaultPath = args.directory
    config = CONF.config(
        {'id': identifier,
         'type': 'database'},
        CONF.objectType("cheshire3.database.SimpleDatabase"),
        # <paths>
        CONF.paths(
            CONF.path({'type': "defaultPath"}, os.path.abspath(defaultPath)),
            # subsequent paths may be relative to defaultPath
            CONF.path({'type': "metadataPath"},
                      os.path.join('.cheshire3', 'stores', 'metadata.bdb')
                      ),
            CONF.object({'type': "recordStore",
                         'ref': "recordStore"}
                        ),
            CONF.object({'type': "protocolMap",
                         'ref': "cqlProtocolMap"}
                        ),
            CONF.path({'type': "indexStoreList"}, "indexStore"),
        ),
        CONF.subConfigs(
            # recordStore
            CONF.subConfig(
                {'type': "recordStore",
                 'id': "recordStore"},
                CONF.objectType("cheshire3.recordStore.BdbRecordStore"),
                CONF.paths(
                    CONF.path({'type': "defaultPath"},
                              os.path.join('.cheshire3', 'stores')
                              ),
                    CONF.path({'type': "databasePath"},
                              'recordStore.bdb'
                              ),
                    CONF.object({'type': "idNormalizer",
                                 'ref': "StringIntNormalizer"}
                                ),
                    CONF.object({'type': "inWorkflow",
                                 'ref': "XmlToLZ4Workflow"}
                                ),
                    CONF.object({'type': "outWorkflow",
                                 'ref': "LZ4ToLxmlWorkflow"}
                                ),
                ),
                CONF.options(
                    CONF.setting({'type': "digest"}, 'md5'),
                ),
            ),
            # indexStore
            CONF.subConfig(
                {'type': "indexStore",
                 'id': "indexStore"},
                CONF.objectType("cheshire3.indexStore.BdbIndexStore"),
                CONF.paths(
                    CONF.path({'type': "defaultPath"},
                              os.path.join('.cheshire3', 'indexes')
                              ),
                    CONF.path({'type': "tempPath"},
                              'temp'
                              ),
                    CONF.path({'type': "recordStoreHash"},
                              'recordStore'
                              ),
                )
            ),
            # protocolMap
            CONF.subConfig(
                {'type': "protocolMap",
                 'id': "cqlProtocolMap"},
                CONF.objectType("cheshire3.protocolMap.CQLProtocolMap"),
                CONF.paths(
                    CONF.path({'type': "zeerexPath"}, args.zeerexPath)
                ),
            ),
            # MagicRedirectPreParser
            # Over-ride default behavior to preParse generic file types to METS
            # so that it can be parsed and indexed as XML
            CONF.subConfig(
                {'type': "preParser",
                 'id': "MagicRedirectPreParser"},
                CONF.objectType("cheshire3.preParser.MagicRedirectPreParser"),
                CONF.hash(
                    CONF.object({'mimeType': "application/pdf",
                                 'ref': "PdfToMetsPreParserWorkflow"}
                                ),
                    CONF.object({'mimeType': "text/prs.fallenstein.rst",
                                 'ref': "ReSTToMetsPreParserWorkflow"}
                                ),
                    CONF.object({'mimeType': "text/plain",
                                 'ref': "TxtToMetsPreParserWorkflow"}
                                ),
                    CONF.object({'mimeType': "text/html",
                                 'ref': "HtmlToMetsPreParserWorkflow"}
                                ),
                    CONF.object({'mimeType': "*",
                                 'ref': "METSWrappingPreParser"}
                                ),
                ),
            ),
        ),
    )
    # Check sortPath and fix up if necessary
    serverSortPath = server.get_path(session, 'sortPath')
    if not os.path.exists(serverSortPath):
        # Attempt to fix locally for default IndexStore
        sortPath = getShellResult('which sort')
        if 'which: no sort in' not in sortPath:
            # Found a sort executable - can add to configuration
            storePathsNode = config.xpath(
                '//c3:subConfig[@id="indexStore"]/c3:paths',
                namespaces={'c3': CONFIG_NS}
            )[0]
            storePathsNode.append(
                CONF.path({'type': "sortPath"}, sortPath)
            )
    # Add database docs if provided
    if args.title and args.description:
        config.insert(0, CONF.docs("{0.title} - {0.description}".format(args)))
    elif args.title:
        config.insert(0, CONF.docs(args.title))
    elif args.description:
        config.insert(0, CONF.docs(args.description))
    return config
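A minimal sketch of calling create_defaultConfig, assuming the surrounding module provides CONF, server, session and CONFIG_NS (as the function itself requires) and that the returned configuration element is lxml, so it can be serialised with lxml.etree. The Namespace attributes mirror the ones the function reads; every value below is illustrative:

import argparse

from lxml import etree

args = argparse.Namespace(
    directory='mydb',                         # becomes the defaultPath
    zeerexPath='.cheshire3/zeerex_srw.xml',   # illustrative location
    title='My database',
    description='A demo Cheshire3 database',
)
config = create_defaultConfig('db_mydb', args)   # identifier is illustrative
with open('config.xml', 'wb') as fh:
    fh.write(etree.tostring(config, pretty_print=True,
                            xml_declaration=True, encoding='utf-8'))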
Example #15
    def process_document(self, session, doc):
        cmd = self.cmd
        stdIn = cmd.find('%INDOC%') == -1
        stdOut = cmd.find('%OUTDOC%') == -1
        if not stdIn:
            if doc.mimeType or doc.filename:
                # guess our extension
                try:
                    suff = mimetypes.guess_extension(doc.mimeType)
                except:
                    suff = ''
                if not suff:
                    suff = mimetypes.guess_extension(doc.filename)
                    if not suff:
                        (foofn, suff) = os.path.splitext(doc.filename)
                if suff:
                    (qq, infn) = tempfile.mkstemp(suff)
                else:
                    (qq, infn) = tempfile.mkstemp()                    
            else:
                (qq, infn) = tempfile.mkstemp()                 
            
            os.close(qq)
            fh = open(infn, 'w')
            fh.write(doc.get_raw(session))
            fh.close()
            cmd = cmd.replace("%INDOC%", infn)
        if not stdOut:
            if self.outMimeType:
                # guess our extension
                suff = mimetypes.guess_extension(self.outMimeType)
                (qq, outfn) = tempfile.mkstemp(suff)
            else:
                (qq, outfn) = tempfile.mkstemp()
            cmd = cmd.replace("%OUTDOC%", outfn)               
            os.close(qq)
        
        if self.working:
            old = os.getcwd()
            os.chdir(self.working)            
        else:
            old = ''
            
        if stdIn:
            pipe = subprocess.Popen(cmd, bufsize=0, shell=True,
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            pipe.stdin.write(doc.get_raw(session))
            pipe.stdin.close()
            result = pipe.stdout.read()
            pipe.stdout.close()
            pipe.stderr.close()
            del pipe
        else:
            # result will read stdout+err regardless
            result = getShellResult(cmd)
            os.remove(infn)
            if not stdOut:
                if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
                    fh = open(outfn)
                else:
                    # command probably added something to the end
                    # annoying
                    matches = glob.glob(outfn + "*")
                    # or maybe ignored absolute path and put it in pwd...
                    matches2 = glob.glob(os.path.split(outfn)[-1] + '*')
                    for m in matches + matches2:
                        if os.path.getsize(m) > 0:
                            fh = open(m)
                            break
                try:
                    try:
                        result = fh.read()
                    except:
                        raise ExternalSystemException('Error from command: {0} : {1}'.format(cmd, result))
                    else:
                        fh.close()
                finally:
                    os.remove(outfn)
                    try:
                        # clean up when data was written somewhere
                        # other than outfn
                        os.remove(fh.name)
                    except OSError:
                        pass

        if old:
            os.chdir(old)

        mt = self.outMimeType
        if not mt:
            mt = doc.mimeType
        return StringDocument(result, self.id, doc.processHistory, mimeType=mt, parent=doc.parent, filename=doc.filename) 
Example #16
def create_defaultConfig(identifier, args):
    """Create and return a generic database configuration.

    identifier := string
    args := argparse.Namespace
    """
    defaultPath = args.directory
    config = CONF.config(
        {
            'id': identifier,
            'type': 'database'
        },
        CONF.objectType("cheshire3.database.SimpleDatabase"),
        # <paths>
        CONF.paths(
            CONF.path({'type': "defaultPath"}, os.path.abspath(defaultPath)),
            # subsequent paths may be relative to defaultPath
            CONF.path({'type': "metadataPath"},
                      os.path.join('.cheshire3', 'stores', 'metadata.bdb')),
            CONF.object({
                'type': "recordStore",
                'ref': "recordStore"
            }),
            CONF.object({
                'type': "protocolMap",
                'ref': "cqlProtocolMap"
            }),
            CONF.path({'type': "indexStoreList"}, "indexStore"),
        ),
        CONF.subConfigs(
            # recordStore
            CONF.subConfig(
                {
                    'type': "recordStore",
                    'id': "recordStore"
                },
                CONF.objectType("cheshire3.recordStore.BdbRecordStore"),
                CONF.paths(
                    CONF.path({'type': "defaultPath"},
                              os.path.join('.cheshire3', 'stores')),
                    CONF.path({'type': "databasePath"}, 'recordStore.bdb'),
                    CONF.object({
                        'type': "idNormalizer",
                        'ref': "StringIntNormalizer"
                    }),
                    CONF.object({
                        'type': "inWorkflow",
                        'ref': "XmlToLZ4Workflow"
                    }),
                    CONF.object({
                        'type': "outWorkflow",
                        'ref': "LZ4ToLxmlWorkflow"
                    }),
                ),
                CONF.options(CONF.setting({'type': "digest"}, 'md5'), ),
            ),
            # indexStore
            CONF.subConfig({
                'type': "indexStore",
                'id': "indexStore"
            }, CONF.objectType("cheshire3.indexStore.BdbIndexStore"),
                           CONF.paths(
                               CONF.path({'type': "defaultPath"},
                                         os.path.join('.cheshire3',
                                                      'indexes')),
                               CONF.path({'type': "tempPath"}, 'temp'),
                               CONF.path({'type': "recordStoreHash"},
                                         'recordStore'),
                           )),
            # protocolMap
            CONF.subConfig(
                {
                    'type': "protocolMap",
                    'id': "cqlProtocolMap"
                },
                CONF.objectType("cheshire3.protocolMap.CQLProtocolMap"),
                CONF.paths(CONF.path({'type': "zeerexPath"}, args.zeerexPath)),
            ),
            # MagicRedirectPreParser
            # Over-ride default behavior to preParse generic file types to METS
            # so that it can be parsed and indexed as XML
            CONF.subConfig(
                {
                    'type': "preParser",
                    'id': "MagicRedirectPreParser"
                },
                CONF.objectType("cheshire3.preParser.MagicRedirectPreParser"),
                CONF.hash(
                    CONF.object({
                        'mimeType': "application/pdf",
                        'ref': "PdfToMetsPreParserWorkflow"
                    }),
                    CONF.object({
                        'mimeType': "text/prs.fallenstein.rst",
                        'ref': "ReSTToMetsPreParserWorkflow"
                    }),
                    CONF.object({
                        'mimeType': "text/plain",
                        'ref': "TxtToMetsPreParserWorkflow"
                    }),
                    CONF.object({
                        'mimeType': "text/html",
                        'ref': "HtmlToMetsPreParserWorkflow"
                    }),
                    CONF.object({
                        'mimeType': "*",
                        'ref': "METSWrappingPreParser"
                    }),
                ),
            ),
        ),
    )
    # Check sortPath and fix up if necessary
    serverSortPath = server.get_path(session, 'sortPath')
    if not os.path.exists(serverSortPath):
        # Attempt to fix locally for default IndexStore
        sortPath = getShellResult('which sort')
        if 'which: no sort in' not in sortPath:
            # Found a sort executable - can add to configuration
            storePathsNode = config.xpath(
                '//c3:subConfig[@id="indexStore"]/c3:paths',
                namespaces={'c3': CONFIG_NS})[0]
            storePathsNode.append(CONF.path({'type': "sortPath"}, sortPath))
    # Add database docs if provided
    if args.title and args.description:
        config.insert(0, CONF.docs("{0.title} - {0.description}".format(args)))
    elif args.title:
        config.insert(0, CONF.docs(args.title))
    elif args.description:
        config.insert(0, CONF.docs(args.description))
    return config
Example #17
    def find_documents(self, session, cache=0):
        if cache == 1:
            # Can't store offsets as there's no file to offset to.
            raise NotImplementedError

        data = self.streamLocation
        sortx = self.factory.get_path(session, 'sortPath', None)
        if sortx == None:
            sortx = getShellResult('which sort')
        sorted = data + "_SORT"
        os.spawnl(os.P_WAIT, sortx, sortx, data, '-o', sorted)

        # Now construct cluster documents.
        doc = ["<cluster>"]
        f = open(sorted)
        l = f.readline()
        # term docid recstore occs (line, posn)*
        currKey = ""
        while(l):
            docdata = {}
            ldata = l.split('\x00')
            key = ldata[0]
            if (not key):
                # Data from records with no key
                l = f.readline()
                l = l[:-1]
                continue

            doc.append("<key>%s</key>\n" % (key))
            ldata = ldata[1:-1]
            for bit in range(len(ldata)/2):
                d = docdata.get(ldata[bit*2], [])                
                d.append(ldata[bit*2+1])
                docdata[ldata[bit*2]] = d
            l = f.readline()
            l = l[:-1]
            ldata2 = l.split('\x00')
            key2 = ldata2[0]
            while key == key2:   
                ldata2 = ldata2[1:-1]
                for bit in range(len(ldata2)/2):
                    d = docdata.get(ldata2[bit*2], [])
                    d.append(ldata2[bit*2+1])
                    docdata[ldata2[bit*2]] = d
                l = f.readline()
                l = l[:-1]
                ldata2 = l.split('\x00')
                key2 = ldata2[0]
            for k in docdata.keys():
                doc.append("<%s>" % (k))
                for i in docdata[k]:                    
                    doc.append("%s" % i)
                doc.append("</%s>" % (k))
            doc.append("</cluster>")
            sdoc = StringDocument(" ".join(doc))
            if cache == 0:
                yield sdoc
            else:
                self.documents.append(sdoc)

            doc = ["<cluster>"]            
            l = f.readline()
            l = l[:-1]
        f.close()