Ejemplo n.º 1
0
 def __init__(self, session, config, parent=None):
     """Always fail with a MissingDependencyException.

     The parent initializer is attempted first so the error can be as
     accurate as possible: if the parent itself reports a missing
     dependency, the error names both 'rdflib' and 'foresite'; if the
     parent initializes cleanly, only 'foresite' is named.
     """
     try:
         super(OreRdfGraphParser, self).__init__(session, config, parent)
     except MissingDependencyException:
         # Parent could not initialize either, so report both packages
         raise MissingDependencyException(
             self.__class__.__name__,
             ['rdflib', 'foresite']
         )
     # Parent initialized without complaint; foresite alone is missing
     raise MissingDependencyException(self.objectType, 'foresite')
Ejemplo n.º 2
0
 def __init__(self, session, node, parent):
     """Configure the extractor from its 'entityTypes' and 'pos' settings.

     Each whitespace-separated name in 'entityTypes' is matched
     case-insensitively by prefix and translated to one of 'PERSON',
     'GPE' or 'ORGANIZATION'. With no setting, all three are used.

     Raises MissingDependencyException when nltk is unavailable and
     ConfigFileException when a configured name matches no known prefix.
     """
     SimpleExtractor.__init__(self, session, node, parent)
     if nltk is None:
         raise MissingDependencyException(self.objectType, 'nltk')
     # Accepted name prefixes and the label each maps to, tried in order
     prefix_labels = (
         ('pe', 'PERSON'),
         (('pl', 'g'), 'GPE'),
         (('org', 'co'), 'ORGANIZATION'),
     )
     configured = self.get_setting(session, 'entityTypes')
     if not configured:
         # Nothing configured: default to all
         self.types = ['PERSON', 'GPE', 'ORGANIZATION']
     else:
         self.types = []
         for raw_name in configured.split():
             name = raw_name.lower()
             for prefixes, label in prefix_labels:
                 if name.startswith(prefixes):
                     self.types.append(label)
                     break
             else:
                 # No prefix matched this name
                 raise ConfigFileException(
                     "Unknown entity type setting {0} on {1} {2}"
                     "".format(name, self.__class__.__name__, self.id)
                 )
     # Should we keep the /POS tag or strip it
     self.keepPos = self.get_setting(session, 'pos', 0)
Ejemplo n.º 3
0
    def __init__(self, session, stream):
        """Connect to iRODS and open the collection path named by stream.

        stream is either an 'irods://' / 'rods://' URL (parsed with
        parse_irodsUrl) or a path; for a plain path the connection
        parameters come from irods.getRodsEnv().

        On success the connection is stored as self.cxn and the collection
        handle as self.coll.

        Raises:
            MissingDependencyException: the irods module is unavailable.
            ConfigFileException: irods.clientLogin() returned a non-zero
                status.
            IOError: a path component could not be opened as a collection
                and is not acceptable as a final object name.
        """
        # Check for dependency
        if irods is None:
            raise MissingDependencyException(
                '{0.__module__}.{0.__class__.__name__}'.format(self),
                'irods (PyRods)'
            )
        # Check for URL
        if stream.startswith(('irods://', 'rods://')):
            myEnv = parse_irodsUrl(stream)
            stream = myEnv.relpath
        else:
            # Get parameters from env
            status, myEnv = irods.getRodsEnv()
        try:
            # Environment objects that expose accessor methods
            host = myEnv.getRodsHost()
            port = myEnv.getRodsPort()
            username = myEnv.getRodsUserName()
            zone = myEnv.getRodsZone()
            home = myEnv.getRodsHome()
        except AttributeError:
            # Objects exposing plain attributes instead (e.g. the tuple
            # returned by parse_irodsUrl)
            host = myEnv.rodsHost
            port = myEnv.rodsPort
            username = myEnv.rodsUserName
            zone = myEnv.rodsZone
            home = myEnv.rodsHome
        conn, errMsg = irods.rcConnect(host, port, username, zone)
        status = irods.clientLogin(conn)
        if status:
            raise ConfigFileException("Cannot connect to iRODS: ({0}) {1}"
                                      "".format(status, errMsg)
                                      )

        c = irods.irodsCollection(conn)
        self.cxn = conn
        self.coll = c
        # Keep the path as requested for use in error messages
        instream = stream
        # Check if abs path to home dir
        if stream.startswith(home):
            stream = stream[len(home):]
            # startswith() rather than stream[0]: when stream is exactly
            # the home collection the remainder is empty and indexing it
            # would raise IndexError
            if stream.startswith("/"):
                stream = stream[1:]
        colls = stream.split('/')
        for i, cln in enumerate(colls):
            exit_status = c.openCollection(cln)
            if exit_status < 0:
                # Failing to open is only tolerated for the final component,
                # and only when it is listed among the collection's objects
                if (
                    (i < len(colls) - 1) or
                    (cln not in [obj[0] for obj in c.getObjects()])
                ):
                    raise IOError("When opening {0}: {1} does not exists in "
                                  "collection {2}".format(instream,
                                                          cln,
                                                          c.getCollName()
                                                          )
                                  )
Ejemplo n.º 4
0
def open_irodsUrl(url, mode='r'):
    """Open the file that an iRODS URL points to.

    The URL is parsed with parse_irodsUrl(); a connection is made and
    logged in using the parsed host, port, user name and zone, and the
    parsed absolute path is then opened in the requested mode.

    Returns a file-like object - ``irods.IrodsFile``

    Raises MissingDependencyException when the irods module is unavailable.
    """
    if irods is None:
        raise MissingDependencyException("open_irodsUrl()", 'irods (PyRods)')
    target = parse_irodsUrl(url)
    connection, _connect_message = irods.rcConnect(
        target.rodsHost,
        target.rodsPort,
        target.rodsUserName,
        target.rodsZone
    )
    # NOTE(review): the login status is not inspected here, so a failed
    # login only surfaces (if at all) when the file is opened - confirm
    # whether that is intended
    irods.clientLogin(connection)
    return irods.irodsOpen(connection, target.path, mode)
Ejemplo n.º 5
0
 def __init__(self, session, config, parent):
     """Create a stemmer for the language given by the 'language' setting.

     The language defaults to 'english'. The stemmer is stored as
     self.stemmer.

     Raises:
         MissingDependencyException: the Stemmer module is unavailable.
         ConfigFileException: Stemmer.Stemmer() rejected the language.
     """
     SimpleNormalizer.__init__(self, session, config, parent)
     if Stemmer is None:
         raise MissingDependencyException(self.objectType,
                                          "zopyx.txng3.ext"
                                          )
     lang = self.get_setting(session, 'language', 'english')
     try:
         self.stemmer = Stemmer.Stemmer(lang)
     except Exception:
         # Deliberately not a bare except: KeyboardInterrupt and SystemExit
         # must propagate rather than be reported as a configuration error
         raise ConfigFileException("Unknown stemmer language: "
                                   "%s" % (lang))
Ejemplo n.º 6
0
def main(argv=None):
    """Load data into a Cheshire3 database based on parameters in argv.

    When argv is None the arguments are taken from the command line by the
    module-level argparser. If no database is given, one is discovered from
    the current working directory.

    Each data argument is loaded into the database's
    'defaultDocumentFactory' and then processed by its
    'buildIndexWorkflow'.

    Returns:
        1 if no database could be identified, 2 if the identified database
        does not exist; otherwise falls off the end (returns None).

    Raises:
        MissingDependencyException: re-raised on behalf of this script when
            loading a data argument needs an unavailable dependency.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e) instead of e.message: the attribute is deprecated,
            # absent on Python 3 and empty for multi-argument exceptions
            server.log_critical(session, str(e))
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid)
        )
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        return 2
    else:
        # Allow for multiple data arguments
        docFac = db.get_object(session, 'defaultDocumentFactory')
        for dataArg in args.data:
            try:
                docFac.load(session,
                            dataArg,
                            args.cache,
                            args.format,
                            args.tagname,
                            args.codec
                            )
            except MissingDependencyException as e:
                server.log_critical(session, e.reason)
                # Re-raise naming this script as the thing that needs the
                # dependency
                missingDependencies = e.dependencies
                raise MissingDependencyException('cheshire3-load script',
                                                 missingDependencies
                                                 )
            # Index what was just loaded before moving to the next argument
            wf = db.get_object(session, 'buildIndexWorkflow')
            wf.process(session, docFac)
Ejemplo n.º 7
0
 def __init__(self, session, config, parent):
     """Create a stemmer and compile the punctuation pattern.

     The stemmer language comes from the 'language' setting (default
     'english') and is stored as self.stemmer; the compiled pattern is
     stored as self.punctuationRe.

     Raises:
         MissingDependencyException: the Stemmer module is unavailable.
         ConfigFileException: Stemmer.Stemmer() rejected the language.
     """
     SimpleNormalizer.__init__(self, session, config, parent)
     if Stemmer is None:
         raise MissingDependencyException(self.objectType,
                                          "zopyx.txng3.ext"
                                          )
     lang = self.get_setting(session, 'language', 'english')
     # Raw strings yield exactly the same pattern text as before, without
     # relying on unrecognised escapes (\s, \#, \[ ...) passing through a
     # normal string literal unchanged
     self.punctuationRe = re.compile(
         r"((?<!s)'|[-.,]((?=\s)|$)|(^|(?<=\s))[-.,']|"
         r'[~`!@+=\#\&\^*()\[\]{}\\|":;<>?/])'
     )
     try:
         self.stemmer = Stemmer.Stemmer(lang)
     except Exception:
         # Deliberately not a bare except: KeyboardInterrupt and SystemExit
         # must propagate rather than be reported as a configuration error
         raise ConfigFileException("Unknown stemmer language: %s" %
                                   (lang))
Ejemplo n.º 8
0
def parse_irodsUrl(url):
    """Parse an iRODS URL, return a named tuple.

    The path is split on '/'; its first three components are taken to be
    the user's home collection and the remainder the path relative to it.

    Return value will have attributes:

    rodsHost
        Name of the iRODS host

    rodsPort
        Number of the port on which iRODS is served

    rodsZone
        Name of iRODS Zone

    path
        Absolute path of the file/collection

    rodsUserName
        iRODS username

    rodsHome
        iRODS home collection of given rodsUserName

    relpath
        Path relative to rodsHome

    Raises MissingDependencyException when the irods module is unavailable.
    """
    if irods is None:
        raise MissingDependencyException("parse_irodsUrl()", 'irods (PyRods)')
    # 'verbose' is not passed: False was already its default and the
    # keyword no longer exists in Python 3.7+
    IrodsUrl = namedtuple("IrodsUrl", [
        "rodsHost", "rodsPort", "rodsZone", "path", "rodsUserName", "rodsHome",
        "relpath"
    ])
    parsed = urlsplit(url)
    pathParts = parsed.path.split('/')
    # An absolute path splits to ['', zone, ...]: the leading empty string
    # is an artefact of the initial '/', so the zone is the next element
    if len(pathParts) > 1 and not pathParts[0]:
        zone = pathParts[1]
    else:
        zone = pathParts[0]
    return IrodsUrl(
        parsed.hostname,
        parsed.port,
        zone,  # Zone
        parsed.path,  # Absolute path
        parsed.username,
        '/'.join(pathParts[:4]),  # Home
        '/'.join(pathParts[4:])  # Path relative to home
    )
Ejemplo n.º 9
0
 def __init__(self, session, node, parent):
     """Set up the part-of-speech tagger used by this normalizer.

     If the 'taggerClass' setting is given, that class is looked up on
     nltk.tag and instantiated; otherwise the standard NLTK tagger is
     loaded, downloading its model first if it is not yet installed.
     The tagger is stored as self.tagger and the 'justPos' setting
     (default 0) as self.justPos.

     Raises:
         MissingDependencyException: nltk is unavailable.
         ConfigFileException: nltk.tag has no class named by 'taggerClass'.
     """
     PosNormalizer.__init__(self, session, node, parent)
     if nltk is None:
         raise MissingDependencyException(self.objectType, 'nltk')
     cls = self.get_setting(session, 'taggerClass', None)
     if cls is not None:
         try:
             taggerClass = getattr(nltk.tag, cls)
         except AttributeError:
             raise ConfigFileException("nltk.tag does not define class "
                                       "{0} as specified in 'taggerClass' "
                                       "setting for {1} object with id {2}"
                                       "".format(cls,
                                                 self.__class__.__name__,
                                                 self.id))
         else:
             self.tagger = taggerClass()
     else:
         # Use standard tagger
         try:
             self.tagger = nltk.tag.load(nltk.tag._POS_TAGGER)
         except LookupError:
             # Model not installed yet: fetch it, then load it. Without
             # the second load self.tagger would never be assigned.
             nltk.download('maxent_treebank_pos_tagger')
             self.tagger = nltk.tag.load(nltk.tag._POS_TAGGER)
     self.justPos = self.get_setting(session, 'justPos', 0)
Ejemplo n.º 10
0
def _rods_env_value(env, getter, attribute):
    """Return one setting from an iRODS environment object.

    The accessor method named by getter is tried first; if that raises
    AttributeError the plain attribute named by attribute is read instead.
    """
    try:
        return getattr(env, getter)()
    except AttributeError:
        return getattr(env, attribute)


def main(argv=None):
    """Load data into a Cheshire3 database based on parameters in argv.

    When argv is None the arguments are taken from the command line by the
    module-level argparser. If no database is given, one is discovered from
    the current working directory.

    Data arguments that are not 'irods://' URLs are turned into one using
    the user, host, port and working collection reported by
    irods.getRodsEnv(). Each argument is then loaded into the database's
    'defaultDocumentFactory' and processed by its 'buildIndexWorkflow'.

    Returns:
        1 if no database could be identified, 2 if the identified database
        does not exist; otherwise falls off the end (returns None).

    Raises:
        MissingDependencyException: the irods module is unavailable, or
            loading a data argument needs an unavailable dependency.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    if irods is None:
        raise MissingDependencyException('icheshire3-load script',
                                         'irods (PyRods)'
                                         )
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e) instead of e.message: the attribute is deprecated,
            # absent on Python 3 and empty for multi-argument exceptions
            server.log_critical(session, str(e))
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid))
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        return 2
    else:
        # Allow for multiple data arguments
        docFac = db.get_object(session, 'defaultDocumentFactory')
        for dataArg in args.data:
            if dataArg.startswith('irods://'):
                # Already a URL; it is parsed but used as given
                parsed = urlsplit(dataArg)
            else:
                # Examine current environment
                status, myEnv = irods.getRodsEnv()
                host = _rods_env_value(myEnv, 'getRodsHost', 'rodsHost')
                # Previously the accessor's result was discarded, leaving
                # 'port' undefined whenever getRodsPort() existed
                port = _rods_env_value(myEnv, 'getRodsPort', 'rodsPort')
                username = _rods_env_value(myEnv,
                                           'getRodsUserName',
                                           'rodsUserName')
                netloc = '{0}@{1}:{2}'.format(username, host, port)
                # Previously assigned to a misspelt name, leaving 'cwd'
                # undefined whenever getRodsCwd() existed
                cwd = _rods_env_value(myEnv, 'getRodsCwd', 'rodsCwd')
                path = '/'.join([cwd, dataArg])
                parsed = SplitResult('irods', netloc, path, None, None)
                dataArg = urlunsplit(parsed)
            server.log_debug(session, dataArg)
            # Any format not starting with 'i' is replaced by 'irods'
            if args.format is None or not args.format.startswith('i'):
                fmt = 'irods'
            else:
                fmt = args.format
            server.log_debug(session, fmt)
            try:
                docFac.load(session, dataArg,
                            args.cache, fmt, args.tagname, args.codec)
            except MissingDependencyException as e:
                server.log_critical(session, e.reason)
                # Re-raise naming this script (consistent with the irods
                # check above) as the thing that needs the dependency
                missingDependencies = e.dependencies
                raise MissingDependencyException('icheshire3-load script',
                                                 missingDependencies)
            # Index what was just loaded before moving to the next argument
            wf = db.get_object(session, 'buildIndexWorkflow')
            wf.process(session, docFac)
Ejemplo n.º 11
0
 def __init__(self, session, config, parent=None):
     """Run the Transformer initializer, then unconditionally fail.

     Raises MissingDependencyException naming "rdflib" on every call.
     """
     Transformer.__init__(self, session, config, parent=parent)
     missing = "rdflib"
     raise MissingDependencyException(self.objectType, missing)
Ejemplo n.º 12
0
 def __init__(self, session, config, parent=None):
     """Run the BaseParser initializer, then unconditionally fail.

     Raises MissingDependencyException naming 'pyRdfa' on every call.
     """
     BaseParser.__init__(self, session, config, parent)
     missing = 'pyRdfa'
     raise MissingDependencyException(self.objectType, missing)
Ejemplo n.º 13
0
 def __init__(self, session, config, parent):
     """Run the ClassificationPreParser initializer, then always fail.

     Raises MissingDependencyException naming "bpnn" on every call.
     """
     ClassificationPreParser.__init__(self, session, config, parent)
     missing = "bpnn"
     raise MissingDependencyException(self.objectType, missing)
Ejemplo n.º 14
0
 def __init__(self, session, config, parent):
     """Run the SimpleExtractor initializer, then unconditionally fail.

     Raises MissingDependencyException naming 'rdflib' on every call.
     """
     SimpleExtractor.__init__(self, session, config, parent)
     missing = 'rdflib'
     raise MissingDependencyException(self.objectType, missing)
Ejemplo n.º 15
0
 def __init__(self, session, config, parent):
     """Unconditionally fail without initializing any base class.

     Raises MissingDependencyException naming "bzip2", reported against
     this object's class name, on every call.
     """
     owner = self.__class__.__name__
     raise MissingDependencyException(owner, "bzip2")
Ejemplo n.º 16
0
 def __init__(self, session, config, parent):
     """Run the SimpleWorkflow initializer, then unconditionally fail.

     Raises MissingDependencyException naming "lucene" on every call.
     """
     SimpleWorkflow.__init__(self, session, config, parent)
     missing = "lucene"
     raise MissingDependencyException(self.objectType, missing)
Ejemplo n.º 17
0
 def __init__(self, session, config, parent):
     """Run the BaseParser initializer, then unconditionally fail.

     Raises MissingDependencyException naming 'rdflib' on every call.
     """
     BaseParser.__init__(self, session, config, parent)
     missing = 'rdflib'
     raise MissingDependencyException(self.objectType, missing)
Ejemplo n.º 18
0
 def __init__(self, session, config, parent):
     """Run the IndexStore initializer, then unconditionally fail.

     Raises MissingDependencyException naming "lucene" on every call.
     """
     IndexStore.__init__(self, session, config, parent)
     missing = "lucene"
     raise MissingDependencyException(self.objectType, missing)