def setUp(self):
        """Open the test connection and store it on ``self.db_test``.

        Connects to the ``dev`` PostgreSQL database on localhost and asks the
        DAL to import the table definitions found in ``../databases``
        (relative to the current directory).
        """
        # NOTE(review): credentials are hard-coded for a local database.
        credentials = ('postgres', '1234')
        db_postgres_url = 'postgres://{}:{}@localhost/dev'.format(*credentials)

        path_to_database = path.join(path.curdir, "../databases")
        self.db_test = DAL(db_postgres_url, folder=path_to_database)
        self.db_test.import_table_definitions(path_to_database)
Example #2
0
	def __init__(self, path = None, isInit = False, isReset = False):
		"""Open the database connection described by ITM_DB.CONNECTION.

		path    -- passed to DAL as ``folder``; left out of the call when None
		isInit  -- enables ``migrate`` and turns ``lazy_tables`` off
		isReset -- stored on the instance
		"""
		self.isInit = isInit
		self.isReset = isReset

		# lazy_tables is always the opposite of migrate: both follow isInit.
		options = { 'lazy_tables' : not self.isInit, 'migrate' : self.isInit }
		if path is not None:
			options['folder'] = path
		self.db = DAL(ITM_DB.CONNECTION, **options)
Example #3
0
	def __init__(self, path = None, isInit = False, isImport = False, isReset = False):
		"""Open the database connection described by Corpus_DB.CONNECTION.

		path     -- passed to DAL as ``folder``; left out of the call when None
		isInit   -- together with isImport, enables ``migrate_enabled``
		isImport -- together with isInit, enables ``migrate_enabled``
		isReset  -- stored on the instance
		"""
		self.isInit = isInit
		self.isImport = isImport
		self.isReset = isReset

		# Migrations are enabled (and lazy tables disabled) for init or import runs.
		isInitOrImport = self.isInit or self.isImport
		options = { 'lazy_tables' : not isInitOrImport, 'migrate_enabled' : isInitOrImport }
		if path is not None:
			options['folder'] = path
		self.db = DAL(Corpus_DB.CONNECTION, **options)
Example #4
0
 def __init__(self, path=None, isInit=False):
     """Open the database connection described by BOW_DB.CONNECTION.

     path   -- passed to DAL as ``folder``; left out of the call when None
     isInit -- enables ``migrate_enabled`` and turns ``lazy_tables`` off
     """
     self.isInit = isInit
     options = {'lazy_tables': not self.isInit,
                'migrate_enabled': self.isInit}
     if path is not None:
         options['folder'] = path
     self.db = DAL(BOW_DB.CONNECTION, **options)
Example #5
0
class ITM_DB():
	"""Wrapper around the 'itm.db' SQLite database and its key/value options table.

	Used as a context manager: entering defines the options table (and calls
	Reset when isReset is set); exiting commits the transaction.
	"""
	FILENAME = 'itm.db'
	CONNECTION = 'sqlite://{}'.format(FILENAME)
	DEFAULT_OPTIONS = {}

	def __init__(self, path = None, isInit = False, isReset = False):
		"""Open the connection.

		path    -- passed to DAL as ``folder``; left out of the call when None
		isInit  -- enables ``migrate`` and turns ``lazy_tables`` off
		isReset -- when true, Reset() is called on entering the context
		"""
		self.isInit = isInit
		self.isReset = isReset

		options = { 'lazy_tables' : not self.isInit, 'migrate' : self.isInit }
		if path is not None:
			options['folder'] = path
		self.db = DAL(ITM_DB.CONNECTION, **options)

	def __enter__(self):
		"""Define the options table, optionally reset, and return self."""
		self.DefineOptionsTable()
		if self.isReset:
			self.Reset()
		return self

	def __exit__(self, type, value, traceback):
		"""Commit the transaction; the exception arguments are not inspected."""
		self.db.commit()

################################################################################

	def DefineOptionsTable(self):
		"""Define the 'options' table and fill it with defaults while it is empty."""
		self.db.define_table( 'options',
			Field( 'key'  , 'string', required = True, unique = True ),
			Field( 'value', 'string', required = True ),
			migrate = self.isInit
		)
		# Defaults are only written into an empty table.
		if self.db(self.db.options).count() != 0:
			return
		for key, value in ITM_DB.DEFAULT_OPTIONS.iteritems():
			self.db.options.insert( key = key, value = value )

	def SetOption(self, key, value):
		"""Store ``value`` under ``key``, updating the row if one already exists."""
		existing = self.db( self.db.options.key == key ).select().first()
		if not existing:
			self.db.options.insert( key = key, value = value )
			return
		existing.update_record( value = value )

	def GetOption(self, key):
		"""Return the value stored under ``key``, or None when there is no such row."""
		row = self.db( self.db.options.key == key ).select( self.db.options.value ).first()
		return row.value if row else None

	def Reset(self):
		"""Do nothing; this class has no data tables to clear."""
		pass
Example #6
0
    def __init__(self, path=None, isInit=False, isImport=False, isReset=False):
        """Open the database connection described by Corpus_DB.CONNECTION.

        path     -- passed to DAL as ``folder``; left out of the call when None
        isInit   -- together with isImport, enables ``migrate_enabled``
        isImport -- together with isInit, enables ``migrate_enabled``
        isReset  -- stored on the instance
        """
        self.isInit = isInit
        self.isImport = isImport
        self.isReset = isReset

        # Migrations are enabled (and lazy tables disabled) for init or import.
        isInitOrImport = self.isInit or self.isImport
        options = {'lazy_tables': not isInitOrImport,
                   'migrate_enabled': isInitOrImport}
        if path is not None:
            options['folder'] = path
        self.db = DAL(Corpus_DB.CONNECTION, **options)
Example #7
0
def copy_db(env, db_name='db', db_link='sqlite:memory:'):
    """Build a new DAL at ``db_link`` holding copies of an existing DAL's tables.

    env     -- mapping that contains the source database under ``db_name``
    db_name -- key of the source database inside ``env``
    db_link -- connection URI for the new database

    Returns the new DAL. Raises ImportError when ``gluon`` cannot be imported.
    """
    try:
        from gluon.sql import DAL
    except ImportError:
        raise ImportError("""
No module named `gluon`, unable to find `sql.DAL`

Make sure you have the correct URI that is the main web2py directory.

This should include your `applications` and `gluon` directories.""")

    test_db = DAL(db_link)
    source_db = env[db_name]
    for tablename in source_db.tables:
        # Fields are copied so the new table does not share Field objects.
        copied_fields = [copy(field) for field in source_db[tablename]]
        test_db.define_table(tablename, *copied_fields, migrate=True)
    return test_db
Example #8
0
def copy_db(env, db_name='db', db_link='sqlite:memory:'):
    """Build a new DAL at ``db_link`` holding copies of an existing DAL's tables.

    env     -- mapping that contains the source database under ``db_name``
    db_name -- key of the source database inside ``env``
    db_link -- connection URI for the new database

    Returns the new DAL. Raises ImportError when ``gluon`` cannot be imported.
    """
    try:
        from gluon.sql import DAL
    except ImportError:
        raise ImportError("""
No module named `gluon`, unable to find `sql.DAL`

Make sure you have the correct URI that is the main web2py directory.

This should include your `applications` and `gluon` directories.""")

    test_db = DAL(db_link)
    source_db = env[db_name]
    for tablename in source_db.tables:
        # Fields are copied so the new table does not share Field objects.
        copied_fields = [copy(field) for field in source_db[tablename]]
        test_db.define_table(tablename, *copied_fields, migrate=True)
    return test_db
Example #9
0
	def setUp(self):
		"""Open the test connection and store it on ``self.db_test``.

		Connects to the ``dev`` PostgreSQL database on localhost and asks the
		DAL to import the table definitions found in ``../databases``
		(relative to the current directory).
		"""
		# NOTE(review): credentials are hard-coded for a local database.
		credentials = ('postgres', '1234')
		db_postgres_url = 'postgres://{}:{}@localhost/dev'.format(*credentials)

		path_to_database = path.join(path.curdir, "../databases")
		self.db_test = DAL(db_postgres_url, folder=path_to_database)
		self.db_test.import_table_definitions(path_to_database)
class TestModuleSetup(unittest.TestCase):
    """Test case base that opens a DAL connection to the local 'dev' database."""

    def setUp(self):
        """Open the test connection and store it on ``self.db_test``.

        Connects to the ``dev`` PostgreSQL database on localhost and asks the
        DAL to import the table definitions found in ``../databases``
        (relative to the current directory).
        """
        # NOTE(review): credentials are hard-coded for a local database.
        credentials = ('postgres', '1234')
        db_postgres_url = 'postgres://{}:{}@localhost/dev'.format(*credentials)

        path_to_database = path.join(path.curdir, "../databases")
        self.db_test = DAL(db_postgres_url, folder=path_to_database)
        self.db_test.import_table_definitions(path_to_database)

    def limpa_dados_tabela(self, nome_tabela):
        """Delete every row of the table named ``nome_tabela`` and commit.

        The table name is concatenated straight into the SQL text, so it must
        come from trusted test code.
        """
        delete_statement = 'delete from ' + nome_tabela
        self.db_test.executesql(delete_statement)
        self.db_test.commit()
Example #11
0
class TestModuleSetup(unittest.TestCase):
	"""Test case base that opens a DAL connection to the local 'dev' database."""

	def setUp(self):
		"""Open the test connection and store it on ``self.db_test``.

		Connects to the ``dev`` PostgreSQL database on localhost and asks the
		DAL to import the table definitions found in ``../databases``
		(relative to the current directory).
		"""
		# NOTE(review): credentials are hard-coded for a local database.
		credentials = ('postgres', '1234')
		db_postgres_url = 'postgres://{}:{}@localhost/dev'.format(*credentials)

		path_to_database = path.join(path.curdir, "../databases")
		self.db_test = DAL(db_postgres_url, folder=path_to_database)
		self.db_test.import_table_definitions(path_to_database)

	def limpa_dados_tabela(self, nome_tabela):
		"""Delete every row of the table named ``nome_tabela`` and commit.

		The table name is concatenated straight into the SQL text, so it must
		come from trusted test code.
		"""
		delete_statement = 'delete from ' + nome_tabela
		self.db_test.executesql(delete_statement)
		self.db_test.commit()
		
Example #12
0
	def __init__(self, path = None, isInit = False):
		"""Open the database connection described by Corpus_DB.CONNECTION.

		path   -- passed to DAL as ``folder``; left out of the call when None
		isInit -- enables ``migrate_enabled`` and turns ``lazy_tables`` off
		"""
		self.isInit = isInit
		options = { 'lazy_tables' : not self.isInit, 'migrate_enabled' : self.isInit }
		if path is not None:
			options['folder'] = path
		self.db = DAL(Corpus_DB.CONNECTION, **options)
Example #13
0
"""
This file is part of the Nervatura Framework
http://www.nervatura.com
Copyright © 2011-2015, Csaba Kappel
License: LGPLv3
http://www.nervatura.com/nerva2py/default/licenses
"""

# The condition is the constant 0, so nothing in this branch ever runs.
# NOTE(review): presumably it exists only so editors / static analysers can
# resolve the names used below (request, session, response, T, db, URL) — confirm.
if 0:
  global request; request = globals.Request()
  from gluon.globals import Session
  global session; session = Session()
  global response; response = globals.Response()
  import gluon.languages.translator as T
  from gluon.sql import DAL
  global db; db = DAL()
  from gluon.html import URL
  
from nerva2py.nervastore import NervaStore
from gluon.html import TABLE, TR, TD
from gluon.sqlhtml import SPAN, A

# Example connection URI formats:
#postgres://username:password@localhost/database
#mysql://username:password@localhost/database
#sqlite://database.db

# URI of the SQLite database this script connects to.
conStr="sqlite://demo.db"
# NOTE(review): the role of the trailing None argument is not visible here — check NervaStore.
ns = NervaStore(request, session, T, None)
ns.engine = "sqlite"
# Connect to conStr with pool_size=0 and createdb=False.
ns.connect.setConnect(uri=conStr, pool_size=0, createdb=False)
if ns.db:
Example #14
0
class MultipleLDA_DB():
	"""Access layer for the 'multiple_lda.db' SQLite database.

	Used as a context manager: entering defines every table (and, when isInit
	is set, creates the indexes and stores the default options); exiting
	commits the transaction.
	"""
	FILENAME = 'multiple_lda.db'
	CONNECTION = 'sqlite://{}'.format(FILENAME)
	DEFAULT_OPTIONS = {
		'max_co_topic_count' : 10000       # Number of topic pairs to store
	}

	# Tables emptied by Reset(), in deletion order.
	_DATA_TABLES = (
		'terms', 'docs', 'topics',
		'term_topic_matrix', 'doc_topic_matrix',
		'topic_cossim', 'topic_kldiv', 'topic_rdp',
	)

	def __init__(self, path = None, isInit = False):
		"""Open the connection.

		path   -- passed to DAL as ``folder``; left out of the call when None
		isInit -- enables ``migrate`` and turns ``lazy_tables`` off; also
		          triggers index creation and default options on entry
		"""
		self.isInit = isInit
		if path is not None:
			self.db = DAL(MultipleLDA_DB.CONNECTION, lazy_tables = not self.isInit, migrate = self.isInit, folder = path)
		else:
			self.db = DAL(MultipleLDA_DB.CONNECTION, lazy_tables = not self.isInit, migrate = self.isInit)

	def __enter__(self):
		"""Define all tables and return self."""
		self.DefineOptionsTable()
		self.DefineDimensionTables()
		self.DefineMatrixTables()
		self.DefineStatsTables()
		return self

	def __exit__(self, type, value, traceback):
		"""Commit the transaction; the exception arguments are not inspected."""
		self.db.commit()

################################################################################

	def _CreateIndexes(self, table, indexes):
		"""Create indexes on ``table``; does nothing unless isInit is set.

		indexes -- sequence of (index_name, column_list, is_unique) tuples,
		           created in the given order
		"""
		if not self.isInit:
			return
		for name, columns, unique in indexes:
			statement = 'CREATE {}INDEX IF NOT EXISTS {} ON {} ({});'.format(
				'UNIQUE ' if unique else '', name, table, columns )
			self.db.executesql( statement )

################################################################################

	def DefineOptionsTable(self):
		"""Define the 'options' table and, when isInit is set, store the defaults."""
		self.db.define_table( 'options',
			Field( 'key'  , 'string', required = True, unique = True ),
			Field( 'value', 'string', required = True ),
			migrate = self.isInit
		)
		if self.isInit:
			# Use this class's own defaults (previously read from LDA_DB by mistake).
			# items() works on both Python 2 and Python 3.
			for key, value in MultipleLDA_DB.DEFAULT_OPTIONS.items():
				self.SetOption( key, value )

	def SetOption(self, key, value):
		"""Store ``value`` under ``key``, updating the row if one already exists."""
		where = self.db.options.key == key
		if self.db( where ).count() > 0:
			self.db( where ).update( value = value )
		else:
			self.db.options.insert( key = key, value = value )

	def GetOption(self, key):
		"""Return the value stored under ``key``, or None when there is no such row."""
		where = self.db.options.key == key
		keyValue = self.db( where ).select( self.db.options.value ).first()
		if keyValue:
			return keyValue.value
		else:
			return None

################################################################################

	def DefineDimensionTables(self):
		"""Define the 'terms', 'docs' and 'topics' tables with their indexes."""
		self.db.define_table( 'terms',
			Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'term_text' , 'string' , required = True, unique = True ),
			Field( 'term_freq' , 'double' , required = True ),
			Field( 'rank'      , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes( 'terms', [
			( 'terms_index', 'term_index', True  ),
			( 'terms_text' , 'term_text' , True  ),
			( 'terms_freq' , 'term_freq' , False ),
			( 'terms_rank' , 'rank'      , False ),
		] )

		self.db.define_table( 'docs',
			Field( 'doc_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'doc_id'   , 'string' , required = True, unique = True ),
			Field( 'doc_freq' , 'double' , required = True ),
			Field( 'rank'     , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes( 'docs', [
			( 'docs_index', 'doc_index', True  ),
			( 'docs_id'   , 'doc_id'   , True  ),
			( 'docs_freq' , 'doc_freq' , False ),
			( 'docs_rank' , 'rank'     , False ),
		] )

		self.db.define_table( 'topics',
			Field( 'entry_index', 'integer'     , required = True, default = -1 ),
			Field( 'topic_index', 'integer'     , required = True, default = -1 ),
			Field( 'topic_freq' , 'double'      , required = True ),
			Field( 'topic_label', 'string'      , required = True ),
			Field( 'topic_desc' , 'string'      , required = True ),
			Field( 'top_terms'  , 'list:integer', required = True ),
			Field( 'top_docs'   , 'list:integer', required = True ),
			Field( 'rank'       , 'integer'     , required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes( 'topics', [
			( 'topics_indexes'  , 'entry_index, topic_index', True  ),
			( 'topics_freq'     , 'topic_freq'              , False ),
			( 'topics_rank'     , 'rank'                    , False ),
			( 'topics_freqEntry', 'entry_index, topic_freq' , False ),
			( 'topics_rankEntry', 'entry_index, rank'       , False ),
		] )

	def _DefineMatrixTable(self, kind):
		"""Define '<kind>_topic_matrix' and its indexes; kind is 'term' or 'doc'."""
		table = '{}_topic_matrix'.format( kind )
		prefix = '{}_topic'.format( kind )
		column = '{}_index'.format( kind )
		self.db.define_table( table,
			Field( 'entry_index', 'integer', required = True, default = -1 ),
			Field( column       , 'integer', required = True, default = -1 ),
			Field( 'topic_index', 'integer', required = True, default = -1 ),
			Field( 'value', 'double' , required = True ),
			Field( 'rank' , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes( table, [
			( prefix + '_indexes'                , 'entry_index, ' + column + ', topic_index', True  ),
			( prefix + '_value'                  , 'value'                                   , False ),
			( prefix + '_rank'                   , 'rank'                                    , False ),
			( prefix + '_' + kind + 'index'      , column                                    , False ),
			( prefix + '_topicindex'             , 'topic_index'                             , False ),
			( prefix + '_valueEntry'             , 'entry_index, value'                      , False ),
			( prefix + '_rankEntry'              , 'entry_index, rank'                       , False ),
			( prefix + '_' + kind + 'indexEntry' , 'entry_index, ' + column                  , False ),
			( prefix + '_topicindexEntry'        , 'entry_index, topic_index'                , False ),
		] )

	def DefineMatrixTables(self):
		"""Define 'term_topic_matrix' and 'doc_topic_matrix' with their indexes."""
		self._DefineMatrixTable( 'term' )
		self._DefineMatrixTable( 'doc' )

	def _DefinePairwiseTable(self, table):
		"""Define a topic-pair table named ``table`` and its indexes.

		Each row is keyed by a (first entry, first topic, second entry,
		second topic) tuple and carries a value and a rank.
		"""
		first = 'first_entry_index, first_topic_index'
		second = 'second_entry_index, second_topic_index'
		self.db.define_table( table,
			Field( 'first_entry_index' , 'integer', required = True, default = -1 ),
			Field( 'first_topic_index' , 'integer', required = True, default = -1 ),
			Field( 'second_entry_index', 'integer', required = True, default = -1 ),
			Field( 'second_topic_index', 'integer', required = True, default = -1 ),
			Field( 'value', 'double' , required = True ),
			Field( 'rank' , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes( table, [
			( table + '_indexes'    , first + ', ' + second, True  ),
			( table + '_value'      , 'value'              , False ),
			( table + '_rank'       , 'rank'               , False ),
			( table + '_valueFirst' , first + ', value'    , False ),
			( table + '_rankFirst'  , first + ', rank'     , False ),
			( table + '_valueSecond', second + ', value'   , False ),
			( table + '_rankSecond' , second + ', rank'    , False ),
		] )

	def DefineStatsTables(self):
		"""Define 'topic_cossim', 'topic_kldiv' and 'topic_rdp' with their indexes."""
		for table in ( 'topic_cossim', 'topic_kldiv', 'topic_rdp' ):
			self._DefinePairwiseTable( table )

################################################################################

	def Reset(self):
		"""Delete every row from all data tables; the options table is left alone."""
		for table in MultipleLDA_DB._DATA_TABLES:
			self.db.executesql( 'DELETE FROM {};'.format( table ) )
Example #15
0
# Application-wide settings and database connections.
DEMO_MODE = False
response.google_analytics_id = None

if request.env.web2py_runtime_gae:
    # Running on Google App Engine: data and sessions share the datastore.
    ename = "google_datastore"
    # db = DAL('google:datastore://storage', migrate=False, fake_migrate=False)
    db = DAL("google:datastore", adapter_args={"ndb_settings": None, "use_ndb": False})
    session.connect(request, response, db=db)
    # from gluon.contrib.memdb import MEMDB
    # from google.appengine.api.memcache import Client
    # session.connect(request, response, db = MEMDB(Client()))
else:
    # Use the first existing candidate directory as the data folder; None if
    # none of them exists.
    request.data_folder = next(
        (folder for folder in (
            os.path.join("..", "..", "data"),
            os.path.join("..", "..", "databases"),
            os.path.join("..", "data"),
            os.path.join("..", "databases"),
        ) if os.path.isdir(folder)),
        None)

    ename = "sqlite"
    db = DAL("sqlite://storage.sqlite", migrate=False, fake_migrate=False, folder=request.data_folder)
    # Sessions are kept in their own SQLite file.
    session_db = DAL("sqlite://session.sqlite", folder=request.data_folder)
    session.connect(request, response, db=session_db)
    # Python 2 idiom: reload sys to regain setdefaultencoding, then switch to UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")  # @UndefinedVariable

# Generic views are only enabled for local requests.
response.generic_patterns = ["*"] if request.is_local else []
try:
Example #16
0
class Corpus_DB():
    """Access layer for the 'corpus.db' SQLite database (options, models, corpus, metadata)."""
    # SQLite file name and the DAL connection URI built from it.
    FILENAME = 'corpus.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)
    # Header names accepted by ImportFromSpreadsheet for the document-id and
    # document-content columns when the caller does not name them explicitly.
    DOC_IDS = ['doc_id', 'docid']
    DOC_CONTENTS = ['doc_content', 'doccontent', 'docbody', 'doc_body']
    # Option values written to the 'options' table by DefineOptionsTable when
    # isInit is set.
    DEFAULT_OPTIONS = {
        'token_regex': r'\w{3,}',
        'min_freq': 5,
        'min_doc_freq': 3,
        'max_freq_count': 4000,
        'max_co_freq_count': 160000,
        'max_g2_count': 160000
    }

    def __init__(self, path=None, isInit=False, isImport=False, isReset=False):
        self.isInit = isInit
        self.isImport = isImport
        self.isReset = isReset

        isInitOrImport = self.isInit or self.isImport
        if path is not None:
            self.db = DAL(Corpus_DB.CONNECTION,
                          lazy_tables=not isInitOrImport,
                          migrate_enabled=isInitOrImport,
                          folder=path)
        else:
            self.db = DAL(Corpus_DB.CONNECTION,
                          lazy_tables=not isInitOrImport,
                          migrate_enabled=isInitOrImport)

    def __enter__(self):
        """Define all tables in order, optionally reset, and return self.

        When isReset is set, Reset() runs after the options, models, corpus
        and metadata tables are defined and before the statistics and
        temporary tables are defined.
        """
        self.DefineOptionsTable()
        self.DefineModelsTable()
        self.DefineCorpusTable()
        self.DefineMetadataTables()
        if self.isReset:
            self.Reset()
        self.DefineTermStatsTables()
        self.DefineTermCoStatsTables()
        self.DefineSentenceCoStatsTables()
        self.DefineTemporaryTable()
        return self

    def __exit__(self, type, value, traceback):
        """Rebuild the text-search table (import runs only) and commit.

        The exception arguments are not inspected, so the commit happens
        whether or not the ``with`` body raised.
        """
        self.DefineCorpusTextSearch()
        self.db.commit()

################################################################################

    def DefineOptionsTable(self):
        self.db.define_table('options',
                             Field('key', 'string', required=True,
                                   unique=True),
                             Field('value', 'string', required=True),
                             migrate=self.isImport)
        if self.isInit:
            for key, value in Corpus_DB.DEFAULT_OPTIONS.iteritems():
                keyValue = self.db(self.db.options.key == key).select().first()
                if keyValue:
                    keyValue.update_record(value=value)
                else:
                    self.db.options.insert(key=key, value=value)

    def SetOption(self, key, value):
        keyValue = self.db(self.db.options.key == key).select().first()
        if keyValue:
            keyValue.update_record(value=value)
        else:
            self.db.options.insert(key=key, value=value)

    def GetOption(self, key):
        keyValue = self.db(self.db.options.key == key).select(
            self.db.options.value).first()
        if keyValue:
            return keyValue.value
        else:
            return None

    def DefineModelsTable(self):
        self.db.define_table('models',
                             Field('model_key',
                                   'string',
                                   required=True,
                                   unique=True),
                             Field('model_desc', 'string', required=True),
                             migrate=self.isImport)

    def AddModel(self, model_key, model_desc):
        keyDesc = self.db(
            self.db.models.model_key == model_key).select().first()
        if keyDesc:
            keyDesc.update_record(model_desc=model_desc)
        else:
            self.db.models.insert(model_key=model_key, model_desc=model_desc)

    def GetModels(self):
        models = self.db(self.db.models).select(self.db.models.model_key)
        return [model.model_key for model in models]

    def GetModelDescription(self, model_key):
        model = self.db(self.db.models.model_key == model_key).select().first()
        if model:
            return model.model_desc
        else:
            return None

################################################################################

    def DefineCorpusTable(self):
        self.db.define_table('corpus',
                             Field('doc_index',
                                   'integer',
                                   required=True,
                                   unique=True,
                                   default=-1),
                             Field('doc_id',
                                   'string',
                                   required=True,
                                   unique=True),
                             Field('doc_content', 'text', required=True),
                             migrate=self.isImport)
        if self.isImport:
            self.db.executesql(
                'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_index ON corpus (doc_index);'
            )
            self.db.executesql(
                'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_id    ON corpus (doc_id);'
            )

    def DefineMetadataTables(self):
        self.db.define_table('fields',
                             Field('field_index',
                                   'integer',
                                   required=True,
                                   unique=True,
                                   default=-1),
                             Field('field_name',
                                   'string',
                                   required=True,
                                   unique=True),
                             Field('field_type', 'string', required=True),
                             migrate=self.isImport)
        if self.isImport:
            self.db.executesql(
                'CREATE UNIQUE INDEX IF NOT EXISTS field_index ON fields (field_index);'
            )
            self.db.executesql(
                'CREATE UNIQUE INDEX IF NOT EXISTS field_name  ON fields (field_name);'
            )
            self.db.executesql(
                'CREATE        INDEX IF NOT EXISTS field_type  ON fields (field_type);'
            )

        self.db.define_table('metadata',
                             Field('doc_index',
                                   'integer',
                                   required=True,
                                   default=-1),
                             Field('field_index',
                                   'integer',
                                   required=True,
                                   default=-1),
                             Field('value', 'text', required=True),
                             migrate=self.isImport)
        if self.isImport:
            self.db.executesql(
                'CREATE UNIQUE INDEX IF NOT EXISTS metadata_indexes     ON metadata (doc_index, field_index);'
            )
            self.db.executesql(
                'CREATE        INDEX IF NOT EXISTS metadata_doc_index   ON metadata (doc_index);'
            )
            self.db.executesql(
                'CREATE        INDEX IF NOT EXISTS metadata_field_index ON metadata (field_index);'
            )

################################################################################

    def DefineCorpusTextSearch(self):
        if self.isImport:
            self.db.executesql('DROP TABLE IF EXISTS corpus_search;')
            self.db.executesql(
                'CREATE VIRTUAL TABLE corpus_search USING fts3 (doc_content TEXT);'
            )
            self.db.executesql(
                'INSERT INTO corpus_search (rowid, doc_content) SELECT doc_index, doc_content FROM corpus;'
            )

################################################################################

    def ImportFromFile(self, filename):
        """
		filename = A plain-text file (utf-8 encoded) containing one document per line
		"""
        def ReadFile():
            with open(filename, 'r') as f:
                for index, line in enumerate(f):
                    doc_index = index
                    values = line.decode('utf-8',
                                         'ignore').rstrip('\n').split('\t')
                    if len(values) == 1:
                        doc_id = 'doc{}'.format(doc_index + 1)
                        doc_content = values[0]
                    else:
                        doc_id = values[0]
                        doc_content = values[1]
                    yield {
                        'doc_index': doc_index,
                        'doc_id': doc_id,
                        'doc_content': doc_content.encode('utf-8')
                    }

        self.db.corpus.bulk_insert(ReadFile())

    def ImportFromFolder(self, glob_pattern):
        """
		glob_pattern = A folder of files or a glob pattern for list of files (utf-8 encoded, one document per file)
		"""
        def ReadFolder():
            filenames = sorted(glob.glob(glob_pattern))
            for index, filename in enumerate(filenames):
                doc_index = index
                doc_id = filename
                with open(filename, 'r') as f:
                    doc_content = f.read().decode('utf-8', 'ignore')
                    yield {
                        'doc_index': doc_index,
                        'doc_id': doc_id,
                        'doc_content': doc_content.encode('utf-8')
                    }

        self.db.corpus.bulk_insert(ReadFolder())

    def ImportFromSpreadsheet(self,
                              filename,
                              id_key=None,
                              content_key=None,
                              is_csv=False):
        """
        Import documents, metadata fields and metadata values from a spreadsheet.

        filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with a header row containing column names)
        id_key = Name of the column containing unique document IDs (defaults to the names in Corpus_DB.DOC_IDS)
        content_key = Name of the column containing the document contents (defaults to the names in Corpus_DB.DOC_CONTENTS)
        is_csv = True to parse the file with UnicodeReader; False to split each line on tabs

        Column names are matched case-insensitively. Every column that is
        neither the ID nor the content column becomes a metadata field whose
        type starts as 'integer' and is widened to 'double' and then 'string'
        as soon as one of its values fails to parse. A row that supplies no
        ID cell gets the ID 'doc<N>', where N is its 1-based row number.
        """
        # Header cells are lower-cased before the lookup, so the keys are
        # lower-cased as well; otherwise a caller-supplied key such as
        # 'DocID' could never match any column.
        doc_id_keys = frozenset(
            key.lower()
            for key in ([id_key] if id_key is not None else Corpus_DB.DOC_IDS))
        doc_content_keys = frozenset(
            key.lower()
            for key in ([content_key] if content_key is not None
                        else Corpus_DB.DOC_CONTENTS))
        # Filled as a side effect of consuming ReadSpreadsheet().
        field_indexes = []
        field_names = []
        field_types = []
        metadata = []

        def ReadCSV():
            with open(filename, 'r') as f:
                reader = UnicodeReader(f)
                for row in reader:
                    yield row

        def ReadTSV():
            # Binary mode: every line is decoded explicitly, so the raw
            # bytes must reach decode() untouched.
            with open(filename, 'rb') as f:
                for line in f:
                    # Strip the carriage return as well, so that CRLF files
                    # do not leave a stray '\r' on the last column.
                    yield line.decode('utf-8',
                                      'ignore').rstrip('\r\n').split('\t')

        def ReadSpreadsheet(reader):
            field_doc_id = None
            field_doc_content = None
            for row_index, values in enumerate(reader):
                if row_index == 0:
                    # Header row: locate the ID/content columns and register
                    # every remaining column as a metadata field.
                    for index, field in enumerate(values):
                        if field.lower() in doc_id_keys:
                            field_doc_id = index
                        elif field.lower() in doc_content_keys:
                            field_doc_content = index
                        else:
                            field_index = len(field_indexes)
                            field_indexes.append(field_index)
                            field_names.append(field)
                            field_types.append('integer')
                else:
                    doc_index = row_index - 1
                    doc_id = 'doc{:d}'.format(doc_index + 1)
                    doc_content = ''
                    # Counts metadata cells only, mirroring the numbering
                    # used for the header row above.
                    field_index = 0
                    for index, value in enumerate(values):
                        if field_doc_id == index:
                            doc_id = value
                        elif field_doc_content == index:
                            doc_content = value
                        else:
                            metadata.append({
                                'doc_index': doc_index,
                                'field_index': field_index,
                                'value': value.encode('utf-8')
                            })

                            # [START] infer field type
                            # Types only ever widen: integer -> double -> string.
                            field_type = field_types[field_index]
                            if field_type == 'integer':
                                try:
                                    int(value)
                                except ValueError:
                                    field_type = 'double'
                            if field_type == 'double':
                                try:
                                    float(value)
                                except ValueError:
                                    field_type = 'string'
                            field_types[field_index] = field_type
                            # [END] infer field type

                            field_index += 1
                    yield {
                        'doc_index': doc_index,
                        'doc_id': doc_id,
                        'doc_content': doc_content.encode('utf-8')
                    }

        def GetFields():
            for field_index in field_indexes:
                field_name = field_names[field_index]
                field_type = field_types[field_index]
                yield {
                    'field_index': field_index,
                    'field_name': field_name,
                    'field_type': field_type
                }

        reader = ReadCSV() if is_csv else ReadTSV()
        # Order matters: the corpus insert drains ReadSpreadsheet(), which is
        # what populates the field lists and the metadata rows used below.
        self.db.corpus.bulk_insert(ReadSpreadsheet(reader))
        self.db.fields.bulk_insert(GetFields())
        self.db.metadata.bulk_insert(metadata)

    def ExportToFile(self, filename):
        """
        Write every document as one "doc_id<TAB>doc_content" line.

        filename = A tab-delimited file (utf-8 encoded, without header) containing docIDs and document contents

        Documents are written in doc_index order. Runs of whitespace inside
        the content are collapsed to a single space so that each document
        stays on one line.
        """
        import io  # local import: the file-level import block is outside this method

        def ToUnicode(text):
            # Byte strings are assumed to be utf-8, the same assumption the
            # content column has always been decoded with. Decoding the ID
            # as well avoids an implicit ASCII decode (and a
            # UnicodeDecodeError for non-ASCII IDs) inside the unicode
            # format below.
            if isinstance(text, bytes):
                return text.decode('utf-8')
            return text

        def WriteFile(rows):
            m = re.compile(r'\s+')
            # io.open encodes on write, so only unicode text is passed to it.
            with io.open(filename, 'w', encoding='utf-8') as f:
                for row in rows:
                    doc_id = ToUnicode(row.doc_id)
                    doc_content = m.sub(u' ', ToUnicode(row.doc_content))
                    f.write(u'{}\t{}\n'.format(doc_id, doc_content))

        rows = self.db().select(self.db.corpus.doc_id,
                                self.db.corpus.doc_content,
                                orderby=self.db.corpus.doc_index)
        WriteFile(rows)

    def ExportToSpreadsheet(self, filename, is_csv=False):
        """
        Write the corpus and all metadata to a spreadsheet with a header row.

        filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with header) containing the text corpus and all metadata
        is_csv = True to write through UnicodeWriter; False to write tab-separated lines

        Columns are doc_id, doc_content and then one column per metadata
        field in field_index order. Whitespace runs inside the content and
        the metadata values are collapsed to a single space.
        """
        db = self.db
        extra_names = []
        for field_row in db().select(db.fields.field_name,
                                     orderby=db.fields.field_index):
            extra_names.append(field_row.field_name)
        header = ['doc_id', 'doc_content'] + extra_names
        whitespace = re.compile(r'\s+')

        def RowValues(document):
            # One output row: ID, flattened content, then the metadata cells
            # (empty string where a document has no value for a field).
            content = whitespace.sub(u' ', document.doc_content.decode('utf-8'))
            cells = [u''] * len(extra_names)
            entries = db(db.metadata.doc_index == document.doc_index).select(
                db.metadata.field_index, db.metadata.value)
            for entry in entries:
                cells[entry.field_index] = whitespace.sub(
                    u' ', entry.value.decode('utf-8'))
            return [document.doc_id, content] + cells

        documents = db().select(db.corpus.doc_index,
                                db.corpus.doc_id,
                                db.corpus.doc_content,
                                orderby=db.corpus.doc_index)

        with open(filename, 'w') as out:
            if is_csv:
                writer = UnicodeWriter(out)
                writer.writerow(header)
                for document in documents:
                    writer.writerow(RowValues(document))
            else:
                out.write(u'{}\n'.format(u'\t'.join(header)).encode('utf-8'))
                for document in documents:
                    line = u'{}\n'.format(u'\t'.join(RowValues(document)))
                    out.write(line.encode('utf-8'))

################################################################################

    def DefineTermStatsTables(self):
        """
        Define the per-term tables: term_texts, term_freqs, term_probs and
        term_doc_freqs.

        Migration and index creation happen only when self.isInit is true.
        """
        db = self.db
        initialize = self.isInit

        db.define_table(
            'term_texts',
            Field('term_index', 'integer', required=True, unique=True, default=-1),
            Field('term_text', 'string', required=True, unique=True),
            migrate=initialize)
        if initialize:
            db.executesql(
                'CREATE INDEX IF NOT EXISTS term_text_value ON term_texts (term_text);')

        # The statistics tables share one layout: a unique term index plus a
        # value and its rank, each of the latter two with its own index.
        for table in ('term_freqs', 'term_probs', 'term_doc_freqs'):
            db.define_table(
                table,
                Field('term_index', 'integer', required=True, unique=True, default=-1),
                Field('value', 'double', required=True),
                Field('rank', 'integer', required=True),
                migrate=initialize)
            if not initialize:
                continue
            for column in ('value', 'rank'):
                db.executesql(
                    'CREATE INDEX IF NOT EXISTS {0}_{1} ON {0} ({1});'.format(
                        table, column))

    def DefineTermCoStatsTables(self):
        """
        Define the term-pair tables: term_co_freqs, term_co_probs and term_g2.

        Each table stores (first_term_index, second_term_index, value, rank).
        When self.isInit is true the tables are migrated and three indexes
        are created per table: a unique one on the index pair, one on value
        and one on rank.
        """
        db = self.db
        initialize = self.isInit

        for table in ('term_co_freqs', 'term_co_probs', 'term_g2'):
            db.define_table(
                table,
                Field('first_term_index', 'integer', required=True, default=-1),
                Field('second_term_index', 'integer', required=True, default=-1),
                Field('value', 'double', required=True),
                Field('rank', 'integer', required=True),
                migrate=initialize)
            if not initialize:
                continue
            db.executesql(
                'CREATE UNIQUE INDEX IF NOT EXISTS {0}_indexes ON {0} (first_term_index, second_term_index);'
                .format(table))
            for column in ('value', 'rank'):
                db.executesql(
                    'CREATE INDEX IF NOT EXISTS {0}_{1} ON {0} ({1});'.format(
                        table, column))

    def DefineSentenceCoStatsTables(self):
        """
        Define the sentence-level pair tables: sentences_co_freqs,
        sentences_co_probs and sentences_g2.

        Each table stores (first_term_index, second_term_index, value, rank).
        When self.isInit is true the tables are migrated and three indexes
        are created per table: a unique one on the index pair, one on value
        and one on rank.
        """
        db = self.db
        initialize = self.isInit

        for table in ('sentences_co_freqs', 'sentences_co_probs',
                      'sentences_g2'):
            db.define_table(
                table,
                Field('first_term_index', 'integer', required=True, default=-1),
                Field('second_term_index', 'integer', required=True, default=-1),
                Field('value', 'double', required=True),
                Field('rank', 'integer', required=True),
                migrate=initialize)
            if not initialize:
                continue
            db.executesql(
                'CREATE UNIQUE INDEX IF NOT EXISTS {0}_indexes ON {0} (first_term_index, second_term_index);'
                .format(table))
            for column in ('value', 'rank'):
                db.executesql(
                    'CREATE INDEX IF NOT EXISTS {0}_{1} ON {0} ({1});'.format(
                        table, column))

################################################################################

    def Reset(self):
        """
        Delete every row from the term and sentence statistics tables.

        The tables themselves are kept; nothing is committed here.
        """
        tables = (
            'term_texts',
            'term_freqs',
            'term_probs',
            'term_doc_freqs',
            'term_co_freqs',
            'term_co_probs',
            'term_g2',
            'sentences_co_freqs',
            'sentences_co_probs',
            'sentences_g2',
        )
        for table in tables:
            self.db.executesql('DELETE FROM {};'.format(table))

################################################################################

    def DefineTemporaryTable(self):
        """
        Define the 'vocab' (term_index, term_text) and 'vocab_text'
        (term_text) tables; they are migrated only when self.isInit is true.
        """
        migrate = self.isInit

        vocab_fields = [
            Field('term_index', 'integer', required=True, unique=True, default=-1),
            Field('term_text', 'string', required=True),
        ]
        self.db.define_table('vocab', *vocab_fields, migrate=migrate)

        # NOTE(review): a 'string' field with default=-1 looks copied from
        # the integer term_index definition -- confirm before changing, as
        # it is part of the table definition.
        unique_text = Field('term_text',
                            'string',
                            required=True,
                            unique=True,
                            default=-1)
        self.db.define_table('vocab_text', unique_text, migrate=migrate)
Example #17
0
DEMO_MODE = False
response.google_analytics_id = None

if not request.env.web2py_runtime_gae:
  # Use the first data folder that exists, probing in priority order;
  # None when none of the candidates is a directory.
  request.data_folder = next(
    (folder for folder in (
      os.path.join('..','..','data'),
      os.path.join('..','..','databases'),
      os.path.join('..','data'),
      os.path.join('..','databases'),
    ) if os.path.isdir(folder)),
    None)

  ename="sqlite"
  # Application data and sessions live in two separate SQLite files.
  db = DAL('sqlite://storage.sqlite',
           migrate=False,
           fake_migrate=False,
           folder=request.data_folder)
  session_db = DAL('sqlite://session.sqlite', folder=request.data_folder)
  session.connect(request, response, db = session_db)
  # Python 2 only: reload(sys) makes setdefaultencoding available again.
  reload(sys)
  sys.setdefaultencoding("utf-8")#@UndefinedVariable
else:
  ename="google_datastore"
  #db = DAL('google:datastore://storage', migrate=False, fake_migrate=False)
  db = DAL('google:datastore',
           adapter_args={'ndb_settings':None, 'use_ndb':False})
  # On this branch sessions are stored in the same connection as the data.
  session.connect(request, response, db = db)
  #from gluon.contrib.memdb import MEMDB
  #from google.appengine.api.memcache import Client
  #session.connect(request, response, db = MEMDB(Client()))

# Generic views are enabled for local requests only.
if request.is_local:
  response.generic_patterns = ['*']
else:
  response.generic_patterns = []
try:
Example #18
0
class BOW_DB():
	"""
	Bag-of-words term statistics kept in a SQLite database accessed through DAL.

	Intended for use in a with-statement: __enter__ defines every table and
	__exit__ commits the connection.
	"""
	FILENAME = 'bow.db'
	CONNECTION = 'sqlite://{}'.format(FILENAME)
	DEFAULT_OPTIONS = {
		'max_freq_count' : 4000,          # Maximum number of terms to store
		'max_co_freq_count' : 100000      # Maximum number of term pairs to store
	}
	
	def __init__(self, path = None, isInit = False):
		"""
		path = Folder passed to DAL; omitted from the DAL call when None
		isInit = True enables migration and turns lazy tables off; False does the opposite
		"""
		self.isInit = isInit
		dal_options = { 'lazy_tables' : not isInit, 'migrate_enabled' : isInit }
		if path is not None:
			dal_options['folder'] = path
		self.db = DAL( BOW_DB.CONNECTION, **dal_options )

	def __enter__(self):
		"""Define all tables (options first) and return this object."""
		self.DefineOptionsTable()
		self.DefineTermStatsTables()
		self.DefineTermCoStatsTables()
		self.DefineSentenceCoStatsTables()
		self.DefineTemporaryTable()
		return self

	def __exit__(self, type, value, traceback):
		"""Commit unconditionally; exceptions from the with-block are not suppressed."""
		self.db.commit()
	
################################################################################
	
	def DefineOptionsTable(self):
		"""
		Define the key/value 'options' table and store DEFAULT_OPTIONS in it.
		Existing values are overwritten only when self.isInit is true.
		"""
		key_field = Field( 'key', 'string', required = True, unique = True )
		value_field = Field( 'value', 'string', required = True )
		self.db.define_table( 'options', key_field, value_field, migrate = self.isInit )
		for key, value in BOW_DB.DEFAULT_OPTIONS.iteritems():
			self.SetOption( key, value, overwrite = self.isInit )


	def SetOption(self, key, value, overwrite = True):
		"""
		Insert the option, or update it when it already exists and overwrite
		is true. Always commits afterwards.
		"""
		matching = self.db( self.db.options.key == key )
		if matching.count() > 0:
			if overwrite:
				matching.update( value = value )
		else:
			self.db.options.insert( key = key, value = value )
		self.db.commit()

	def GetOption(self, key):
		"""Return the stored value for key, or None when there is no such option."""
		found = self.db( self.db.options.key == key ).select( self.db.options.value ).first()
		return found.value if found else None

################################################################################

	def DefineTermStatsTables(self):
		"""
		Define term_texts, term_freqs, term_probs and term_doc_freqs.
		Migration and index creation happen only when self.isInit is true.
		"""
		db = self.db
		initialize = self.isInit

		db.define_table( 'term_texts',
			Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'term_text', 'string', required = True, unique = True ),
			migrate = initialize
		)
		if initialize:
			db.executesql( 'CREATE INDEX IF NOT EXISTS term_text_value ON term_texts (term_text);' )

		# The statistics tables share one layout: unique term index, value, rank.
		for table in ( 'term_freqs', 'term_probs', 'term_doc_freqs' ):
			db.define_table( table,
				Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
				Field( 'value', 'double', required = True ),
				Field( 'rank', 'integer', required = True ),
				migrate = initialize
			)
			if not initialize:
				continue
			for column in ( 'value', 'rank' ):
				db.executesql( 'CREATE INDEX IF NOT EXISTS {0}_{1} ON {0} ({1});'.format( table, column ) )

	def _DefinePairTables(self, tables):
		"""
		Define each named table as (first_term_index, second_term_index, value, rank).
		When self.isInit is true, also create a unique index on the index pair
		and plain indexes on value and rank.
		"""
		db = self.db
		initialize = self.isInit
		for table in tables:
			db.define_table( table,
				Field( 'first_term_index', 'integer', required = True, default = -1 ),
				Field( 'second_term_index', 'integer', required = True, default = -1 ),
				Field( 'value', 'double', required = True ),
				Field( 'rank', 'integer', required = True ),
				migrate = initialize
			)
			if not initialize:
				continue
			db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS {0}_indexes ON {0} (first_term_index, second_term_index);'.format( table ) )
			for column in ( 'value', 'rank' ):
				db.executesql( 'CREATE INDEX IF NOT EXISTS {0}_{1} ON {0} ({1});'.format( table, column ) )

	def DefineTermCoStatsTables(self):
		"""Define the term-pair tables term_co_freqs, term_co_probs and term_g2."""
		self._DefinePairTables( ( 'term_co_freqs', 'term_co_probs', 'term_g2' ) )

	def DefineSentenceCoStatsTables(self):
		"""Define the sentence-level pair tables sentences_co_freqs, sentences_co_probs and sentences_g2."""
		self._DefinePairTables( ( 'sentences_co_freqs', 'sentences_co_probs', 'sentences_g2' ) )

################################################################################

	def Reset(self):
		"""Delete every row from the term and sentence statistics tables (no commit)."""
		tables = (
			'term_texts',
			'term_freqs',
			'term_probs',
			'term_doc_freqs',
			'term_co_freqs',
			'term_co_probs',
			'term_g2',
			'sentences_co_freqs',
			'sentences_co_probs',
			'sentences_g2',
		)
		for table in tables:
			self.db.executesql( 'DELETE FROM {};'.format( table ) )

################################################################################

	def DefineTemporaryTable(self):
		"""Define the 'vocab' (term_index, term_text) and 'vocab_text' (term_text) tables."""
		migrate = self.isInit
		vocab_fields = [
			Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'term_text', 'string', required = True ),
		]
		self.db.define_table( 'vocab', *vocab_fields, migrate = migrate )
		# NOTE(review): default = -1 on a 'string' field looks copied from
		# term_index -- confirm before changing the table definition.
		unique_text = Field( 'term_text', 'string', required = True, unique = True, default = -1 )
		self.db.define_table( 'vocab_text', unique_text, migrate = migrate )
Example #19
0
# Placeholders only: initVars() below rebinds cache and T (both the module
# names and the attributes on `current`) to real objects.
current.cache = cache = None
current.T = T = None

def initVars():
	"""
	Create fresh request, response, session, cache and T objects and publish
	each one both as a module-level name and as an attribute of `current`.
	"""
	global request, response, session, cache, T

	request = Request()
	current.request = request

	response = Response()
	current.response = response

	session = Session()
	current.session = session

	# The cache is built from the request created just above.
	cache = Cache(request)
	current.cache = cache

	T = m__T__
	current.T = T

# Populate the module-level request/response/session/cache/T objects.
initVars()

# deleteDB() is defined outside this chunk; presumably it removes the
# database left over from a previous run -- TODO confirm.
deleteDB()

# Open a SQLite database at DB_PATH.
db = DAL('sqlite://'+DB_PATH)




import gluon.tools as gt
from mock import Mock

# Replace gluon.tools.URL with a Mock whose calls are forwarded to m__URL__.
# This is done before Crud is instantiated on the next statement.
gt.URL=Mock(side_effect=m__URL__)

crud = gt.Crud(db)


# # Some global web2py imports

# # Already done
Example #20
0
Copyright © 2011-2015, Csaba Kappel
License: LGPLv3
http://www.nervatura.com/nerva2py/default/licenses
"""

# Never executed (the condition is the constant 0). The block only spells out
# the names this file expects to exist (response, request, session, db, T,
# DEMO_MODE) -- presumably so IDEs and static analysers can resolve them;
# at runtime they have to come from somewhere else.
if 0:
    global response
    response = globals.Response()
    global request
    request = globals.Request()
    from gluon.globals import Session
    global session
    session = Session()
    from gluon.sql import DAL
    global db
    db = DAL()
    import gluon.languages.translator as T
    from db import DEMO_MODE

from gluon.http import redirect
from gluon.sqlhtml import DIV, SPAN, A, INPUT, FORM
from gluon.html import BR, HR, SELECT, OPTION, P, IMG, TABLE, TR, TD
from gluon.validators import IS_NOT_EMPTY
from gluon.html import URL
from gluon.storage import Storage
import os

from gluon.sqlhtml import SQLFORM
from gluon.validators import IS_IN_DB, IS_IN_SET, IS_EMPTY_OR
from gluon.html import TBODY, THEAD, TH, TEXTAREA, CODE
from gluon.sql import Field
Example #21
0
"""
This file is part of the Nervatura Framework
http://www.nervatura.com
Copyright © 2011-2015, Csaba Kappel
License: LGPLv3
http://www.nervatura.com/nerva2py/default/licenses
"""

# Never executed (the condition is the constant 0). The block only spells out
# the names this file expects to exist (response, request, session, db, T,
# DEMO_MODE) -- presumably so IDEs and static analysers can resolve them;
# at runtime they have to come from somewhere else.
if 0:
  global response; response = globals.Response()
  global request; request = globals.Request()
  from gluon.globals import Session
  global session; session = Session()
  from gluon.sql import DAL
  global db; db = DAL()
  import gluon.languages.translator as T
  from db import DEMO_MODE

from gluon.http import redirect
from gluon.sqlhtml import DIV, SPAN, A, INPUT, FORM
from gluon.html import BR, HR, SELECT, OPTION, P, IMG, TABLE, TR, TD
from gluon.validators import IS_NOT_EMPTY
from gluon.html import URL
from gluon.storage import Storage 
import os

from gluon.sqlhtml import SQLFORM
from gluon.validators import IS_IN_DB, IS_IN_SET, IS_EMPTY_OR
from gluon.html import TBODY, THEAD, TH, TEXTAREA, CODE
from gluon.sql import Field
Example #22
0
class LDA_DB:
    """DAL-backed SQLite store ("lda.db") for the output of an LDA topic model.

    Meant to be used as a context manager: entering defines every table (and,
    when ``isReset`` is set, empties the data tables); leaving commits.
    """

    FILENAME = "lda.db"
    CONNECTION = "sqlite://{}".format(FILENAME)
    # Key/value pairs inserted into the "options" table when that table is empty.
    DEFAULT_OPTIONS = {"max_co_topic_count": 40000}

    def __init__(self, path=None, isInit=False, isReset=False):
        """Open the database connection.

        path    -- passed to DAL as ``folder``; left out entirely when None.
        isInit  -- when true, tables are defined eagerly with migration enabled
                   and the indexes are created; otherwise tables are lazy and
                   migration is off.
        isReset -- when true, ``__enter__`` deletes all rows of the data tables.
        """
        self.isInit = isInit
        self.isReset = isReset

        dal_options = {"lazy_tables": not self.isInit, "migrate": self.isInit}
        if path is not None:
            dal_options["folder"] = path
        self.db = DAL(LDA_DB.CONNECTION, **dal_options)

    def __enter__(self):
        """Define all tables (resetting the data first if requested) and return self."""
        self.DefineOptionsTable()
        if self.isReset:
            # NOTE(review): Reset() issues raw DELETE statements before the data
            # tables are defined below, so it presumably expects them to exist
            # in the database file already -- confirm against callers.
            self.Reset()
        self.DefineDimensionTables()
        self.DefineMatrixTables()
        self.DefineStatsTables()
        return self

    def __exit__(self, type, value, traceback):
        """Commit the transaction.

        The exception arguments are ignored, so the commit also happens when
        the ``with`` body raised; nothing is returned, so the exception
        propagates.
        """
        self.db.commit()

    def _CreateIndexes(self, statements):
        """Execute the given CREATE INDEX statements, but only when initialising."""
        if self.isInit:
            for statement in statements:
                self.db.executesql(statement)

    ################################################################################

    def DefineOptionsTable(self):
        """Define the "options" key/value table and seed it with the defaults if empty."""
        self.db.define_table(
            "options",
            Field("key", "string", required=True, unique=True),
            Field("value", "string", required=True),
            migrate=self.isInit,
        )
        if self.db(self.db.options).count() == 0:
            # items() rather than the Python-2-only iteritems(): works on both
            # Python 2 and Python 3.
            for key, value in LDA_DB.DEFAULT_OPTIONS.items():
                self.db.options.insert(key=key, value=value)

    def SetOption(self, key, value):
        """Store ``value`` under ``key``, updating the existing row if there is one."""
        keyValue = self.db(self.db.options.key == key).select().first()
        if keyValue:
            keyValue.update_record(value=value)
        else:
            self.db.options.insert(key=key, value=value)

    def GetOption(self, key):
        """Return the value stored under ``key``, or None when no such row exists."""
        keyValue = self.db(self.db.options.key == key).select(self.db.options.value).first()
        return keyValue.value if keyValue else None

    ################################################################################

    def DefineDimensionTables(self):
        """Define the "terms", "docs" and "topics" tables and, when initialising, their indexes."""
        self.db.define_table(
            "terms",
            Field("term_index", "integer", required=True, unique=True, default=-1),
            Field("term_text", "string", required=True, unique=True),
            Field("term_freq", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes(
            [
                "CREATE UNIQUE INDEX IF NOT EXISTS terms_index ON terms (term_index);",
                "CREATE UNIQUE INDEX IF NOT EXISTS terms_text  ON terms (term_text);",
                "CREATE        INDEX IF NOT EXISTS terms_freq  ON terms (term_freq);",
                "CREATE        INDEX IF NOT EXISTS terms_rank  ON terms (rank);",
            ]
        )

        self.db.define_table(
            "docs",
            Field("doc_index", "integer", required=True, unique=True, default=-1),
            Field("doc_id", "string", required=True, unique=True),
            Field("doc_freq", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes(
            [
                "CREATE UNIQUE INDEX IF NOT EXISTS docs_index ON docs (doc_index);",
                "CREATE UNIQUE INDEX IF NOT EXISTS docs_id    ON docs (doc_id);",
                "CREATE        INDEX IF NOT EXISTS docs_freq  ON docs (doc_freq);",
                "CREATE        INDEX IF NOT EXISTS docs_rank  ON docs (rank);",
            ]
        )

        self.db.define_table(
            "topics",
            Field("topic_index", "integer", required=True, unique=True, default=-1),
            Field("topic_freq", "double", required=True),
            Field("topic_label", "string", required=True),
            Field("topic_desc", "string", required=True),
            Field("top_terms", "list:integer", required=True),
            Field("top_docs", "list:integer", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes(
            [
                "CREATE UNIQUE INDEX IF NOT EXISTS topics_index ON topics (topic_index);",
                "CREATE        INDEX IF NOT EXISTS topics_freq  ON topics (topic_freq);",
                "CREATE        INDEX IF NOT EXISTS topics_rank  ON topics (rank);",
            ]
        )

    def DefineMatrixTables(self):
        """Define "term_topic_matrix" and "doc_topic_matrix" and, when initialising, their indexes."""
        self.db.define_table(
            "term_topic_matrix",
            Field("term_index", "integer", required=True, default=-1),
            Field("topic_index", "integer", required=True, default=-1),
            Field("value", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes(
            [
                "CREATE UNIQUE INDEX IF NOT EXISTS term_topic_indexes ON term_topic_matrix (term_index, topic_index);",
                "CREATE INDEX IF NOT EXISTS term_topic_value      ON term_topic_matrix (value);",
                "CREATE INDEX IF NOT EXISTS term_topic_rank       ON term_topic_matrix (rank);",
                "CREATE INDEX IF NOT EXISTS term_topic_termindex  ON term_topic_matrix (term_index);",
                "CREATE INDEX IF NOT EXISTS term_topic_topicindex ON term_topic_matrix (topic_index);",
            ]
        )

        self.db.define_table(
            "doc_topic_matrix",
            Field("doc_index", "integer", required=True, default=-1),
            Field("topic_index", "integer", required=True, default=-1),
            Field("value", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes(
            [
                "CREATE UNIQUE INDEX IF NOT EXISTS doc_topic_indexes ON doc_topic_matrix (doc_index, topic_index);",
                "CREATE INDEX IF NOT EXISTS doc_topic_value      ON doc_topic_matrix (value);",
                "CREATE INDEX IF NOT EXISTS doc_topic_rank       ON doc_topic_matrix (rank);",
                "CREATE INDEX IF NOT EXISTS doc_topic_docindex   ON doc_topic_matrix (doc_index);",
                "CREATE INDEX IF NOT EXISTS doc_topic_topicindex ON doc_topic_matrix (topic_index);",
            ]
        )

    def DefineStatsTables(self):
        """Define "topic_cooccurrences" and "topic_covariance" and, when initialising, their indexes."""
        self.db.define_table(
            "topic_cooccurrences",
            Field("first_topic_index", "integer", required=True, default=-1),
            Field("second_topic_index", "integer", required=True, default=-1),
            Field("value", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes(
            [
                "CREATE UNIQUE INDEX IF NOT EXISTS topic_cooccurrences_indexes ON topic_cooccurrences (first_topic_index, second_topic_index);",
                "CREATE INDEX IF NOT EXISTS topic_cooccurrences_value ON topic_cooccurrences (value);",
                "CREATE INDEX IF NOT EXISTS topic_cooccurrences_rank ON topic_cooccurrences (rank);",
            ]
        )

        self.db.define_table(
            "topic_covariance",
            Field("first_topic_index", "integer", required=True, default=-1),
            Field("second_topic_index", "integer", required=True, default=-1),
            Field("value", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes(
            [
                "CREATE UNIQUE INDEX IF NOT EXISTS topic_covariance_indexes ON topic_covariance (first_topic_index, second_topic_index);",
                "CREATE INDEX IF NOT EXISTS topic_covariance_value ON topic_covariance (value);",
                "CREATE INDEX IF NOT EXISTS topic_covariance_rank ON topic_covariance (rank);",
            ]
        )

    ################################################################################

    def Reset(self):
        """Delete every row from the data tables; the "options" table is left untouched."""
        for table_name in (
            "terms",
            "docs",
            "topics",
            "term_topic_matrix",
            "doc_topic_matrix",
            "topic_cooccurrences",
            "topic_covariance",
        ):
            self.db.executesql("DELETE FROM {};".format(table_name))
Example #23
0
# No Google Analytics id is configured.
response.google_analytics_id = None

if request.env.web2py_runtime_gae:
    # Running on Google App Engine: the datastore holds both data and sessions.
    ename = "google_datastore"
    # Alternative connection kept for reference:
    #   db = DAL('google:datastore://storage', migrate=False, fake_migrate=False)
    db = DAL('google:datastore',
             adapter_args=dict(ndb_settings=None, use_ndb=False))
    session.connect(request, response, db=db)
    #from gluon.contrib.memdb import MEMDB
else:
    # Use the first data directory that exists, tried in this order; None when
    # none of them is present.
    request.data_folder = next(
        (folder
         for folder in (os.path.join('..', '..', 'data'),
                        os.path.join('..', '..', 'databases'),
                        os.path.join('..', 'data'),
                        os.path.join('..', 'databases'))
         if os.path.isdir(folder)),
        None
    )

    ename = "sqlite"
    # Application data: migrations are switched off for this connection.
    db = DAL('sqlite://storage.sqlite',
             folder=request.data_folder,
             migrate=False,
             fake_migrate=False)
    # Sessions live in their own SQLite file in the same folder.
    session_db = DAL('sqlite://session.sqlite', folder=request.data_folder)
    session.connect(request, response, db=session_db)
    # Force the interpreter-wide default string encoding to UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")  #@UndefinedVariable
Example #24
0
class LDA_DB():
	"""
	DAL-backed SQLite store ('lda.db') for the output of an LDA topic model.

	Meant to be used as a context manager: entering defines every table (and,
	when isReset is set, empties the data tables); leaving commits.
	"""
	FILENAME = 'lda.db'
	CONNECTION = 'sqlite://{}'.format(FILENAME)
	# Key/value pairs inserted into the 'options' table when that table is empty.
	DEFAULT_OPTIONS = {
		'max_co_topic_count' : 40000
	}

	def __init__(self, path = None, isInit = False, isReset = False):
		"""
		path    = Passed to DAL as 'folder'; left out entirely when None
		isInit  = When true, tables are defined eagerly with migration enabled and indexes are created
		isReset = When true, __enter__ deletes all rows of the data tables
		"""
		self.isInit = isInit
		self.isReset = isReset

		dal_options = { 'lazy_tables' : not self.isInit, 'migrate' : self.isInit }
		if path is not None:
			dal_options['folder'] = path
		self.db = DAL(LDA_DB.CONNECTION, **dal_options)

	def __enter__(self):
		"""
		Define all tables (resetting the data first if requested) and return self.
		"""
		self.DefineOptionsTable()
		if self.isReset:
			# NOTE(review): Reset() issues raw DELETE statements before the data
			# tables are defined below, so it presumably expects them to exist in
			# the database file already -- confirm against callers.
			self.Reset()
		self.DefineDimensionTables()
		self.DefineMatrixTables()
		self.DefineStatsTables()
		return self

	def __exit__(self, type, value, traceback):
		"""
		Commit the transaction. The exception arguments are ignored, so the commit
		also happens when the with-body raised; nothing is returned, so the
		exception propagates.
		"""
		self.db.commit()

	def _CreateIndexes(self, statements):
		"""
		Execute the given CREATE INDEX statements, but only when initialising.
		"""
		if self.isInit:
			for statement in statements:
				self.db.executesql( statement )

	################################################################################

	def DefineOptionsTable(self):
		"""
		Define the 'options' key/value table and seed it with the defaults if empty.
		"""
		self.db.define_table( 'options',
			Field( 'key'  , 'string', required = True, unique = True ),
			Field( 'value', 'string', required = True ),
			migrate = self.isInit
		)
		if self.db(self.db.options).count() == 0:
			# items() rather than the Python-2-only iteritems(): works on both
			# Python 2 and Python 3.
			for key, value in LDA_DB.DEFAULT_OPTIONS.items():
				self.db.options.insert( key = key, value = value )

	def SetOption(self, key, value):
		"""
		Store value under key, updating the existing row if there is one.
		"""
		keyValue = self.db( self.db.options.key == key ).select().first()
		if keyValue:
			keyValue.update_record( value = value )
		else:
			self.db.options.insert( key = key, value = value )

	def GetOption(self, key):
		"""
		Return the value stored under key, or None when no such row exists.
		"""
		keyValue = self.db( self.db.options.key == key ).select( self.db.options.value ).first()
		return keyValue.value if keyValue else None

	################################################################################

	def DefineDimensionTables(self):
		"""
		Define the 'terms', 'docs' and 'topics' tables and, when initialising, their indexes.
		"""
		self.db.define_table( 'terms',
			Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'term_text' , 'string' , required = True, unique = True ),
			Field( 'term_freq' , 'double' , required = True ),
			Field( 'rank'      , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes([
			'CREATE UNIQUE INDEX IF NOT EXISTS terms_index ON terms (term_index);',
			'CREATE UNIQUE INDEX IF NOT EXISTS terms_text  ON terms (term_text);',
			'CREATE        INDEX IF NOT EXISTS terms_freq  ON terms (term_freq);',
			'CREATE        INDEX IF NOT EXISTS terms_rank  ON terms (rank);',
		])

		self.db.define_table( 'docs',
			Field( 'doc_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'doc_id'   , 'string' , required = True, unique = True ),
			Field( 'doc_freq' , 'double' , required = True ),
			Field( 'rank'     , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes([
			'CREATE UNIQUE INDEX IF NOT EXISTS docs_index ON docs (doc_index);',
			'CREATE UNIQUE INDEX IF NOT EXISTS docs_id    ON docs (doc_id);',
			'CREATE        INDEX IF NOT EXISTS docs_freq  ON docs (doc_freq);',
			'CREATE        INDEX IF NOT EXISTS docs_rank  ON docs (rank);',
		])

		self.db.define_table( 'topics',
			Field( 'topic_index', 'integer'     , required = True, unique = True, default = -1 ),
			Field( 'topic_freq' , 'double'      , required = True ),
			Field( 'topic_label', 'string'      , required = True ),
			Field( 'topic_desc' , 'string'      , required = True ),
			Field( 'top_terms'  , 'list:integer', required = True ),
			Field( 'top_docs'   , 'list:integer', required = True ),
			Field( 'rank'       , 'integer'     , required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes([
			'CREATE UNIQUE INDEX IF NOT EXISTS topics_index ON topics (topic_index);',
			'CREATE        INDEX IF NOT EXISTS topics_freq  ON topics (topic_freq);',
			'CREATE        INDEX IF NOT EXISTS topics_rank  ON topics (rank);',
		])

	def DefineMatrixTables(self):
		"""
		Define 'term_topic_matrix' and 'doc_topic_matrix' and, when initialising, their indexes.
		"""
		self.db.define_table( 'term_topic_matrix',
			Field( 'term_index' , 'integer', required = True, default = -1 ),
			Field( 'topic_index', 'integer', required = True, default = -1 ),
			Field( 'value', 'double' , required = True ),
			Field( 'rank' , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes([
			'CREATE UNIQUE INDEX IF NOT EXISTS term_topic_indexes ON term_topic_matrix (term_index, topic_index);',
			'CREATE INDEX IF NOT EXISTS term_topic_value      ON term_topic_matrix (value);',
			'CREATE INDEX IF NOT EXISTS term_topic_rank       ON term_topic_matrix (rank);',
			'CREATE INDEX IF NOT EXISTS term_topic_termindex  ON term_topic_matrix (term_index);',
			'CREATE INDEX IF NOT EXISTS term_topic_topicindex ON term_topic_matrix (topic_index);',
		])

		self.db.define_table( 'doc_topic_matrix',
			Field( 'doc_index'  , 'integer', required = True, default = -1 ),
			Field( 'topic_index', 'integer', required = True, default = -1 ),
			Field( 'value', 'double' , required = True ),
			Field( 'rank' , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes([
			'CREATE UNIQUE INDEX IF NOT EXISTS doc_topic_indexes ON doc_topic_matrix (doc_index, topic_index);',
			'CREATE INDEX IF NOT EXISTS doc_topic_value      ON doc_topic_matrix (value);',
			'CREATE INDEX IF NOT EXISTS doc_topic_rank       ON doc_topic_matrix (rank);',
			'CREATE INDEX IF NOT EXISTS doc_topic_docindex   ON doc_topic_matrix (doc_index);',
			'CREATE INDEX IF NOT EXISTS doc_topic_topicindex ON doc_topic_matrix (topic_index);',
		])

	def DefineStatsTables(self):
		"""
		Define 'topic_cooccurrences' and 'topic_covariance' and, when initialising, their indexes.
		"""
		self.db.define_table( 'topic_cooccurrences',
			Field( 'first_topic_index' , 'integer', required = True, default = -1 ),
			Field( 'second_topic_index', 'integer', required = True, default = -1 ),
			Field( 'value', 'double' , required = True ),
			Field( 'rank' , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes([
			'CREATE UNIQUE INDEX IF NOT EXISTS topic_cooccurrences_indexes ON topic_cooccurrences (first_topic_index, second_topic_index);',
			'CREATE INDEX IF NOT EXISTS topic_cooccurrences_value ON topic_cooccurrences (value);',
			'CREATE INDEX IF NOT EXISTS topic_cooccurrences_rank ON topic_cooccurrences (rank);',
		])

		self.db.define_table( 'topic_covariance',
			Field( 'first_topic_index', 'integer', required = True, default = -1 ),
			Field( 'second_topic_index', 'integer', required = True, default = -1 ),
			Field( 'value', 'double' , required = True ),
			Field( 'rank' , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes([
			'CREATE UNIQUE INDEX IF NOT EXISTS topic_covariance_indexes ON topic_covariance (first_topic_index, second_topic_index);',
			'CREATE INDEX IF NOT EXISTS topic_covariance_value ON topic_covariance (value);',
			'CREATE INDEX IF NOT EXISTS topic_covariance_rank ON topic_covariance (rank);',
		])

	################################################################################

	def Reset(self):
		"""
		Delete every row from the data tables; the 'options' table is left untouched.
		"""
		for table_name in (
			'terms',
			'docs',
			'topics',
			'term_topic_matrix',
			'doc_topic_matrix',
			'topic_cooccurrences',
			'topic_covariance',
		):
			self.db.executesql( 'DELETE FROM {};'.format(table_name) )
Example #25
0
class Corpus_DB():
	"""
	DAL-backed SQLite store ('corpus.db') for a text corpus and its per-document metadata.

	Meant to be used as a context manager: entering defines the tables; leaving
	rebuilds the full-text search table (when initialising) and commits.

	NOTE(review): the import/export helpers call .decode() on data read from
	files opened in text mode and write encoded byte strings back, i.e. they
	rely on Python 2 string semantics.
	"""
	FILENAME = 'corpus.db'
	CONNECTION = 'sqlite://{}'.format(FILENAME)
	# Header names (compared case-insensitively) recognised by default as the
	# document-ID and document-content columns of a spreadsheet.
	DOC_IDS = ['doc_id', 'docid']
	DOC_CONTENTS = ['doc_content', 'doccontent', 'docbody', 'doc_body']
	# Entry describing the corpus itself; GetModel()/GetModels() always report it.
	MODEL_KEY = 'corpus'
	MODEL_DESC = 'Text Corpus'
	MODEL_ENTRY = { 'model_key' : MODEL_KEY, 'model_desc' : MODEL_DESC }
	# Characters that SanitizeText() replaces with a single space.
	LINEBREAKS_TABS = re.compile(r'[\t\r\n\f]')

	DEFAULT_OPTIONS = {
		'token_regex' : r'\w{3,}',        # Tokenize a corpus into a bag-of-words language model
		'min_freq' : 5,                   # Number of times a term must appear in the corpus
		'min_doc_freq' : 3                # Number of documents in which a terms must appear
	}

	def __init__(self, path = None, isInit = False):
		"""
		path   = Passed to DAL as 'folder'; left out entirely when None
		isInit = When true, tables are defined eagerly with migration enabled and indexes are created
		"""
		self.isInit = isInit
		dal_options = { 'lazy_tables' : not self.isInit, 'migrate_enabled' : self.isInit }
		if path is not None:
			dal_options['folder'] = path
		self.db = DAL(Corpus_DB.CONNECTION, **dal_options)

	def __enter__(self):
		"""
		Define all tables and return self.
		"""
		self.DefineOptionsTable()
		self.DefineModelsTable()
		self.DefineCorpusTable()
		self.DefineMetadataTables()
		return self

	def __exit__(self, type, value, traceback):
		"""
		Rebuild the text-search table (when initialising) and commit. The exception
		arguments are ignored and nothing is returned, so an exception raised in
		the with-body still propagates.
		"""
		self.DefineCorpusTextSearch()
		self.db.commit()

	################################################################################

	def DefineOptionsTable(self):
		"""
		Define the 'options' key/value table and store the default options;
		existing values are overwritten only when initialising.
		"""
		self.db.define_table( 'options',
			Field( 'key'  , 'string', required = True, unique = True ),
			Field( 'value', 'string', required = True ),
			migrate = self.isInit
		)
		# items() rather than the Python-2-only iteritems(): works on both
		# Python 2 and Python 3.
		for key, value in Corpus_DB.DEFAULT_OPTIONS.items():
			self.SetOption( key, value, overwrite = self.isInit )

	def SetOption(self, key, value, overwrite = True):
		"""
		Insert key/value, or update an existing key when overwrite is true; commits.
		"""
		where = self.db.options.key == key
		if self.db( where ).count() > 0:
			if overwrite:
				self.db( where ).update( value = value )
		else:
			self.db.options.insert( key = key, value = value )
		self.db.commit()

	def GetOption(self, key):
		"""
		Return the value stored under key, or None when no such row exists.
		"""
		where = self.db.options.key == key
		keyValue = self.db( where ).select( self.db.options.value ).first()
		if keyValue:
			return keyValue.value
		else:
			return None

	def DefineModelsTable(self):
		"""
		Define the 'models' table (model_key, model_desc).
		"""
		self.db.define_table( 'models',
			Field( 'model_key' , 'string', required = True, unique = True ),
			Field( 'model_desc', 'string', required = True ),
			migrate = self.isInit
		)

	def AddModel(self, model_key, model_desc):
		"""
		Insert a model entry, or update its description if the key exists; commits.
		"""
		where = self.db.models.model_key == model_key
		if self.db( where ).count() > 0:
			self.db( where ).update( model_desc = model_desc )
		else:
			self.db.models.insert( model_key = model_key, model_desc = model_desc )
		self.db.commit()

	def GetModel(self, model_key):
		"""
		Return { 'model_key', 'model_desc' } for the given key, or None if unknown.
		The built-in corpus key is answered from MODEL_ENTRY without a query.
		"""
		if model_key == Corpus_DB.MODEL_KEY:
			return Corpus_DB.MODEL_ENTRY
		where = self.db.models.model_key == model_key
		keyValue = self.db( where ).select( self.db.models.ALL ).first()
		if keyValue:
			return { 'model_key' : keyValue.model_key, 'model_desc' : keyValue.model_desc }
		else:
			return None

	def GetModels(self):
		"""
		Return the built-in corpus entry followed by every row of the 'models' table.
		"""
		rows = self.db( self.db.models ).select( self.db.models.model_key, self.db.models.model_desc ).as_list()
		return [ Corpus_DB.MODEL_ENTRY ] + rows

	################################################################################

	def DefineCorpusTable(self):
		"""
		Define the 'corpus' table and, when initialising, its unique indexes.
		"""
		self.db.define_table( 'corpus',
			Field( 'doc_index'  , 'integer', required = True, unique = True, default = -1 ),
			Field( 'doc_id'     , 'string' , required = True, unique = True ),
			Field( 'doc_content', 'text'   , required = True ),
			migrate = self.isInit
		)
		if self.isInit:
			self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_index ON corpus (doc_index);' )
			self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_id    ON corpus (doc_id);' )

	def DefineMetadataTables(self):
		"""
		Define the 'fields' and 'metadata' tables and, when initialising, their indexes.
		"""
		self.db.define_table( 'fields',
			Field( 'field_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'field_name' , 'string' , required = True, unique = True ),
			Field( 'field_type' , 'string' , required = True ),
			migrate = self.isInit
		)
		if self.isInit:
			self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS field_index ON fields (field_index);' )
			self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS field_name  ON fields (field_name);'  )
			self.db.executesql( 'CREATE        INDEX IF NOT EXISTS field_type  ON fields (field_type);'  )

		self.db.define_table( 'metadata',
			Field( 'doc_index'  , 'integer', required = True, default = -1 ),
			Field( 'field_index', 'integer', required = True, default = -1 ),
			Field( 'value'      , 'text'   , required = True ),
			migrate = self.isInit
		)
		if self.isInit:
			self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS metadata_indexes     ON metadata (doc_index, field_index);' )
			self.db.executesql( 'CREATE        INDEX IF NOT EXISTS metadata_doc_index   ON metadata (doc_index);'              )
			self.db.executesql( 'CREATE        INDEX IF NOT EXISTS metadata_field_index ON metadata (field_index);'            )

	################################################################################

	def DefineCorpusTextSearch(self):
		"""
		When initialising, drop and recreate the fts3 virtual table 'corpus_search'
		and fill it from 'corpus' (rowid = doc_index).
		"""
		if self.isInit:
			self.db.executesql( 'DROP TABLE IF EXISTS corpus_search;' )
			self.db.executesql( 'CREATE VIRTUAL TABLE corpus_search USING fts3 (doc_content TEXT);' )
			self.db.executesql( 'INSERT INTO corpus_search (rowid, doc_content) SELECT doc_index, doc_content FROM corpus;' )

	################################################################################

	def SanitizeText(self, text):
		"""
		Replace every tab, carriage return, newline and form feed with a space and strip the ends.
		"""
		text = Corpus_DB.LINEBREAKS_TABS.sub(u' ', text).strip()
		return text

	def ImportFromFile(self, filename):
		"""
		filename = A plain-text file (utf-8 encoded) containing one document per line

		A line without a tab is taken as the document content and gets the ID
		'doc<N>' (1-based); otherwise the first two tab-separated values are the
		document ID and the content.
		"""
		def ReadFile():
			with open(filename, 'r') as f:
				for index, line in enumerate(f):
					doc_index = index
					values = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
					if len(values) == 1:
						doc_id = 'doc{}'.format(doc_index+1)
						doc_content = values[0]
					else:
						doc_id = values[0]
						doc_content = values[1]
					yield {
						'doc_index' : doc_index,
						'doc_id' : doc_id,
						'doc_content' : doc_content.encode('utf-8', 'ignore')
					}
		self.db.corpus.bulk_insert(ReadFile())

	def ImportFromFolder(self, glob_pattern):
		"""
		glob_pattern = A folder of files or a glob pattern for list of files (utf-8 encoded, one document per file)

		Files are processed in sorted filename order; the filename is the document ID.
		"""
		def ReadFolder():
			filenames = sorted(glob.glob(glob_pattern))
			for index, filename in enumerate(filenames):
				doc_index = index
				doc_id = filename
				with open(filename, 'r') as f:
					doc_content = f.read().decode('utf-8', 'ignore')
					yield {
						'doc_index' : doc_index,
						'doc_id' : doc_id,
						'doc_content' : doc_content.encode('utf-8', 'ignore')
					}
		self.db.corpus.bulk_insert(ReadFolder())

	def ImportFromSpreadsheet(self, filename, id_key = None, content_key = None, is_csv = False):
		"""
		filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with a header row containing column names)
		id_key = Name of the column containing unique document IDs
		content_key = Name of the column containing the document contents

		Column names are matched case-insensitively. Every other column becomes a
		metadata field whose type is inferred as 'integer', 'double' or 'string'.
		"""
		# Header names are lower-cased before the lookup below, so the configured
		# keys must be lower-cased as well; otherwise a key such as 'ID' could
		# never match any column.
		doc_id_keys = frozenset(key.lower() for key in ([id_key] if id_key is not None else Corpus_DB.DOC_IDS))
		doc_content_keys = frozenset(key.lower() for key in ([content_key] if content_key is not None else Corpus_DB.DOC_CONTENTS))
		field_indexes = []
		field_names = []
		field_types = []
		metadata = []

		def ReadCSV():
			with open(filename, 'r') as f:
				reader = UnicodeReader(f)
				for row in reader:
					yield row

		def ReadTSV():
			with open(filename, 'r') as f:
				for line in f:
					yield line.decode('utf-8', 'ignore').rstrip('\n').split('\t')

		def ReadSpreadsheet(reader):
			field_doc_id = None
			field_doc_content = None
			for row_index, values in enumerate(reader):
				if row_index == 0:
					# Header row: locate the ID/content columns, register the rest as fields.
					for index, field in enumerate(values):
						if field.lower() in doc_id_keys:
							field_doc_id = index
						elif field.lower() in doc_content_keys:
							field_doc_content = index
						else:
							field_index = len(field_indexes)
							field_indexes.append(field_index)
							field_names.append(field)
							field_types.append('integer')
				else:
					doc_index = row_index - 1
					doc_id = 'doc{:d}'.format(doc_index+1)
					doc_content = ''
					field_index = 0
					for index, value in enumerate(values):
						if field_doc_id == index:
							doc_id = value
						elif field_doc_content == index:
							doc_content = value
						else:
							metadata.append({
								'doc_index' : doc_index,
								'field_index' : field_index,
								'value' : value.encode('utf-8', 'ignore')
							})

							# [START] infer field type
							# A field starts as 'integer' and is only ever widened:
							# integer -> double -> string.
							field_type = field_types[field_index]
							if field_type == 'integer':
								try:
									int(value)
								except ValueError:
									field_type = 'double'
							if field_type == 'double':
								try:
									float(value)
								except ValueError:
									field_type = 'string'
							field_types[field_index] = field_type
							# [END] infer field type

							field_index += 1
					yield {
						'doc_index' : doc_index,
						'doc_id' : doc_id,
						'doc_content' : doc_content.encode('utf-8', 'ignore')
					}

		def GetFields():
			for field_index in field_indexes:
				field_name = field_names[field_index]
				field_type = field_types[field_index]
				yield {
					'field_index' : field_index,
					'field_name' : field_name,
					'field_type' : field_type
				}

		reader = ReadCSV() if is_csv else ReadTSV()
		# The corpus insert must run first: consuming ReadSpreadsheet() is what
		# fills field_indexes / field_names / field_types / metadata.
		self.db.corpus.bulk_insert(ReadSpreadsheet(reader))
		self.db.fields.bulk_insert(GetFields())
		self.db.metadata.bulk_insert(metadata)

	def ExportToFile(self, filename):
		"""
		filename = A tab-delimited file (utf-8 encoded, without header) containing docIDs and document contents
		"""
		def WriteFile(rows):
			with open(filename, 'w') as f:
				for row in rows:
					doc_id = row.doc_id
					doc_content = self.SanitizeText(row.doc_content.decode('utf-8'))
					f.write(u'{}\t{}\n'.format(doc_id, doc_content).encode('utf-8', 'ignore'))

		rows = self.db().select(self.db.corpus.doc_id, self.db.corpus.doc_content, orderby = self.db.corpus.doc_index)
		WriteFile(rows)

	def _IterExportRows(self, rows, field_count):
		"""
		Yield [doc_id, doc_content, metadata values...] for each corpus row.
		Metadata values are placed by field_index; missing ones stay empty strings.
		"""
		for row in rows:
			doc_content = self.SanitizeText(row.doc_content.decode('utf-8'))
			values = [u''] * field_count
			for d in self.db(self.db.metadata.doc_index == row.doc_index).select(self.db.metadata.field_index, self.db.metadata.value):
				values[d.field_index] = self.SanitizeText(d.value.decode('utf-8'))
			yield [ row.doc_id, doc_content ] + values

	def ExportToSpreadsheet(self, filename, is_csv = False):
		"""
		filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with header) containing the text corpus and all metadata
		"""
		field_names = [ row.field_name for row in self.db().select(self.db.fields.field_name, orderby = self.db.fields.field_index) ]
		field_count = len(field_names)
		all_field_names = [ 'doc_id', 'doc_content' ] + field_names

		def WriteCSV(rows):
			with open(filename, 'w') as f:
				writer = UnicodeWriter(f)
				writer.writerow(all_field_names)
				for all_values in self._IterExportRows(rows, field_count):
					writer.writerow(all_values)

		def WriteTSV(rows):
			with open(filename, 'w') as f:
				f.write(u'{}\n'.format(u'\t'.join(all_field_names)).encode('utf-8', 'ignore'))
				for all_values in self._IterExportRows(rows, field_count):
					f.write(u'{}\n'.format(u'\t'.join(all_values)).encode('utf-8', 'ignore'))

		rows = self.db().select(self.db.corpus.doc_index, self.db.corpus.doc_id, self.db.corpus.doc_content, orderby = self.db.corpus.doc_index)
		if is_csv:
			WriteCSV(rows)
		else:
			WriteTSV(rows)
Example #26
0
class LDA_DB():
	"""
	Access layer for the 'lda.db' SQLite database, built on DAL.

	Meant to be used as a context manager: entering defines the options,
	dimension (terms / docs / topics), matrix and stats tables on the
	connection; exiting commits the pending transaction.
	"""
	FILENAME = 'lda.db'
	CONNECTION = 'sqlite://{}'.format(FILENAME)
	DEFAULT_OPTIONS = {
		'max_co_topic_count' : 10000      # Number of topic pairs to store
	}
	
	def __init__(self, path = None, isInit = False):
		"""
		path = Folder handed to DAL as `folder`; left out of the call when None
		isInit = True while creating the database: lazy table definition is turned off,
		         migration is turned on, indexes are created and default options overwrite stored ones
		"""
		self.isInit = isInit
		# Build the keyword arguments once instead of repeating the DAL call per branch.
		dal_options = { 'lazy_tables' : not self.isInit, 'migrate' : self.isInit }
		if path is not None:
			dal_options['folder'] = path
		self.db = DAL(LDA_DB.CONNECTION, **dal_options)

	def __enter__(self):
		"""Define every table on the connection and return this object."""
		self.DefineOptionsTable()
		self.DefineDimensionTables()
		self.DefineMatrixTables()
		self.DefineStatsTables()
		return self
	
	def __exit__(self, type, value, traceback):
		"""Commit the pending transaction; returns None, so an exception from the with-body still propagates."""
		# NOTE(review): the commit also runs when the with-body raised -- confirm that is intended.
		self.db.commit()

	def _CreateIndexes(self, statements):
		"""Execute each SQL statement in `statements`, but only while initializing the database."""
		if self.isInit:
			for statement in statements:
				self.db.executesql( statement )
	
################################################################################

	def DefineOptionsTable(self):
		"""Define the key/value `options` table and store the entries of DEFAULT_OPTIONS in it."""
		self.db.define_table( 'options',
			Field( 'key'  , 'string', required = True, unique = True ),
			Field( 'value', 'string', required = True ),
			migrate = self.isInit
		)
		# items() exists on both Python 2 and Python 3 dicts; iteritems() is Python 2 only.
		for key, value in LDA_DB.DEFAULT_OPTIONS.items():
			# Stored values are only replaced by the defaults while initializing.
			self.SetOption( key, value, overwrite = self.isInit )


	def SetOption(self, key, value, overwrite = True):
		"""
		Store `value` under `key` in the options table and commit.

		key = Option name
		value = Option value
		overwrite = When False, an already stored value is left untouched
		"""
		where = self.db.options.key == key
		already_stored = self.db( where ).count() > 0
		if not already_stored:
			self.db.options.insert( key = key, value = value )
		elif overwrite:
			self.db( where ).update( value = value )
		self.db.commit()

	def GetOption(self, key):
		"""Return the stored value for `key`, or None when the options table has no such key."""
		where = self.db.options.key == key
		keyValue = self.db( where ).select( self.db.options.value ).first()
		return keyValue.value if keyValue else None

################################################################################

	def DefineDimensionTables(self):
		"""Define the `terms`, `docs` and `topics` tables; their indexes are created only while initializing."""
		self.db.define_table( 'terms',
			Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'term_text' , 'string' , required = True, unique = True ),
			Field( 'term_freq' , 'double' , required = True ),
			Field( 'rank'      , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes((
			'CREATE UNIQUE INDEX IF NOT EXISTS terms_index ON terms (term_index);',
			'CREATE UNIQUE INDEX IF NOT EXISTS terms_text  ON terms (term_text);',
			'CREATE        INDEX IF NOT EXISTS terms_freq  ON terms (term_freq);',
			'CREATE        INDEX IF NOT EXISTS terms_rank  ON terms (rank);',
		))
			
		self.db.define_table( 'docs',
			Field( 'doc_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'doc_id'   , 'string' , required = True, unique = True ),
			Field( 'doc_freq' , 'double' , required = True ),
			Field( 'rank'     , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes((
			'CREATE UNIQUE INDEX IF NOT EXISTS docs_index ON docs (doc_index);',
			'CREATE UNIQUE INDEX IF NOT EXISTS docs_id    ON docs (doc_id);',
			'CREATE        INDEX IF NOT EXISTS docs_freq  ON docs (doc_freq);',
			'CREATE        INDEX IF NOT EXISTS docs_rank  ON docs (rank);',
		))
			
		self.db.define_table( 'topics',
			Field( 'topic_index', 'integer'     , required = True, unique = True, default = -1 ),
			Field( 'topic_freq' , 'double'      , required = True ),
			Field( 'topic_label', 'string'      , required = True ),
			Field( 'topic_desc' , 'string'      , required = True ),
			Field( 'top_terms'  , 'list:integer', required = True ),
			Field( 'top_docs'   , 'list:integer', required = True ),
			Field( 'rank'       , 'integer'     , required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes((
			'CREATE UNIQUE INDEX IF NOT EXISTS topics_index ON topics (topic_index);',
			'CREATE        INDEX IF NOT EXISTS topics_freq  ON topics (topic_freq);',
			'CREATE        INDEX IF NOT EXISTS topics_rank  ON topics (rank);',
		))

	def DefineMatrixTables(self):
		"""Define the `term_topic_matrix` and `doc_topic_matrix` tables; indexes are created only while initializing."""
		self.db.define_table( 'term_topic_matrix',
			Field( 'term_index' , 'integer', required = True, default = -1 ),
			Field( 'topic_index', 'integer', required = True, default = -1 ),
			Field( 'value', 'double' , required = True ),
			Field( 'rank' , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes((
			'CREATE UNIQUE INDEX IF NOT EXISTS term_topic_indexes ON term_topic_matrix (term_index, topic_index);',
			'CREATE INDEX IF NOT EXISTS term_topic_value      ON term_topic_matrix (value);',
			'CREATE INDEX IF NOT EXISTS term_topic_rank       ON term_topic_matrix (rank);',
			'CREATE INDEX IF NOT EXISTS term_topic_termindex  ON term_topic_matrix (term_index);',
			'CREATE INDEX IF NOT EXISTS term_topic_topicindex ON term_topic_matrix (topic_index);',
		))
		
		self.db.define_table( 'doc_topic_matrix',
			Field( 'doc_index'  , 'integer', required = True, default = -1 ),
			Field( 'topic_index', 'integer', required = True, default = -1 ),
			Field( 'value', 'double' , required = True ),
			Field( 'rank' , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes((
			'CREATE UNIQUE INDEX IF NOT EXISTS doc_topic_indexes ON doc_topic_matrix (doc_index, topic_index);',
			'CREATE INDEX IF NOT EXISTS doc_topic_value      ON doc_topic_matrix (value);',
			'CREATE INDEX IF NOT EXISTS doc_topic_rank       ON doc_topic_matrix (rank);',
			'CREATE INDEX IF NOT EXISTS doc_topic_docindex   ON doc_topic_matrix (doc_index);',
			'CREATE INDEX IF NOT EXISTS doc_topic_topicindex ON doc_topic_matrix (topic_index);',
		))

	def DefineStatsTables(self):
		"""Define the `topic_covariance` table; its indexes are created only while initializing."""
		self.db.define_table( 'topic_covariance',
			Field( 'first_topic_index', 'integer', required = True, default = -1 ),
			Field( 'second_topic_index', 'integer', required = True, default = -1 ),
			Field( 'value', 'double' , required = True ),
			Field( 'rank' , 'integer', required = True ),
			migrate = self.isInit
		)
		self._CreateIndexes((
			'CREATE UNIQUE INDEX IF NOT EXISTS topic_covariance_indexes ON topic_covariance (first_topic_index, second_topic_index);',
			'CREATE INDEX IF NOT EXISTS topic_covariance_value ON topic_covariance (value);',
			'CREATE INDEX IF NOT EXISTS topic_covariance_rank ON topic_covariance (rank);',
		))

################################################################################

	def Reset(self):
		"""Delete every row of the dimension, matrix and stats tables; the options table is not touched."""
		for table_name in ( 'terms', 'docs', 'topics', 'term_topic_matrix', 'doc_topic_matrix', 'topic_covariance' ):
			self.db.executesql( 'DELETE FROM {};'.format( table_name ) )
from gluon.sql import DAL
from gluon.sql import Field
from gluon.sql import SQLDB
from gluon.sqlhtml import SQLFORM
from gluon.validators import *
from gluon import fileutils
from gluon.http import *
from gluon.sqlhtml import *
from gluon.tools import fetch
import datetime
from datetime import timedelta
from datetime import date  #

configuration = AppConfig()

db = DAL('sqlite://storage.sqlite')

# Passed as `migrate` to each table definition below.
MIGRATE_SETTING = False

# Administrator accounts: a name plus a key kept in a 'password' field.
db.define_table(
    "admins",
    Field("administrator"),
    Field("admin_key", "password"),
    migrate=MIGRATE_SETTING
)

# ID reference columns; every one of them defaults to the string "0".
db.define_table(
    "id_refs",
    *(Field(column, default="0") for column in (
        "classroom_id_ref",
        "quiz_id_ref",
        "quiz_question_id_ref",
        "student_id_ref",
        "teacher_id_ref"
    )),
    migrate=MIGRATE_SETTING
)
db.define_table("teachers",
Example #28
0
class BOW_DB():
    """
    Access layer for the 'bow.db' SQLite database, built on DAL.

    Meant to be used as a context manager: entering defines the options,
    term statistics, term-pair statistics, sentence-level pair statistics
    and vocabulary tables; exiting commits the pending transaction.
    """
    FILENAME = 'bow.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)
    DEFAULT_OPTIONS = {
        'max_freq_count': 4000,  # Maximum number of terms to store
        'max_co_freq_count': 100000  # Maximum number of term pairs to store
    }

    def __init__(self, path=None, isInit=False):
        """
        path = Folder handed to DAL as `folder`; left out of the call when None
        isInit = True while creating the database: lazy table definition is turned off,
                 migration is enabled, indexes are created and default options overwrite stored ones
        """
        self.isInit = isInit
        # Build the keyword arguments once instead of repeating the DAL call per branch.
        dal_options = {
            'lazy_tables': not self.isInit,
            'migrate_enabled': self.isInit
        }
        if path is not None:
            dal_options['folder'] = path
        self.db = DAL(BOW_DB.CONNECTION, **dal_options)

    def __enter__(self):
        """Define every table on the connection and return this object."""
        self.DefineOptionsTable()
        self.DefineTermStatsTables()
        self.DefineTermCoStatsTables()
        self.DefineSentenceCoStatsTables()
        self.DefineTemporaryTable()
        return self

    def __exit__(self, type, value, traceback):
        """Commit the pending transaction; returns None, so an exception from the with-body still propagates."""
        # NOTE(review): the commit also runs when the with-body raised -- confirm that is intended.
        self.db.commit()

    def _CreateIndexes(self, statements):
        """Execute each SQL statement in `statements`, but only while initializing the database."""
        if self.isInit:
            for statement in statements:
                self.db.executesql(statement)

    def _DefineTermValueTable(self, table_name):
        """Define a per-term table (term_index, value, rank) plus its value and rank indexes."""
        self.db.define_table(table_name,
                             Field('term_index', 'integer', required=True,
                                   unique=True, default=-1),
                             Field('value', 'double', required=True),
                             Field('rank', 'integer', required=True),
                             migrate=self.isInit)
        self._CreateIndexes([
            'CREATE INDEX IF NOT EXISTS {0}_value ON {0} (value);'.format(table_name),
            'CREATE INDEX IF NOT EXISTS {0}_rank ON {0} (rank);'.format(table_name),
        ])

    def _DefineTermPairTable(self, table_name):
        """Define a per-term-pair table (first/second term index, value, rank) plus its three indexes."""
        self.db.define_table(table_name,
                             Field('first_term_index', 'integer',
                                   required=True, default=-1),
                             Field('second_term_index', 'integer',
                                   required=True, default=-1),
                             Field('value', 'double', required=True),
                             Field('rank', 'integer', required=True),
                             migrate=self.isInit)
        self._CreateIndexes([
            'CREATE UNIQUE INDEX IF NOT EXISTS {0}_indexes ON {0} (first_term_index, second_term_index);'.format(table_name),
            'CREATE INDEX IF NOT EXISTS {0}_value ON {0} (value);'.format(table_name),
            'CREATE INDEX IF NOT EXISTS {0}_rank ON {0} (rank);'.format(table_name),
        ])

################################################################################

    def DefineOptionsTable(self):
        """Define the key/value `options` table and store the entries of DEFAULT_OPTIONS in it."""
        self.db.define_table('options',
                             Field('key', 'string', required=True,
                                   unique=True),
                             Field('value', 'string', required=True),
                             migrate=self.isInit)
        # items() exists on both Python 2 and Python 3 dicts; iteritems() is Python 2 only.
        for key, value in BOW_DB.DEFAULT_OPTIONS.items():
            # Stored values are only replaced by the defaults while initializing.
            self.SetOption(key, value, overwrite=self.isInit)

    def SetOption(self, key, value, overwrite=True):
        """
        Store `value` under `key` in the options table and commit.

        key = Option name
        value = Option value
        overwrite = When False, an already stored value is left untouched
        """
        where = self.db.options.key == key
        already_stored = self.db(where).count() > 0
        if not already_stored:
            self.db.options.insert(key=key, value=value)
        elif overwrite:
            self.db(where).update(value=value)
        self.db.commit()

    def GetOption(self, key):
        """Return the stored value for `key`, or None when the options table has no such key."""
        where = self.db.options.key == key
        keyValue = self.db(where).select(self.db.options.value).first()
        return keyValue.value if keyValue else None

################################################################################

    def DefineTermStatsTables(self):
        """Define `term_texts` and the per-term `term_freqs`, `term_probs` and `term_doc_freqs` tables."""
        self.db.define_table('term_texts',
                             Field('term_index', 'integer', required=True,
                                   unique=True, default=-1),
                             Field('term_text', 'string', required=True,
                                   unique=True),
                             migrate=self.isInit)
        self._CreateIndexes([
            'CREATE INDEX IF NOT EXISTS term_text_value ON term_texts (term_text);'
        ])

        for table_name in ('term_freqs', 'term_probs', 'term_doc_freqs'):
            self._DefineTermValueTable(table_name)

    def DefineTermCoStatsTables(self):
        """Define the term-pair tables `term_co_freqs`, `term_co_probs` and `term_g2`."""
        for table_name in ('term_co_freqs', 'term_co_probs', 'term_g2'):
            self._DefineTermPairTable(table_name)

    def DefineSentenceCoStatsTables(self):
        """Define the term-pair tables `sentences_co_freqs`, `sentences_co_probs` and `sentences_g2`."""
        for table_name in ('sentences_co_freqs', 'sentences_co_probs',
                           'sentences_g2'):
            self._DefineTermPairTable(table_name)

################################################################################

    def Reset(self):
        """Delete every row of the term and term-pair statistics tables; options and vocabulary tables are not touched."""
        for table_name in ('term_texts', 'term_freqs', 'term_probs',
                           'term_doc_freqs', 'term_co_freqs', 'term_co_probs',
                           'term_g2', 'sentences_co_freqs',
                           'sentences_co_probs', 'sentences_g2'):
            self.db.executesql('DELETE FROM {};'.format(table_name))

################################################################################

    def DefineTemporaryTable(self):
        """Define the `vocab` (term_index, term_text) and `vocab_text` (term_text) tables."""
        self.db.define_table('vocab',
                             Field('term_index', 'integer', required=True,
                                   unique=True, default=-1),
                             Field('term_text', 'string', required=True),
                             migrate=self.isInit)
        # NOTE(review): a numeric default (-1) on a string field looks like a
        # copy/paste leftover; kept as-is so the table definition is unchanged.
        self.db.define_table('vocab_text',
                             Field('term_text', 'string', required=True,
                                   unique=True, default=-1),
                             migrate=self.isInit)
Example #29
0
def test_DAL_person_table():
    """Open the Postgres connection, define a one-field `person` table, insert one row and commit."""
    database = DAL("postgres://*****:*****@tornado/joseph",
                   check_reserved=['postgres'])
    name_field = Field('name')
    database.define_table('person', name_field)
    database.person.insert(name="Alex")
    database.commit()
Example #30
0
class Corpus_DB():
	"""Access layer for the 'corpus.db' SQLite database, built on DAL."""
	FILENAME = 'corpus.db'
	CONNECTION = 'sqlite://{}'.format(FILENAME)
	# Header names ImportFromSpreadsheet accepts for the document-ID column when no id_key is given.
	DOC_IDS = ['doc_id', 'docid']
	# Header names ImportFromSpreadsheet accepts for the document-content column when no content_key is given.
	DOC_CONTENTS = ['doc_content', 'doccontent', 'docbody', 'doc_body']
	# Key/value pairs written to the options table by DefineOptionsTable when isInit is set.
	DEFAULT_OPTIONS = {
		'token_regex' : r'\w{3,}',
		'min_freq' : 5,
		'min_doc_freq' : 3,
		'max_freq_count'    :   4000,
		'max_co_freq_count' : 160000,
		'max_g2_count'      : 160000
	}
	
	def __init__(self, path = None, isInit = False, isImport = False, isReset = False):
		"""
		path = Folder handed to DAL as `folder`; left out of the call when None
		isInit = Stored on the instance; together with isImport it turns off lazy tables and enables migration
		isImport = Stored on the instance; see isInit
		isReset = Stored on the instance; makes __enter__ call Reset()
		"""
		self.isInit = isInit
		self.isImport = isImport
		self.isReset = isReset
		
		writable = self.isInit or self.isImport
		dal_options = { 'lazy_tables' : not writable, 'migrate_enabled' : writable }
		if path is not None:
			dal_options['folder'] = path
		self.db = DAL(Corpus_DB.CONNECTION, **dal_options)

	def __enter__(self):
		"""
		Define all tables on the connection and return this object.
		When isReset is set, Reset() runs after the options, models, corpus and
		metadata tables are defined and before the statistics tables are.
		"""
		for define in ( self.DefineOptionsTable, self.DefineModelsTable, self.DefineCorpusTable, self.DefineMetadataTables ):
			define()
		if self.isReset:
			self.Reset()
		for define in ( self.DefineTermStatsTables, self.DefineTermCoStatsTables, self.DefineSentenceCoStatsTables, self.DefineTemporaryTable ):
			define()
		return self

	def __exit__(self, type, value, traceback):
		"""Rebuild the full-text search table (DefineCorpusTextSearch does nothing unless importing), then commit."""
		# NOTE(review): both steps also run when the with-body raised -- confirm that is intended.
		self.DefineCorpusTextSearch()
		self.db.commit()
	
################################################################################
	
	def DefineOptionsTable(self):
		"""
		Define the key/value `options` table and, when isInit is set, write
		every entry of DEFAULT_OPTIONS into it (replacing stored values).
		"""
		self.db.define_table( 'options',
			Field( 'key'  , 'string', required = True, unique = True ),
			Field( 'value', 'string', required = True ),
			migrate = self.isImport
		)
		if self.isInit:
			# items() exists on both Python 2 and Python 3 dicts; iteritems() is Python 2 only.
			for key, value in Corpus_DB.DEFAULT_OPTIONS.items():
				# SetOption performs the same update-or-insert that used to be inlined here.
				self.SetOption( key, value )
	
	def SetOption(self, key, value):
		"""Store `value` under `key` in the options table, updating the existing row when there is one."""
		existing = self.db( self.db.options.key == key ).select().first()
		if not existing:
			self.db.options.insert( key = key, value = value )
			return
		existing.update_record( value = value )

	def GetOption(self, key):
		"""Return the stored value for `key`, or None when the options table has no such key."""
		matches = self.db( self.db.options.key == key ).select( self.db.options.value )
		found = matches.first()
		return found.value if found else None

	def DefineModelsTable(self):
		"""Define the `models` table: a unique model key plus a description."""
		columns = (
			Field( 'model_key' , 'string', required = True, unique = True ),
			Field( 'model_desc', 'string', required = True ),
		)
		self.db.define_table( 'models', *columns, migrate = self.isImport )

	def AddModel(self, model_key, model_desc):
		"""Register a model: update the description of an existing `model_key` row, or insert a new row."""
		existing = self.db( self.db.models.model_key == model_key ).select().first()
		if not existing:
			self.db.models.insert( model_key = model_key, model_desc = model_desc )
			return
		existing.update_record( model_desc = model_desc )

	def GetModels(self):
		"""Return the model_key of every row in the models table, as a list."""
		keys = []
		for row in self.db( self.db.models ).select( self.db.models.model_key ):
			keys.append( row.model_key )
		return keys

	def GetModelDescription(self, model_key):
		"""Return the description stored for `model_key`, or None when no such model exists."""
		found = self.db( self.db.models.model_key == model_key ).select().first()
		return found.model_desc if found else None

################################################################################

	def DefineCorpusTable(self):
		"""Define the `corpus` table (doc_index, doc_id, doc_content); its unique indexes are created only while importing."""
		columns = (
			Field( 'doc_index'  , 'integer', required = True, unique = True, default = -1 ),
			Field( 'doc_id'     , 'string' , required = True, unique = True ),
			Field( 'doc_content', 'text'   , required = True ),
		)
		self.db.define_table( 'corpus', *columns, migrate = self.isImport )
		if not self.isImport:
			return
		for statement in (
			'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_index ON corpus (doc_index);',
			'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_id    ON corpus (doc_id);',
		):
			self.db.executesql( statement )
	
	def DefineMetadataTables(self):
		"""
		Define the `fields` table (index, name and type of each metadata column) and the
		`metadata` table (one value per document index and field index).
		Indexes for both tables are created only while importing.
		"""
		importing = self.isImport

		field_columns = (
			Field( 'field_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'field_name' , 'string' , required = True, unique = True ),
			Field( 'field_type' , 'string' , required = True ),
		)
		self.db.define_table( 'fields', *field_columns, migrate = importing )
		if importing:
			for statement in (
				'CREATE UNIQUE INDEX IF NOT EXISTS field_index ON fields (field_index);',
				'CREATE UNIQUE INDEX IF NOT EXISTS field_name  ON fields (field_name);',
				'CREATE        INDEX IF NOT EXISTS field_type  ON fields (field_type);',
			):
				self.db.executesql( statement )
		
		metadata_columns = (
			Field( 'doc_index'  , 'integer', required = True, default = -1 ),
			Field( 'field_index', 'integer', required = True, default = -1 ),
			Field( 'value'      , 'text'   , required = True ),
		)
		self.db.define_table( 'metadata', *metadata_columns, migrate = importing )
		if importing:
			for statement in (
				'CREATE UNIQUE INDEX IF NOT EXISTS metadata_indexes     ON metadata (doc_index, field_index);',
				'CREATE        INDEX IF NOT EXISTS metadata_doc_index   ON metadata (doc_index);',
				'CREATE        INDEX IF NOT EXISTS metadata_field_index ON metadata (field_index);',
			):
				self.db.executesql( statement )

################################################################################

	def DefineCorpusTextSearch(self):
		"""Drop and rebuild the `corpus_search` FTS3 virtual table from the corpus table; does nothing unless importing."""
		if not self.isImport:
			return
		for statement in (
			'DROP TABLE IF EXISTS corpus_search;',
			'CREATE VIRTUAL TABLE corpus_search USING fts3 (doc_content TEXT);',
			'INSERT INTO corpus_search (rowid, doc_content) SELECT doc_index, doc_content FROM corpus;',
		):
			self.db.executesql( statement )
	
################################################################################

	def ImportFromFile(self, filename):
		"""
		Insert one corpus row per line of a text file.

		filename = A plain-text file (utf-8 encoded) containing one document per line.
		           A line without a tab is taken as the content and gets the id 'doc<line number>';
		           otherwise the first two tab-separated columns are the id and the content.
		"""
		def ReadFile():
			with open(filename, 'r') as f:
				for doc_index, line in enumerate(f):
					columns = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
					if len(columns) == 1:
						doc_id, doc_content = 'doc{}'.format(doc_index+1), columns[0]
					else:
						doc_id, doc_content = columns[0], columns[1]
					yield dict(
						doc_index = doc_index,
						doc_id = doc_id,
						doc_content = doc_content.encode('utf-8')
					)
		self.db.corpus.bulk_insert(ReadFile())

	def ImportFromFolder(self, glob_pattern):
		"""
		Insert one corpus row per matching file, using the file path as the document id.

		glob_pattern = A folder of files or a glob pattern for list of files (utf-8 encoded, one document per file)
		"""
		def ReadFolder():
			# Sorting the matches makes the doc_index assignment follow path order.
			for doc_index, path in enumerate(sorted(glob.glob(glob_pattern))):
				with open(path, 'r') as f:
					text = f.read().decode('utf-8', 'ignore')
					yield dict(
						doc_index = doc_index,
						doc_id = path,
						doc_content = text.encode('utf-8')
					)
		self.db.corpus.bulk_insert(ReadFolder())

	def ImportFromSpreadsheet(self, filename, id_key = None, content_key = None, is_csv = False):
		"""
		Import documents and their metadata from a spreadsheet into the corpus, fields and metadata tables.

		filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with a header row containing column names)
		id_key = Name of the column containing unique document IDs (matched case-insensitively; defaults to the names in Corpus_DB.DOC_IDS)
		content_key = Name of the column containing the document contents (matched case-insensitively; defaults to the names in Corpus_DB.DOC_CONTENTS)
		is_csv = True to read the file through UnicodeReader, False (default) to split each line on tabs
		"""
		# Header names are lower-cased before the lookup below, so the lookup keys have to be
		# lower-cased as well; otherwise a caller-supplied key such as 'DocID' never matches its own column.
		doc_id_keys = frozenset(key.lower() for key in ([id_key] if id_key is not None else Corpus_DB.DOC_IDS))
		doc_content_keys = frozenset(key.lower() for key in ([content_key] if content_key is not None else Corpus_DB.DOC_CONTENTS))
		# Filled in by ReadSpreadsheet as a side effect while the corpus rows are being consumed.
		field_indexes = []
		field_names = []
		field_types = []
		metadata = []
		
		def ReadCSV():
			with open(filename, 'r') as f:
				reader = UnicodeReader(f)
				for row in reader:
					yield row

		def ReadTSV():
			with open(filename, 'r') as f:
				for line in f:
					yield line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
					
		def ReadSpreadsheet(reader):
			# Column positions of the id/content columns; None when the header has no such column.
			field_doc_id = None
			field_doc_content = None
			for row_index, values in enumerate(reader):
				if row_index == 0:
					# Header row: every column that is neither id nor content becomes a metadata field.
					for index, field in enumerate(values):
						if field.lower() in doc_id_keys:
							field_doc_id = index
						elif field.lower() in doc_content_keys:
							field_doc_content = index
						else:
							field_index = len(field_indexes)
							field_indexes.append(field_index)
							field_names.append(field)
							# Start with the narrowest type; it is widened below as values are seen.
							field_types.append('integer')
				else:
					doc_index = row_index - 1
					# Fallbacks used when the header had no id / content column.
					doc_id = 'doc{:d}'.format(doc_index+1)
					doc_content = ''
					field_index = 0
					for index, value in enumerate(values):
						if field_doc_id == index:
							doc_id = value
						elif field_doc_content == index:
							doc_content = value
						else:
							metadata.append({
								'doc_index' : doc_index,
								'field_index' : field_index,
								'value' : value.encode('utf-8')
							})
						
							# [START] infer field type
							# Widen integer -> double -> string as soon as a value fails to parse.
							field_type = field_types[field_index]
							if field_type == 'integer':
								try:
									int(value)
								except ValueError:
									field_type = 'double'
							if field_type == 'double':
								try:
									float(value)
								except ValueError:
									field_type = 'string'
							field_types[field_index] = field_type
							# [END] infer field type
						
							field_index += 1
					yield {
						'doc_index' : doc_index,
						'doc_id' : doc_id,
						'doc_content' : doc_content.encode('utf-8')
					}

		def GetFields():
			for field_index in field_indexes:
				yield {
					'field_index' : field_index,
					'field_name' : field_names[field_index],
					'field_type' : field_types[field_index]
				}
						
		reader = ReadCSV() if is_csv else ReadTSV()
		# The corpus rows go first: the field and metadata lists are only populated
		# while the ReadSpreadsheet generator is being drained.
		self.db.corpus.bulk_insert(ReadSpreadsheet(reader))
		self.db.fields.bulk_insert(GetFields())
		self.db.metadata.bulk_insert(metadata)
		
	def ExportToFile(self, filename):
		"""
		Write every document as one 'doc_id<TAB>doc_content' line, ordered by doc_index.

		filename = A tab-delimited file (utf-8 encoded, without header) containing docIDs and document contents
		"""
		docs = self.db().select(self.db.corpus.doc_id, self.db.corpus.doc_content, orderby = self.db.corpus.doc_index)
		# Runs of whitespace inside a document are collapsed to a single space.
		whitespace = re.compile(r'\s+')
		with open(filename, 'w') as f:
			for doc in docs:
				flattened = whitespace.sub(u' ', doc.doc_content.decode('utf-8'))
				f.write(u'{}\t{}\n'.format(doc.doc_id, flattened).encode('utf-8'))

	def ExportToSpreadsheet(self, filename, is_csv = False):
		"""
		Write the whole corpus, one document per row together with its metadata, to a spreadsheet file.

		filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with header) containing the text corpus and all metadata
		is_csv = True to write comma-separated rows through UnicodeWriter, False (default) to write tab-separated lines
		"""
		db = self.db
		header = [ 'doc_id', 'doc_content' ]
		for field in db().select(db.fields.field_name, orderby = db.fields.field_index):
			header.append(field.field_name)
		metadata_count = len(header) - 2

		def BuildRecord(doc, whitespace):
			# Cells for one document: id, content, then one cell per metadata field (left empty
			# when no value is stored); runs of whitespace are collapsed to a single space.
			content = whitespace.sub(u' ', doc.doc_content.decode('utf-8'))
			cells = [u''] * metadata_count
			stored = db(db.metadata.doc_index == doc.doc_index).select(db.metadata.field_index, db.metadata.value)
			for entry in stored:
				cells[entry.field_index] = whitespace.sub(u' ', entry.value.decode('utf-8'))
			return [ doc.doc_id, content ] + cells

		docs = db().select(db.corpus.doc_index, db.corpus.doc_id, db.corpus.doc_content, orderby = db.corpus.doc_index)
		whitespace = re.compile(r'\s+')
		with open(filename, 'w') as f:
			if is_csv:
				writer = UnicodeWriter(f)
				writer.writerow(header)
				for doc in docs:
					writer.writerow(BuildRecord(doc, whitespace))
			else:
				f.write(u'{}\n'.format(u'\t'.join(header)).encode('utf-8'))
				for doc in docs:
					f.write(u'{}\n'.format(u'\t'.join(BuildRecord(doc, whitespace))).encode('utf-8'))

################################################################################

	def DefineTermStatsTables(self):
		"""
		Declare the per-term tables: term_texts, term_freqs, term_probs and term_doc_freqs.

		Every table is declared with migrate = self.isInit. When self.isInit is set,
		the supporting SQL indexes are created as well (each with IF NOT EXISTS).
		"""
		db = self.db
		migrate = self.isInit

		db.define_table( 'term_texts',
			Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'term_text' , 'string' , required = True, unique = True ),
			migrate = migrate
		)
		if migrate:
			db.executesql( 'CREATE INDEX IF NOT EXISTS term_text_value ON term_texts (term_text);' )

		# The remaining tables share one layout: a unique term index plus a value and its rank.
		stat_tables = (
			( 'term_freqs', (
				'CREATE INDEX IF NOT EXISTS term_freqs_value ON term_freqs (value);',
				'CREATE INDEX IF NOT EXISTS term_freqs_rank ON term_freqs (rank);',
			) ),
			( 'term_probs', (
				'CREATE INDEX IF NOT EXISTS term_probs_value ON term_probs (value);',
				'CREATE INDEX IF NOT EXISTS term_probs_rank ON term_probs (rank);',
			) ),
			( 'term_doc_freqs', (
				'CREATE INDEX IF NOT EXISTS term_doc_freqs_value ON term_doc_freqs (value);',
				'CREATE INDEX IF NOT EXISTS term_doc_freqs_rank ON term_doc_freqs (rank);',
			) ),
		)
		for table_name, index_statements in stat_tables:
			db.define_table( table_name,
				Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
				Field( 'value', 'double' , required = True ),
				Field( 'rank' , 'integer', required = True ),
				migrate = migrate
			)
			if migrate:
				for statement in index_statements:
					db.executesql( statement )

	def DefineTermCoStatsTables(self):
		"""
		Declare the term-pair tables: term_co_freqs, term_co_probs and term_g2.

		Each table holds a (first_term_index, second_term_index) pair with a value and its rank,
		and is declared with migrate = self.isInit. When self.isInit is set, a unique index
		on the pair and plain indexes on value and rank are created (each with IF NOT EXISTS).
		"""
		db = self.db
		migrate = self.isInit
		pair_tables = (
			( 'term_co_freqs', (
				'CREATE UNIQUE INDEX IF NOT EXISTS term_co_freqs_indexes ON term_co_freqs (first_term_index, second_term_index);',
				'CREATE INDEX IF NOT EXISTS term_co_freqs_value ON term_co_freqs (value);',
				'CREATE INDEX IF NOT EXISTS term_co_freqs_rank ON term_co_freqs (rank);',
			) ),
			( 'term_co_probs', (
				'CREATE UNIQUE INDEX IF NOT EXISTS term_co_probs_indexes ON term_co_probs (first_term_index, second_term_index);',
				'CREATE INDEX IF NOT EXISTS term_co_probs_value ON term_co_probs (value);',
				'CREATE INDEX IF NOT EXISTS term_co_probs_rank ON term_co_probs (rank);',
			) ),
			( 'term_g2', (
				'CREATE UNIQUE INDEX IF NOT EXISTS term_g2_indexes ON term_g2 (first_term_index, second_term_index);',
				'CREATE INDEX IF NOT EXISTS term_g2_value ON term_g2 (value);',
				'CREATE INDEX IF NOT EXISTS term_g2_rank ON term_g2 (rank);',
			) ),
		)
		for table_name, index_statements in pair_tables:
			db.define_table( table_name,
				Field( 'first_term_index' , 'integer', required = True, default = -1 ),
				Field( 'second_term_index', 'integer', required = True, default = -1 ),
				Field( 'value', 'double' , required = True ),
				Field( 'rank' , 'integer', required = True ),
				migrate = migrate
			)
			if migrate:
				for statement in index_statements:
					db.executesql( statement )

	def DefineSentenceCoStatsTables(self):
		"""
		Declare the sentence-level term-pair tables: sentences_co_freqs, sentences_co_probs and sentences_g2.

		Each table holds a (first_term_index, second_term_index) pair with a value and its rank,
		and is declared with migrate = self.isInit. When self.isInit is set, a unique index
		on the pair and plain indexes on value and rank are created (each with IF NOT EXISTS).
		"""
		db = self.db
		migrate = self.isInit
		pair_tables = (
			( 'sentences_co_freqs', (
				'CREATE UNIQUE INDEX IF NOT EXISTS sentences_co_freqs_indexes ON sentences_co_freqs (first_term_index, second_term_index);',
				'CREATE INDEX IF NOT EXISTS sentences_co_freqs_value ON sentences_co_freqs (value);',
				'CREATE INDEX IF NOT EXISTS sentences_co_freqs_rank ON sentences_co_freqs (rank);',
			) ),
			( 'sentences_co_probs', (
				'CREATE UNIQUE INDEX IF NOT EXISTS sentences_co_probs_indexes ON sentences_co_probs (first_term_index, second_term_index);',
				'CREATE INDEX IF NOT EXISTS sentences_co_probs_value ON sentences_co_probs (value);',
				'CREATE INDEX IF NOT EXISTS sentences_co_probs_rank ON sentences_co_probs (rank);',
			) ),
			( 'sentences_g2', (
				'CREATE UNIQUE INDEX IF NOT EXISTS sentences_g2_indexes ON sentences_g2 (first_term_index, second_term_index);',
				'CREATE INDEX IF NOT EXISTS sentences_g2_value ON sentences_g2 (value);',
				'CREATE INDEX IF NOT EXISTS sentences_g2_rank ON sentences_g2 (rank);',
			) ),
		)
		for table_name, index_statements in pair_tables:
			db.define_table( table_name,
				Field( 'first_term_index' , 'integer', required = True, default = -1 ),
				Field( 'second_term_index', 'integer', required = True, default = -1 ),
				Field( 'value', 'double' , required = True ),
				Field( 'rank' , 'integer', required = True ),
				migrate = migrate
			)
			if migrate:
				for statement in index_statements:
					db.executesql( statement )

################################################################################

	def Reset(self):
		"""
		Delete all rows from every term and sentence statistics table (the tables themselves are kept).
		"""
		statements = (
			'DELETE FROM term_texts;',
			'DELETE FROM term_freqs;',
			'DELETE FROM term_probs;',
			'DELETE FROM term_doc_freqs;',
			'DELETE FROM term_co_freqs;',
			'DELETE FROM term_co_probs;',
			'DELETE FROM term_g2;',
			'DELETE FROM sentences_co_freqs;',
			'DELETE FROM sentences_co_probs;',
			'DELETE FROM sentences_g2;',
		)
		for statement in statements:
			self.db.executesql( statement )

################################################################################

	def DefineTemporaryTable(self):
		"""
		Declare the 'vocab' (term_index, term_text) and 'vocab_text' (term_text) tables.

		Both tables are declared with migrate = self.isInit; no SQL indexes are created here.
		"""
		db = self.db
		migrate = self.isInit

		vocab_fields = [
			Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'term_text', 'string', required = True ),
		]
		db.define_table( 'vocab', *vocab_fields, migrate = migrate )

		# NOTE(review): a 'string' field with the integer default -1 looks like a copy-paste
		# from term_index; kept as-is to preserve behaviour -- confirm whether it is intended.
		vocab_text_fields = [
			Field( 'term_text', 'string', required = True, unique = True, default = -1 ),
		]
		db.define_table( 'vocab_text', *vocab_text_fields, migrate = migrate )
class Corpus_DB():
	"""
	Wrapper around a DAL connection to the corpus database (sqlite file 'corpus.db').

	Intended to be used as a context manager: entering defines the options, models,
	corpus and metadata tables; exiting commits the connection.
	"""
	FILENAME = 'corpus.db'
	CONNECTION = 'sqlite://{}'.format(FILENAME)
	# Lowercase spreadsheet header names accepted as the document-id / document-content columns
	DOC_IDS = ['doc_id', 'docid']
	DOC_CONTENTS = ['doc_content', 'doccontent', 'docbody', 'doc_body']
	# Entry describing the corpus itself; GetModel/GetModels return it without reading the models table
	MODEL_KEY = 'corpus'
	MODEL_DESC = 'Text Corpus'
	MODEL_ENTRY = { 'model_key' : MODEL_KEY, 'model_desc' : MODEL_DESC }
	# Characters replaced by a space in SanitizeText before values are exported
	LINEBREAKS_TABS = re.compile(r'[\t\r\n\f]')
	
	DEFAULT_OPTIONS = {
		'token_regex' : r'\w{3,}',        # Tokenize a corpus into a bag-of-words language model
		'min_freq' : 5,                   # Number of times a term must appear in the corpus
		'min_doc_freq' : 3                # Number of documents in which a terms must appear
	}
	
	def __init__(self, path = None, isInit = False):
		"""
		Open the database connection.

		path = Folder handed to DAL as 'folder'; omitted from the call when None
		isInit = When true, migrations are enabled and tables are defined eagerly;
		         otherwise tables are lazy and migrations are disabled
		"""
		self.isInit = isInit
		options = { 'lazy_tables' : not self.isInit, 'migrate_enabled' : self.isInit }
		if path is not None:
			options['folder'] = path
		self.db = DAL(Corpus_DB.CONNECTION, **options)

	def __enter__(self):
		"""
		Define all tables used by this class and return self.
		"""
		self.DefineOptionsTable()
		self.DefineModelsTable()
		self.DefineCorpusTable()
		self.DefineMetadataTables()
		return self

	def __exit__(self, type, value, traceback):
		"""
		Commit the connection when the with-block ends.

		Nothing is returned, so an exception raised inside the with-block still propagates.
		"""
		self.DefineCorpusTextSearch()
		self.db.commit()
	
################################################################################
	
	def DefineOptionsTable(self):
		"""
		Define the key/value 'options' table and seed it from DEFAULT_OPTIONS.

		Existing option values are overwritten only when self.isInit is set.
		"""
		self.db.define_table( 'options',
			Field( 'key'  , 'string', required = True, unique = True ),
			Field( 'value', 'string', required = True ),
			migrate = self.isInit
		)
		# items() rather than iteritems(): identical result here and valid on both Python 2 and 3
		for key, value in Corpus_DB.DEFAULT_OPTIONS.items():
			self.SetOption( key, value, overwrite = self.isInit )
			
	
	def SetOption(self, key, value, overwrite = True):
		"""
		Store an option and commit.

		key = Option name
		value = Option value
		overwrite = When false, an already existing option keeps its current value
		"""
		where = self.db.options.key == key
		if self.db( where ).count() > 0:
			if overwrite:
				self.db( where ).update( value = value )
		else:
			self.db.options.insert( key = key, value = value )
		self.db.commit()

	def GetOption(self, key):
		"""
		Return the stored value of option 'key', or None when no such option exists.
		"""
		where = self.db.options.key == key
		keyValue = self.db( where ).select( self.db.options.value ).first()
		if keyValue:
			return keyValue.value
		else:
			return None

	def DefineModelsTable(self):
		"""
		Define the 'models' table (unique model_key plus a model_desc).
		"""
		self.db.define_table( 'models',
			Field( 'model_key' , 'string', required = True, unique = True ),
			Field( 'model_desc', 'string', required = True ),
			migrate = self.isInit
		)

	def AddModel(self, model_key, model_desc):
		"""
		Insert a model entry, or update its description when the key already exists, then commit.
		"""
		where = self.db.models.model_key == model_key
		if self.db( where ).count() > 0:
			self.db( where ).update( model_desc = model_desc )
		else:
			self.db.models.insert( model_key = model_key, model_desc = model_desc )
		self.db.commit()
	
	def GetModel(self, model_key):
		"""
		Return a { 'model_key', 'model_desc' } dict for 'model_key', or None when it is unknown.

		The corpus key (MODEL_KEY) is answered with MODEL_ENTRY without querying the database.
		"""
		if model_key == Corpus_DB.MODEL_KEY:
			return Corpus_DB.MODEL_ENTRY
		where = self.db.models.model_key == model_key
		keyValue = self.db( where ).select( self.db.models.ALL ).first()
		if keyValue:
			return { 'model_key' : keyValue.model_key, 'model_desc' : keyValue.model_desc }
		else:
			return None

	def GetModels(self):
		"""
		Return MODEL_ENTRY followed by the rows of the models table (as produced by as_list()).
		"""
		rows = self.db( self.db.models ).select( self.db.models.model_key, self.db.models.model_desc ).as_list()
		return [ Corpus_DB.MODEL_ENTRY ] + rows

################################################################################

	def DefineCorpusTable(self):
		"""
		Define the 'corpus' table (doc_index, doc_id, doc_content) and, when self.isInit
		is set, unique indexes on doc_index and doc_id.
		"""
		self.db.define_table( 'corpus',
			Field( 'doc_index'  , 'integer', required = True, unique = True, default = -1 ),
			Field( 'doc_id'     , 'string' , required = True, unique = True ),
			Field( 'doc_content', 'text'   , required = True ),
			migrate = self.isInit
		)
		if self.isInit:
			self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_index ON corpus (doc_index);' )
			self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_id    ON corpus (doc_id);' )
	
	def DefineMetadataTables(self):
		"""
		Define the 'fields' table (one row per metadata column) and the 'metadata' table
		(one row per document/field pair), plus their indexes when self.isInit is set.
		"""
		self.db.define_table( 'fields',
			Field( 'field_index', 'integer', required = True, unique = True, default = -1 ),
			Field( 'field_name' , 'string' , required = True, unique = True ),
			Field( 'field_type' , 'string' , required = True ),
			migrate = self.isInit
		)
		if self.isInit:
			self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS field_index ON fields (field_index);' )
			self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS field_name  ON fields (field_name);'  )
			self.db.executesql( 'CREATE        INDEX IF NOT EXISTS field_type  ON fields (field_type);'  )
		
		self.db.define_table( 'metadata',
			Field( 'doc_index'  , 'integer', required = True, default = -1 ),
			Field( 'field_index', 'integer', required = True, default = -1 ),
			Field( 'value'      , 'text'   , required = True ),
			migrate = self.isInit
		)
		if self.isInit:
			self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS metadata_indexes     ON metadata (doc_index, field_index);' )
			self.db.executesql( 'CREATE        INDEX IF NOT EXISTS metadata_doc_index   ON metadata (doc_index);'              )
			self.db.executesql( 'CREATE        INDEX IF NOT EXISTS metadata_field_index ON metadata (field_index);'            )

################################################################################

	def DefineCorpusTextSearch(self):
		"""
		Placeholder called from __exit__; currently does nothing (the full-text-search setup below is disabled).
		"""
		if self.isInit:
			pass
#			self.db.executesql( 'DROP TABLE IF EXISTS corpus_search;' )
#			self.db.executesql( 'CREATE VIRTUAL TABLE corpus_search USING fts3 (doc_content TEXT);' )
#			self.db.executesql( 'INSERT INTO corpus_search (rowid, doc_content) SELECT doc_index, doc_content FROM corpus;' )
	
################################################################################

	def SanitizeText(self, text):
		"""
		Return 'text' with every tab, carriage return, newline and form feed replaced by a
		space, and with leading/trailing whitespace stripped.
		"""
		text = Corpus_DB.LINEBREAKS_TABS.sub(u' ', text).strip()
		return text

	@staticmethod
	def _ColumnKeys(key, defaults):
		"""
		Return the lowercase header names that identify a spreadsheet column.

		key = Caller-supplied column name, or None to fall back to 'defaults'
		defaults = Names used when 'key' is None

		Header cells are compared after lower(), so the names are lowercased here as well;
		otherwise a caller-supplied key containing uppercase letters could never match.
		"""
		names = [key] if key is not None else defaults
		return frozenset(name.lower() for name in names)

	def ImportFromFile(self, filename):
		"""
		filename = A plain-text file (utf-8 encoded) containing one document per line

		A line without a tab is taken as the document content and gets the id 'doc<N>'
		(N = 1-based line number). Otherwise the first tab-separated value is the id and
		the second the content; any further values on the line are ignored.
		"""
		def ReadFile():
			with open(filename, 'r') as f:
				for index, line in enumerate(f):
					doc_index = index
					values = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
					if len(values) == 1:
						doc_id = 'doc{}'.format(doc_index+1)
						doc_content = values[0]
					else:
						doc_id = values[0]
						doc_content = values[1]
					yield {
						'doc_index' : doc_index,
						'doc_id' : doc_id,
						'doc_content' : doc_content.encode('utf-8', 'ignore')
					}
		self.db.corpus.bulk_insert(ReadFile())

	def ImportFromFolder(self, glob_pattern):
		"""
		glob_pattern = A folder of files or a glob pattern for list of files (utf-8 encoded, one document per file)

		Files are processed in sorted path order; each file's path is used as its doc_id.
		"""
		def ReadFolder():
			filenames = sorted(glob.glob(glob_pattern))
			for index, filename in enumerate(filenames):
				doc_index = index
				doc_id = filename
				with open(filename, 'r') as f:
					doc_content = f.read().decode('utf-8', 'ignore')
				# Yield only after the file is closed so no handle stays open while the generator is suspended
				yield {
					'doc_index' : doc_index,
					'doc_id' : doc_id,
					'doc_content' : doc_content.encode('utf-8', 'ignore')
				}
		self.db.corpus.bulk_insert(ReadFolder())

	def ImportFromSpreadsheet(self, filename, id_key = None, content_key = None, is_csv = False):
		"""
		filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with a header row containing column names)
		id_key = Name of the column containing unique document IDs (matched case-insensitively; defaults to DOC_IDS)
		content_key = Name of the column containing the document contents (matched case-insensitively; defaults to DOC_CONTENTS)
		is_csv = Read through UnicodeReader when true; otherwise split each line on tabs

		Every other column becomes a metadata field. Its type starts as 'integer', is
		demoted to 'double' once a value fails int(), and to 'string' once a value
		fails float(); it is never promoted back.
		"""
		doc_id_keys = Corpus_DB._ColumnKeys(id_key, Corpus_DB.DOC_IDS)
		doc_content_keys = Corpus_DB._ColumnKeys(content_key, Corpus_DB.DOC_CONTENTS)
		# Filled as a side effect of consuming ReadSpreadsheet()
		field_indexes = []
		field_names = []
		field_types = []
		metadata = []
		
		def ReadCSV():
			with open(filename, 'r') as f:
				reader = UnicodeReader(f)
				for row in reader:
					yield row

		def ReadTSV():
			with open(filename, 'r') as f:
				for line in f:
					yield line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
					
		def ReadSpreadsheet(reader):
			field_doc_id = None
			field_doc_content = None
			for row_index, values in enumerate(reader):
				if row_index == 0:
					# Header row: locate the id/content columns, register everything else as a metadata field
					for index, field in enumerate(values):
						if field.lower() in doc_id_keys:
							field_doc_id = index
						elif field.lower() in doc_content_keys:
							field_doc_content = index
						else:
							field_index = len(field_indexes)
							field_indexes.append(field_index)
							field_names.append(field)
							field_types.append('integer')
				else:
					doc_index = row_index - 1
					# Fallbacks used when the spreadsheet has no id / content column
					doc_id = 'doc{:d}'.format(doc_index+1)
					doc_content = ''
					field_index = 0
					for index, value in enumerate(values):
						if field_doc_id == index:
							doc_id = value
						elif field_doc_content == index:
							doc_content = value
						else:
							metadata.append({
								'doc_index' : doc_index,
								'field_index' : field_index,
								'value' : value.encode('utf-8', 'ignore')
							})
						
							# [START] infer field type
							field_type = field_types[field_index]
							if field_type == 'integer':
								try:
									int(value)
								except ValueError:
									field_type = 'double'
							if field_type == 'double':
								try:
									float(value)
								except ValueError:
									field_type = 'string'
							field_types[field_index] = field_type
							# [END] infer field type
						
							field_index += 1
					yield {
						'doc_index' : doc_index,
						'doc_id' : doc_id,
						'doc_content' : doc_content.encode('utf-8', 'ignore')
					}

		def GetFields():
			for field_index in field_indexes:
				field_name = field_names[field_index]
				field_type = field_types[field_index]
				yield {
					'field_index' : field_index,
					'field_name' : field_name,
					'field_type' : field_type
				}
						
		reader = ReadCSV() if is_csv else ReadTSV()
		# The corpus insert must run first: consuming ReadSpreadsheet() is what fills
		# the field lists and the metadata list used by the two inserts that follow.
		self.db.corpus.bulk_insert(ReadSpreadsheet(reader))
		self.db.fields.bulk_insert(GetFields())
		self.db.metadata.bulk_insert(metadata)
		
	def ExportToFile(self, filename):
		"""
		filename = A tab-delimited file (utf-8 encoded, without header) containing docIDs and document contents

		Documents are written in doc_index order, one 'doc_id<TAB>doc_content' line each.
		"""
		def WriteFile(rows):
			with open(filename, 'w') as f:
				for row in rows:
					doc_id = row.doc_id
					doc_content = self.SanitizeText(row.doc_content.decode('utf-8'))
					f.write(u'{}\t{}\n'.format(doc_id, doc_content).encode('utf-8', 'ignore'))

		rows = self.db().select(self.db.corpus.doc_id, self.db.corpus.doc_content, orderby = self.db.corpus.doc_index)
		WriteFile(rows)

	def ExportToSpreadsheet(self, filename, id_key = None, content_key = None, is_csv = False):
		"""
		filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with header) containing the text corpus and all metadata
		id_key = Header name for the document-id column (defaults to 'doc_id')
		content_key = Header name for the document-content column (defaults to 'doc_content')
		is_csv = Write through UnicodeWriter when true; otherwise write tab-separated lines
		"""
		field_names = [ row.field_name for row in self.db().select(self.db.fields.field_name, orderby = self.db.fields.field_index) ]
		field_count = len(field_names)
		all_field_names = [
			id_key if id_key is not None else 'doc_id',
			content_key if content_key is not None else 'doc_content'
		] + field_names

		def RowValues(row):
			# One output record: doc id, sanitized content, then one cell per metadata field ('' where absent)
			doc_content = self.SanitizeText(row.doc_content.decode('utf-8'))
			values = [u''] * field_count
			for d in self.db(self.db.metadata.doc_index == row.doc_index).select(self.db.metadata.field_index, self.db.metadata.value):
				values[d.field_index] = self.SanitizeText(d.value.decode('utf-8'))
			return [ row.doc_id, doc_content ] + values

		def WriteCSV(rows):
			with open(filename, 'w') as f:
				writer = UnicodeWriter(f)
				writer.writerow(all_field_names)
				for row in rows:
					writer.writerow(RowValues(row))
			
		def WriteTSV(rows):
			with open(filename, 'w') as f:
				f.write(u'{}\n'.format(u'\t'.join(all_field_names)).encode('utf-8', 'ignore'))
				for row in rows:
					f.write(u'{}\n'.format(u'\t'.join(RowValues(row))).encode('utf-8', 'ignore'))

		rows = self.db().select(self.db.corpus.doc_index, self.db.corpus.doc_id, self.db.corpus.doc_content, orderby = self.db.corpus.doc_index)
		if is_csv:
			WriteCSV(rows)
		else:
			WriteTSV(rows)
Example #32
0
http://www.nervatura.com
Copyright © 2011-2015, Csaba Kappel
License: LGPLv3
http://www.nervatura.com/nerva2py/default/licenses
"""

# Never executed: the condition is the constant 0.
# NOTE(review): presumably kept only so editors / static analysers can resolve the
# names (session, request, T, db, response) that the framework supplies at runtime -- confirm.
if 0:
    from gluon.globals import Session
    global session
    session = Session()
    global request
    request = globals.Request()
    import gluon.languages.translator as T
    from gluon.sql import DAL
    global db
    db = DAL()
    global response
    response = globals.Response()

import pyamf
from pyamf.flex import ArrayCollection
from gluon.tools import Service

from nerva2py.nervastore import NervaStore
from nerva2py.tools import NervaTools
from nerva2py.npi import Npi
import nerva2py.models

if request.env.http_origin:
    response.headers['Access-Control-Allow-Origin'] = request.env.http_origin
else:
Example #33
0
http://www.nervatura.com/nerva2py/default/licenses
"""

# Never executed: the condition is the constant 0.
# NOTE(review): presumably kept only so editors / static analysers can resolve the
# names (response, request, session, db, T, DEMO_MODE) that are supplied at runtime -- confirm.
if 0:
    global response
    response = globals.Response()
    global request
    request = globals.Request()
    from gluon.globals import Session

    global session
    session = Session()
    from gluon.sql import DAL

    global db
    db = DAL()
    import gluon.languages.translator as T
    from db import DEMO_MODE

from gluon.http import redirect
from gluon.sqlhtml import DIV, SPAN, A, INPUT, FORM
from gluon.html import BR, HR, SELECT, OPTION, P, IMG, TABLE, TR, TD
from gluon.validators import IS_NOT_EMPTY
from gluon.html import URL
from gluon.storage import Storage
import os

from gluon.sqlhtml import SQLFORM
from gluon.validators import IS_IN_DB, IS_IN_SET, IS_EMPTY_OR
from gluon.html import TBODY, THEAD, TH, TEXTAREA, CODE
from gluon.sql import Field