class ITM_DB():
    """Context-managed handle on the ``itm.db`` SQLite database.

    Entering the context defines the ``options`` key/value table and applies
    ``DEFAULT_OPTIONS``; leaving the context commits the connection.
    """

    FILENAME = 'itm.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)
    DEFAULT_OPTIONS = {}

    def __init__(self, path=None, isInit=False):
        """Open the DAL connection.

        path   -- forwarded to DAL as ``folder``; omitted when None.
        isInit -- forwarded to DAL as ``migrate=isInit`` and
                  ``lazy_tables=not isInit``; also decides whether the
                  defaults overwrite already-stored options.
        """
        self.isInit = isInit
        dal_args = {'lazy_tables': not self.isInit, 'migrate': self.isInit}
        if path is not None:
            dal_args['folder'] = path
        self.db = DAL(ITM_DB.CONNECTION, **dal_args)

    def __enter__(self):
        """Define the options table and return this wrapper."""
        self.DefineOptionsTable()
        return self

    def __exit__(self, type, value, traceback):
        """Commit the connection; the exception arguments are not inspected."""
        self.db.commit()

    ################################################################################

    def DefineOptionsTable(self):
        """Define the ``options`` table and store every default option.

        A default only replaces an existing value when ``isInit`` is set.
        """
        self.db.define_table(
            'options',
            Field('key', 'string', required=True, unique=True),
            Field('value', 'string', required=True),
            migrate=self.isInit
        )
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        for key, value in ITM_DB.DEFAULT_OPTIONS.items():
            self.SetOption(key, value, overwrite=self.isInit)

    def SetOption(self, key, value, overwrite=True):
        """Insert ``key`` or, when it exists and ``overwrite`` is true, update it.

        The connection is committed in every case.
        """
        where = self.db.options.key == key
        if self.db(where).count() > 0:
            if overwrite:
                self.db(where).update(value=value)
        else:
            self.db.options.insert(key=key, value=value)
        self.db.commit()

    def GetOption(self, key):
        """Return the stored value for ``key``, or None when no row matches."""
        where = self.db.options.key == key
        keyValue = self.db(where).select(self.db.options.value).first()
        return keyValue.value if keyValue else None

    ################################################################################

    def Reset(self):
        """Do nothing; this database has no tables to clear."""
        pass
class ITM_DB():
    """Context-managed handle on the ``itm.db`` SQLite database.

    Entering the context defines the ``options`` key/value table, seeds it
    with ``DEFAULT_OPTIONS`` when it is empty and optionally calls ``Reset``;
    leaving the context commits the connection.
    """

    FILENAME = 'itm.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)
    DEFAULT_OPTIONS = {}

    def __init__(self, path=None, isInit=False, isReset=False):
        """Open the DAL connection.

        path    -- forwarded to DAL as ``folder``; omitted when None.
        isInit  -- forwarded to DAL as ``migrate=isInit`` and
                   ``lazy_tables=not isInit``.
        isReset -- when true, ``Reset`` is called on entering the context.
        """
        self.isInit = isInit
        self.isReset = isReset
        dal_args = {'lazy_tables': not self.isInit, 'migrate': self.isInit}
        if path is not None:
            dal_args['folder'] = path
        self.db = DAL(ITM_DB.CONNECTION, **dal_args)

    def __enter__(self):
        """Define the options table, reset if requested, and return this wrapper."""
        self.DefineOptionsTable()
        if self.isReset:
            self.Reset()
        return self

    def __exit__(self, type, value, traceback):
        """Commit the connection; the exception arguments are not inspected."""
        self.db.commit()

    ################################################################################

    def DefineOptionsTable(self):
        """Define the ``options`` table and seed the defaults into an empty table."""
        self.db.define_table(
            'options',
            Field('key', 'string', required=True, unique=True),
            Field('value', 'string', required=True),
            migrate=self.isInit
        )
        if self.db(self.db.options).count() == 0:
            # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
            for key, value in ITM_DB.DEFAULT_OPTIONS.items():
                self.db.options.insert(key=key, value=value)

    def SetOption(self, key, value):
        """Update the row stored under ``key``, or insert one when none exists."""
        keyValue = self.db(self.db.options.key == key).select().first()
        if keyValue:
            keyValue.update_record(value=value)
        else:
            self.db.options.insert(key=key, value=value)

    def GetOption(self, key):
        """Return the stored value for ``key``, or None when no row matches."""
        keyValue = self.db(self.db.options.key == key).select(self.db.options.value).first()
        return keyValue.value if keyValue else None

    def Reset(self):
        """Do nothing; this database has no tables to clear."""
        pass
class TestModuleSetup(unittest.TestCase):
    """Test fixture that opens a DAL connection to the local ``dev`` PostgreSQL database."""

    def setUp(self):
        """Connect to the database and load the table definitions from ``../databases``."""
        db_username_postgres = 'postgres'
        db_password_postgres = '1234'
        db_postgres_url = 'postgres://{}:{}@localhost/dev'.format(
            db_username_postgres, db_password_postgres)
        path_to_database = path.join(path.curdir, "../databases")
        self.db_test = DAL(db_postgres_url, folder=path_to_database)
        # Registered right after connecting so the connection is released even
        # when loading the table definitions fails; without this every test
        # leaves one open connection behind.
        self.addCleanup(self.db_test.close)
        self.db_test.import_table_definitions(path_to_database)

    def limpa_dados_tabela(self, nome_tabela):
        """Delete every row of the table ``nome_tabela`` and commit.

        The table name is concatenated straight into the SQL statement, so it
        must come from the test code itself, never from external input.
        """
        self.db_test.executesql('delete from ' + nome_tabela)
        self.db_test.commit()
class LDA_DB():
    """Context-managed handle on the ``lda.db`` SQLite database.

    Entering the context defines the options, dimension (terms/docs/topics),
    matrix and stats tables; leaving the context commits the connection.
    """

    FILENAME = 'lda.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)
    DEFAULT_OPTIONS = {
        'max_co_topic_count': 10000    # Number of topic pairs to store
    }

    def __init__(self, path=None, isInit=False):
        """Open the DAL connection.

        path   -- forwarded to DAL as ``folder``; omitted when None.
        isInit -- forwarded to DAL as ``migrate=isInit`` and
                  ``lazy_tables=not isInit``; also enables index creation and
                  lets the defaults overwrite stored options.
        """
        self.isInit = isInit
        dal_args = {'lazy_tables': not self.isInit, 'migrate': self.isInit}
        if path is not None:
            dal_args['folder'] = path
        self.db = DAL(LDA_DB.CONNECTION, **dal_args)

    def __enter__(self):
        """Define all tables and return this wrapper."""
        self.DefineOptionsTable()
        self.DefineDimensionTables()
        self.DefineMatrixTables()
        self.DefineStatsTables()
        return self

    def __exit__(self, type, value, traceback):
        """Commit the connection; the exception arguments are not inspected."""
        self.db.commit()

    def _ExecuteOnInit(self, statements):
        """Run each SQL statement in order, but only when ``isInit`` is set."""
        if self.isInit:
            for statement in statements:
                self.db.executesql(statement)

    ################################################################################

    def DefineOptionsTable(self):
        """Define the ``options`` table and store every default option.

        A default only replaces an existing value when ``isInit`` is set.
        """
        self.db.define_table(
            'options',
            Field('key', 'string', required=True, unique=True),
            Field('value', 'string', required=True),
            migrate=self.isInit
        )
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        for key, value in LDA_DB.DEFAULT_OPTIONS.items():
            self.SetOption(key, value, overwrite=self.isInit)

    def SetOption(self, key, value, overwrite=True):
        """Insert ``key`` or, when it exists and ``overwrite`` is true, update it.

        The connection is committed in every case.
        """
        where = self.db.options.key == key
        if self.db(where).count() > 0:
            if overwrite:
                self.db(where).update(value=value)
        else:
            self.db.options.insert(key=key, value=value)
        self.db.commit()

    def GetOption(self, key):
        """Return the stored value for ``key``, or None when no row matches."""
        where = self.db.options.key == key
        keyValue = self.db(where).select(self.db.options.value).first()
        return keyValue.value if keyValue else None

    ################################################################################

    def DefineDimensionTables(self):
        """Define the ``terms``, ``docs`` and ``topics`` tables and their indexes."""
        self.db.define_table(
            'terms',
            Field('term_index', 'integer', required=True, unique=True, default=-1),
            Field('term_text', 'string', required=True, unique=True),
            Field('term_freq', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS terms_index ON terms (term_index);',
            'CREATE UNIQUE INDEX IF NOT EXISTS terms_text ON terms (term_text);',
            'CREATE INDEX IF NOT EXISTS terms_freq ON terms (term_freq);',
            'CREATE INDEX IF NOT EXISTS terms_rank ON terms (rank);',
        ])

        self.db.define_table(
            'docs',
            Field('doc_index', 'integer', required=True, unique=True, default=-1),
            Field('doc_id', 'string', required=True, unique=True),
            Field('doc_freq', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS docs_index ON docs (doc_index);',
            'CREATE UNIQUE INDEX IF NOT EXISTS docs_id ON docs (doc_id);',
            'CREATE INDEX IF NOT EXISTS docs_freq ON docs (doc_freq);',
            'CREATE INDEX IF NOT EXISTS docs_rank ON docs (rank);',
        ])

        self.db.define_table(
            'topics',
            Field('topic_index', 'integer', required=True, unique=True, default=-1),
            Field('topic_freq', 'double', required=True),
            Field('topic_label', 'string', required=True),
            Field('topic_desc', 'string', required=True),
            Field('top_terms', 'list:integer', required=True),
            Field('top_docs', 'list:integer', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS topics_index ON topics (topic_index);',
            'CREATE INDEX IF NOT EXISTS topics_freq ON topics (topic_freq);',
            'CREATE INDEX IF NOT EXISTS topics_rank ON topics (rank);',
        ])

    def DefineMatrixTables(self):
        """Define the ``term_topic_matrix`` and ``doc_topic_matrix`` tables and their indexes."""
        self.db.define_table(
            'term_topic_matrix',
            Field('term_index', 'integer', required=True, default=-1),
            Field('topic_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS term_topic_indexes ON term_topic_matrix (term_index, topic_index);',
            'CREATE INDEX IF NOT EXISTS term_topic_value ON term_topic_matrix (value);',
            'CREATE INDEX IF NOT EXISTS term_topic_rank ON term_topic_matrix (rank);',
            'CREATE INDEX IF NOT EXISTS term_topic_termindex ON term_topic_matrix (term_index);',
            'CREATE INDEX IF NOT EXISTS term_topic_topicindex ON term_topic_matrix (topic_index);',
        ])

        self.db.define_table(
            'doc_topic_matrix',
            Field('doc_index', 'integer', required=True, default=-1),
            Field('topic_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS doc_topic_indexes ON doc_topic_matrix (doc_index, topic_index);',
            'CREATE INDEX IF NOT EXISTS doc_topic_value ON doc_topic_matrix (value);',
            'CREATE INDEX IF NOT EXISTS doc_topic_rank ON doc_topic_matrix (rank);',
            'CREATE INDEX IF NOT EXISTS doc_topic_docindex ON doc_topic_matrix (doc_index);',
            'CREATE INDEX IF NOT EXISTS doc_topic_topicindex ON doc_topic_matrix (topic_index);',
        ])

    def DefineStatsTables(self):
        """Define the ``topic_covariance`` table and its indexes."""
        self.db.define_table(
            'topic_covariance',
            Field('first_topic_index', 'integer', required=True, default=-1),
            Field('second_topic_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS topic_covariance_indexes ON topic_covariance (first_topic_index, second_topic_index);',
            'CREATE INDEX IF NOT EXISTS topic_covariance_value ON topic_covariance (value);',
            'CREATE INDEX IF NOT EXISTS topic_covariance_rank ON topic_covariance (rank);',
        ])

    ################################################################################

    def Reset(self):
        """Delete every row from the dimension, matrix and stats tables (options are kept)."""
        for table in ('terms', 'docs', 'topics', 'term_topic_matrix',
                      'doc_topic_matrix', 'topic_covariance'):
            self.db.executesql('DELETE FROM {};'.format(table))
class Corpus_DB():
    """Context-managed handle on the ``corpus.db`` SQLite database.

    Entering the context defines the options, models, corpus and metadata
    tables. Leaving it rebuilds the full-text search table (only when
    ``isInit`` is set) and commits the connection.
    """

    FILENAME = 'corpus.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)
    # Lower-cased header names recognised as the document id / content columns.
    DOC_IDS = ['doc_id', 'docid']
    DOC_CONTENTS = ['doc_content', 'doccontent', 'docbody', 'doc_body']
    MODEL_KEY = 'corpus'
    MODEL_DESC = 'Text Corpus'
    MODEL_ENTRY = {'model_key': MODEL_KEY, 'model_desc': MODEL_DESC}
    LINEBREAKS_TABS = re.compile(r'[\t\r\n\f]')
    DEFAULT_OPTIONS = {
        'token_regex': r'\w{3,}',    # Tokenize a corpus into a bag-of-words language model
        'min_freq': 5,               # Number of times a term must appear in the corpus
        'min_doc_freq': 3            # Number of documents in which a term must appear
    }

    def __init__(self, path=None, isInit=False):
        """Open the DAL connection.

        path   -- forwarded to DAL as ``folder``; omitted when None.
        isInit -- forwarded to DAL as ``migrate_enabled=isInit`` and
                  ``lazy_tables=not isInit``; also enables index creation and
                  lets the defaults overwrite stored options.
        """
        self.isInit = isInit
        dal_args = {'lazy_tables': not self.isInit, 'migrate_enabled': self.isInit}
        if path is not None:
            dal_args['folder'] = path
        self.db = DAL(Corpus_DB.CONNECTION, **dal_args)

    def __enter__(self):
        """Define all tables and return this wrapper."""
        self.DefineOptionsTable()
        self.DefineModelsTable()
        self.DefineCorpusTable()
        self.DefineMetadataTables()
        return self

    def __exit__(self, type, value, traceback):
        """Rebuild the text-search table and commit; exception arguments are not inspected."""
        self.DefineCorpusTextSearch()
        self.db.commit()

    ################################################################################

    def DefineOptionsTable(self):
        """Define the ``options`` table and store every default option.

        A default only replaces an existing value when ``isInit`` is set.
        """
        self.db.define_table(
            'options',
            Field('key', 'string', required=True, unique=True),
            Field('value', 'string', required=True),
            migrate=self.isInit
        )
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        for key, value in Corpus_DB.DEFAULT_OPTIONS.items():
            self.SetOption(key, value, overwrite=self.isInit)

    def SetOption(self, key, value, overwrite=True):
        """Insert ``key`` or, when it exists and ``overwrite`` is true, update it.

        The connection is committed in every case.
        """
        where = self.db.options.key == key
        if self.db(where).count() > 0:
            if overwrite:
                self.db(where).update(value=value)
        else:
            self.db.options.insert(key=key, value=value)
        self.db.commit()

    def GetOption(self, key):
        """Return the stored value for ``key``, or None when no row matches."""
        where = self.db.options.key == key
        keyValue = self.db(where).select(self.db.options.value).first()
        return keyValue.value if keyValue else None

    def DefineModelsTable(self):
        """Define the ``models`` table (model key and description)."""
        self.db.define_table(
            'models',
            Field('model_key', 'string', required=True, unique=True),
            Field('model_desc', 'string', required=True),
            migrate=self.isInit
        )

    def AddModel(self, model_key, model_desc):
        """Insert the model, or update its description when the key exists; then commit."""
        where = self.db.models.model_key == model_key
        if self.db(where).count() > 0:
            self.db(where).update(model_desc=model_desc)
        else:
            self.db.models.insert(model_key=model_key, model_desc=model_desc)
        self.db.commit()

    def GetModel(self, model_key):
        """Return ``{'model_key', 'model_desc'}`` for the key, or None when unknown.

        The built-in corpus key is answered from ``MODEL_ENTRY`` without
        querying the database.
        """
        if model_key == Corpus_DB.MODEL_KEY:
            return Corpus_DB.MODEL_ENTRY
        where = self.db.models.model_key == model_key
        keyValue = self.db(where).select(self.db.models.ALL).first()
        if not keyValue:
            return None
        return {'model_key': keyValue.model_key, 'model_desc': keyValue.model_desc}

    def GetModels(self):
        """Return ``MODEL_ENTRY`` followed by every row of the ``models`` table."""
        rows = self.db(self.db.models).select(
            self.db.models.model_key, self.db.models.model_desc).as_list()
        return [Corpus_DB.MODEL_ENTRY] + rows

    ################################################################################

    def DefineCorpusTable(self):
        """Define the ``corpus`` table and, when ``isInit`` is set, its unique indexes."""
        self.db.define_table(
            'corpus',
            Field('doc_index', 'integer', required=True, unique=True, default=-1),
            Field('doc_id', 'string', required=True, unique=True),
            Field('doc_content', 'text', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_index ON corpus (doc_index);')
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_id ON corpus (doc_id);')

    def DefineMetadataTables(self):
        """Define the ``fields`` and ``metadata`` tables and, when ``isInit`` is set, their indexes."""
        self.db.define_table(
            'fields',
            Field('field_index', 'integer', required=True, unique=True, default=-1),
            Field('field_name', 'string', required=True, unique=True),
            Field('field_type', 'string', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS field_index ON fields (field_index);')
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS field_name ON fields (field_name);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS field_type ON fields (field_type);')

        self.db.define_table(
            'metadata',
            Field('doc_index', 'integer', required=True, default=-1),
            Field('field_index', 'integer', required=True, default=-1),
            Field('value', 'text', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS metadata_indexes ON metadata (doc_index, field_index);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS metadata_doc_index ON metadata (doc_index);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS metadata_field_index ON metadata (field_index);')

    ################################################################################

    def DefineCorpusTextSearch(self):
        """When ``isInit`` is set, drop and rebuild the ``corpus_search`` fts3 table from ``corpus``."""
        if self.isInit:
            self.db.executesql('DROP TABLE IF EXISTS corpus_search;')
            self.db.executesql('CREATE VIRTUAL TABLE corpus_search USING fts3 (doc_content TEXT);')
            self.db.executesql('INSERT INTO corpus_search (rowid, doc_content) SELECT doc_index, doc_content FROM corpus;')

    ################################################################################

    def SanitizeText(self, text):
        """Replace each tab, CR, LF and form feed with a space and strip both ends."""
        return Corpus_DB.LINEBREAKS_TABS.sub(u' ', text).strip()

    # NOTE(review): the import/export helpers below call .decode() on data read
    # from text-mode files and on database values, which presumes Python 2
    # byte strings -- confirm before running under Python 3.

    def ImportFromFile(self, filename):
        """
        filename = A plain-text file (utf-8 encoded) containing one document per line

        A line holding a tab is read as ``doc_id<TAB>doc_content`` (anything
        after a second tab is dropped); a line without a tab is the content
        and receives the generated id ``doc<N>`` (1-based).
        """
        def ReadFile():
            with open(filename, 'r') as f:
                for index, line in enumerate(f):
                    doc_index = index
                    values = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
                    if len(values) == 1:
                        doc_id = 'doc{}'.format(doc_index+1)
                        doc_content = values[0]
                    else:
                        doc_id = values[0]
                        doc_content = values[1]
                    yield {
                        'doc_index': doc_index,
                        'doc_id': doc_id,
                        'doc_content': doc_content.encode('utf-8', 'ignore')
                    }
        self.db.corpus.bulk_insert(ReadFile())

    def ImportFromFolder(self, glob_pattern):
        """
        glob_pattern = A folder of files or a glob pattern for list of files (utf-8 encoded, one document per file)

        Files are processed in sorted filename order; the filename is the doc id.
        """
        def ReadFolder():
            filenames = sorted(glob.glob(glob_pattern))
            for index, filename in enumerate(filenames):
                doc_index = index
                doc_id = filename
                with open(filename, 'r') as f:
                    doc_content = f.read().decode('utf-8', 'ignore')
                yield {
                    'doc_index': doc_index,
                    'doc_id': doc_id,
                    'doc_content': doc_content.encode('utf-8', 'ignore')
                }
        self.db.corpus.bulk_insert(ReadFolder())

    def ImportFromSpreadsheet(self, filename, id_key = None, content_key = None, is_csv = False):
        """
        filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with a header row containing column names)
        id_key = Name of the column containing unique document IDs
        content_key = Name of the column containing the document contents

        Every other column is stored as metadata; its type starts as
        'integer' and is widened to 'double' and then 'string' as soon as a
        value fails to parse.
        """
        doc_id_keys = frozenset([id_key] if id_key is not None else Corpus_DB.DOC_IDS)
        doc_content_keys = frozenset([content_key] if content_key is not None else Corpus_DB.DOC_CONTENTS)
        # Filled as a side effect while ReadSpreadsheet is consumed by the
        # corpus bulk_insert, so they must be inserted after the corpus rows.
        field_indexes = []
        field_names = []
        field_types = []
        metadata = []

        def ReadCSV():
            with open(filename, 'r') as f:
                reader = UnicodeReader(f)
                for row in reader:
                    yield row

        def ReadTSV():
            with open(filename, 'r') as f:
                for line in f:
                    yield line.decode('utf-8', 'ignore').rstrip('\n').split('\t')

        def ReadSpreadsheet(reader):
            field_doc_id = None
            field_doc_content = None
            for row_index, values in enumerate(reader):
                if row_index == 0:
                    # Header row: locate the id/content columns, register the rest as fields.
                    for index, field in enumerate(values):
                        if field.lower() in doc_id_keys:
                            field_doc_id = index
                        elif field.lower() in doc_content_keys:
                            field_doc_content = index
                        else:
                            field_index = len(field_indexes)
                            field_indexes.append(field_index)
                            field_names.append(field)
                            field_types.append('integer')
                else:
                    doc_index = row_index - 1
                    doc_id = 'doc{:d}'.format(doc_index+1)
                    doc_content = ''
                    field_index = 0
                    for index, value in enumerate(values):
                        if field_doc_id == index:
                            doc_id = value
                        elif field_doc_content == index:
                            doc_content = value
                        else:
                            metadata.append({
                                'doc_index': doc_index,
                                'field_index': field_index,
                                'value': value.encode('utf-8', 'ignore')
                            })

                            # [START] infer field type
                            field_type = field_types[field_index]
                            if field_type == 'integer':
                                try:
                                    int(value)
                                except ValueError:
                                    field_type = 'double'
                            if field_type == 'double':
                                try:
                                    float(value)
                                except ValueError:
                                    field_type = 'string'
                            field_types[field_index] = field_type
                            # [END] infer field type

                            field_index += 1
                    yield {
                        'doc_index': doc_index,
                        'doc_id': doc_id,
                        'doc_content': doc_content.encode('utf-8', 'ignore')
                    }

        def GetFields():
            for field_index in field_indexes:
                yield {
                    'field_index': field_index,
                    'field_name': field_names[field_index],
                    'field_type': field_types[field_index]
                }

        reader = ReadCSV() if is_csv else ReadTSV()
        self.db.corpus.bulk_insert(ReadSpreadsheet(reader))
        self.db.fields.bulk_insert(GetFields())
        self.db.metadata.bulk_insert(metadata)

    def ExportToFile(self, filename):
        """
        filename = A tab-delimited file (utf-8 encoded, without header) containing docIDs and document contents
        """
        def WriteFile(rows):
            with open(filename, 'w') as f:
                for row in rows:
                    doc_id = row.doc_id
                    doc_content = self.SanitizeText(row.doc_content.decode('utf-8'))
                    f.write(u'{}\t{}\n'.format(doc_id, doc_content).encode('utf-8', 'ignore'))

        rows = self.db().select(self.db.corpus.doc_id, self.db.corpus.doc_content, orderby = self.db.corpus.doc_index)
        WriteFile(rows)

    def ExportToSpreadsheet(self, filename, is_csv = False):
        """
        filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with header) containing the text corpus and all metadata
        """
        field_names = [ row.field_name for row in self.db().select(self.db.fields.field_name, orderby = self.db.fields.field_index) ]
        field_count = len(field_names)
        all_field_names = [ 'doc_id', 'doc_content' ] + field_names

        def IterRecords(rows):
            # Shared by both writers: one output record per corpus row, with
            # the sanitized metadata values placed at their field_index.
            for row in rows:
                doc_content = self.SanitizeText(row.doc_content.decode('utf-8'))
                values = [u''] * field_count
                where = self.db.metadata.doc_index == row.doc_index
                for d in self.db(where).select(self.db.metadata.field_index, self.db.metadata.value):
                    values[d.field_index] = self.SanitizeText(d.value.decode('utf-8'))
                yield [ row.doc_id, doc_content ] + values

        def WriteCSV(rows):
            with open(filename, 'w') as f:
                writer = UnicodeWriter(f)
                writer.writerow(all_field_names)
                for all_values in IterRecords(rows):
                    writer.writerow(all_values)

        def WriteTSV(rows):
            with open(filename, 'w') as f:
                f.write(u'{}\n'.format(u'\t'.join(all_field_names)).encode('utf-8', 'ignore'))
                for all_values in IterRecords(rows):
                    f.write(u'{}\n'.format(u'\t'.join(all_values)).encode('utf-8', 'ignore'))

        rows = self.db().select(self.db.corpus.doc_index, self.db.corpus.doc_id, self.db.corpus.doc_content, orderby = self.db.corpus.doc_index)
        if is_csv:
            WriteCSV(rows)
        else:
            WriteTSV(rows)
class MultipleLDA_DB():
    """Context-managed handle on the ``multiple_lda.db`` SQLite database.

    Entering the context defines the options, dimension, matrix and pairwise
    topic-statistics tables; leaving the context commits the connection.
    Topic-related rows carry an extra ``entry_index`` column.
    """

    FILENAME = 'multiple_lda.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)
    DEFAULT_OPTIONS = {
        'max_co_topic_count': 10000    # Number of topic pairs to store
    }

    # Pairwise topic-statistics tables; all three share one schema and one
    # set of indexes, generated from the templates below.
    _PAIRWISE_TABLES = ('topic_cossim', 'topic_kldiv', 'topic_rdp')
    _PAIRWISE_INDEX_TEMPLATES = (
        'CREATE UNIQUE INDEX IF NOT EXISTS {0}_indexes ON {0} (first_entry_index, first_topic_index, second_entry_index, second_topic_index);',
        'CREATE INDEX IF NOT EXISTS {0}_value ON {0} (value);',
        'CREATE INDEX IF NOT EXISTS {0}_rank ON {0} (rank);',
        'CREATE INDEX IF NOT EXISTS {0}_valueFirst ON {0} (first_entry_index, first_topic_index, value);',
        'CREATE INDEX IF NOT EXISTS {0}_rankFirst ON {0} (first_entry_index, first_topic_index, rank);',
        'CREATE INDEX IF NOT EXISTS {0}_valueSecond ON {0} (second_entry_index, second_topic_index, value);',
        'CREATE INDEX IF NOT EXISTS {0}_rankSecond ON {0} (second_entry_index, second_topic_index, rank);',
    )

    def __init__(self, path=None, isInit=False):
        """Open the DAL connection.

        path   -- forwarded to DAL as ``folder``; omitted when None.
        isInit -- forwarded to DAL as ``migrate=isInit`` and
                  ``lazy_tables=not isInit``; also enables index creation and
                  the seeding of the default options.
        """
        self.isInit = isInit
        dal_args = {'lazy_tables': not self.isInit, 'migrate': self.isInit}
        if path is not None:
            dal_args['folder'] = path
        self.db = DAL(MultipleLDA_DB.CONNECTION, **dal_args)

    def __enter__(self):
        """Define all tables and return this wrapper."""
        self.DefineOptionsTable()
        self.DefineDimensionTables()
        self.DefineMatrixTables()
        self.DefineStatsTables()
        return self

    def __exit__(self, type, value, traceback):
        """Commit the connection; the exception arguments are not inspected."""
        self.db.commit()

    def _ExecuteOnInit(self, statements):
        """Run each SQL statement in order, but only when ``isInit`` is set."""
        if self.isInit:
            for statement in statements:
                self.db.executesql(statement)

    ################################################################################

    def DefineOptionsTable(self):
        """Define the ``options`` table and, when ``isInit`` is set, store the defaults."""
        self.db.define_table(
            'options',
            Field('key', 'string', required=True, unique=True),
            Field('value', 'string', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            # Use this class's own defaults (previously read from LDA_DB, a
            # different class); items() works on both Python 2 and 3.
            for key, value in MultipleLDA_DB.DEFAULT_OPTIONS.items():
                self.SetOption(key, value)

    def SetOption(self, key, value):
        """Update the value stored under ``key``, or insert it when missing."""
        where = self.db.options.key == key
        if self.db(where).count() > 0:
            self.db(where).update(value=value)
        else:
            self.db.options.insert(key=key, value=value)

    def GetOption(self, key):
        """Return the stored value for ``key``, or None when no row matches."""
        where = self.db.options.key == key
        keyValue = self.db(where).select(self.db.options.value).first()
        return keyValue.value if keyValue else None

    ################################################################################

    def DefineDimensionTables(self):
        """Define the ``terms``, ``docs`` and ``topics`` tables and their indexes."""
        self.db.define_table(
            'terms',
            Field('term_index', 'integer', required=True, unique=True, default=-1),
            Field('term_text', 'string', required=True, unique=True),
            Field('term_freq', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS terms_index ON terms (term_index);',
            'CREATE UNIQUE INDEX IF NOT EXISTS terms_text ON terms (term_text);',
            'CREATE INDEX IF NOT EXISTS terms_freq ON terms (term_freq);',
            'CREATE INDEX IF NOT EXISTS terms_rank ON terms (rank);',
        ])

        self.db.define_table(
            'docs',
            Field('doc_index', 'integer', required=True, unique=True, default=-1),
            Field('doc_id', 'string', required=True, unique=True),
            Field('doc_freq', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS docs_index ON docs (doc_index);',
            'CREATE UNIQUE INDEX IF NOT EXISTS docs_id ON docs (doc_id);',
            'CREATE INDEX IF NOT EXISTS docs_freq ON docs (doc_freq);',
            'CREATE INDEX IF NOT EXISTS docs_rank ON docs (rank);',
        ])

        self.db.define_table(
            'topics',
            Field('entry_index', 'integer', required=True, default=-1),
            Field('topic_index', 'integer', required=True, default=-1),
            Field('topic_freq', 'double', required=True),
            Field('topic_label', 'string', required=True),
            Field('topic_desc', 'string', required=True),
            Field('top_terms', 'list:integer', required=True),
            Field('top_docs', 'list:integer', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS topics_indexes ON topics (entry_index, topic_index);',
            'CREATE INDEX IF NOT EXISTS topics_freq ON topics (topic_freq);',
            'CREATE INDEX IF NOT EXISTS topics_rank ON topics (rank);',
            'CREATE INDEX IF NOT EXISTS topics_freqEntry ON topics (entry_index, topic_freq);',
            'CREATE INDEX IF NOT EXISTS topics_rankEntry ON topics (entry_index, rank);',
        ])

    def DefineMatrixTables(self):
        """Define the ``term_topic_matrix`` and ``doc_topic_matrix`` tables and their indexes."""
        self.db.define_table(
            'term_topic_matrix',
            Field('entry_index', 'integer', required=True, default=-1),
            Field('term_index', 'integer', required=True, default=-1),
            Field('topic_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS term_topic_indexes ON term_topic_matrix (entry_index, term_index, topic_index);',
            'CREATE INDEX IF NOT EXISTS term_topic_value ON term_topic_matrix (value);',
            'CREATE INDEX IF NOT EXISTS term_topic_rank ON term_topic_matrix (rank);',
            'CREATE INDEX IF NOT EXISTS term_topic_termindex ON term_topic_matrix (term_index);',
            'CREATE INDEX IF NOT EXISTS term_topic_topicindex ON term_topic_matrix (topic_index);',
            'CREATE INDEX IF NOT EXISTS term_topic_valueEntry ON term_topic_matrix (entry_index, value);',
            'CREATE INDEX IF NOT EXISTS term_topic_rankEntry ON term_topic_matrix (entry_index, rank);',
            'CREATE INDEX IF NOT EXISTS term_topic_termindexEntry ON term_topic_matrix (entry_index, term_index);',
            'CREATE INDEX IF NOT EXISTS term_topic_topicindexEntry ON term_topic_matrix (entry_index, topic_index);',
        ])

        self.db.define_table(
            'doc_topic_matrix',
            Field('entry_index', 'integer', required=True, default=-1),
            Field('doc_index', 'integer', required=True, default=-1),
            Field('topic_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS doc_topic_indexes ON doc_topic_matrix (entry_index, doc_index, topic_index);',
            'CREATE INDEX IF NOT EXISTS doc_topic_value ON doc_topic_matrix (value);',
            'CREATE INDEX IF NOT EXISTS doc_topic_rank ON doc_topic_matrix (rank);',
            'CREATE INDEX IF NOT EXISTS doc_topic_docindex ON doc_topic_matrix (doc_index);',
            'CREATE INDEX IF NOT EXISTS doc_topic_topicindex ON doc_topic_matrix (topic_index);',
            'CREATE INDEX IF NOT EXISTS doc_topic_valueEntry ON doc_topic_matrix (entry_index, value);',
            'CREATE INDEX IF NOT EXISTS doc_topic_rankEntry ON doc_topic_matrix (entry_index, rank);',
            'CREATE INDEX IF NOT EXISTS doc_topic_docindexEntry ON doc_topic_matrix (entry_index, doc_index);',
            'CREATE INDEX IF NOT EXISTS doc_topic_topicindexEntry ON doc_topic_matrix (entry_index, topic_index);',
        ])

    def DefineStatsTables(self):
        """Define the ``topic_cossim``, ``topic_kldiv`` and ``topic_rdp`` tables and their indexes."""
        for table in MultipleLDA_DB._PAIRWISE_TABLES:
            self.db.define_table(
                table,
                Field('first_entry_index', 'integer', required=True, default=-1),
                Field('first_topic_index', 'integer', required=True, default=-1),
                Field('second_entry_index', 'integer', required=True, default=-1),
                Field('second_topic_index', 'integer', required=True, default=-1),
                Field('value', 'double', required=True),
                Field('rank', 'integer', required=True),
                migrate=self.isInit
            )
            self._ExecuteOnInit(
                [template.format(table) for template in MultipleLDA_DB._PAIRWISE_INDEX_TEMPLATES])

    ################################################################################

    def Reset(self):
        """Delete every row from the dimension, matrix and stats tables (options are kept)."""
        for table in ('terms', 'docs', 'topics', 'term_topic_matrix', 'doc_topic_matrix',
                      'topic_cossim', 'topic_kldiv', 'topic_rdp'):
            self.db.executesql('DELETE FROM {};'.format(table))
if create_tbl: createTable(table) engine_id["sqlite"] = db.engine.insert(ename="sqlite", description="SQLite", connection="sqlite://database.db") engine_id["mysql"] = db.engine.insert( ename="mysql", description="MySQL", connection="mysql://*****:*****@localhost/database" ) engine_id["postgres"] = db.engine.insert( ename="postgres", description="PostgreSQL", connection="postgres://*****:*****@localhost/database" ) engine_id["mssql"] = db.engine.insert( ename="mssql", description="MSSQL", connection="mssql2://username:password@localhost/database" ) engine_id["google_sql"] = db.engine.insert( ename="google_sql", description="Google SQL", connection="google:sql://project:instance/database" ) db.commit() table = db.define_table( "databases", Field("id", readable=False, writable=False), Field("alias", type="string", length=150, label=T("Alias"), notnull=True, required=True, unique=True), Field( "engine_id", db.engine, label=T("Engine"), notnull=True, required=True, requires=IS_IN_DB(db, db.engine.id, "%(description)s"), ), Field("host", type="string", length=150, label=T("InstanceID / Host")), Field("port", type="string", length=150, label=T("Port")),
class Corpus_DB():
    """
    Access layer for the SQLite corpus database (``corpus.db``).

    Meant to be used as a context manager: entering defines every table
    (clearing the derived statistics tables first when ``isReset`` is set);
    exiting rebuilds the full-text search table when importing and commits.
    """

    FILENAME = 'corpus.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)

    # Lower-cased spreadsheet headers recognised as the ID / content columns
    DOC_IDS = ['doc_id', 'docid']
    DOC_CONTENTS = ['doc_content', 'doccontent', 'docbody', 'doc_body']

    # Written to the options table whenever the database is initialised
    DEFAULT_OPTIONS = {
        'token_regex': r'\w{3,}',
        'min_freq': 5,
        'min_doc_freq': 3,
        'max_freq_count': 4000,
        'max_co_freq_count': 160000,
        'max_g2_count': 160000
    }

    def __init__(self, path=None, isInit=False, isImport=False, isReset=False):
        """
        path     = Folder handed to DAL as ``folder`` (omitted when None)
        isInit   = Migrate the statistics tables and store the default options
        isImport = Migrate the options, models, corpus and metadata tables
        isReset  = Empty the statistics tables when the context is entered
        """
        self.isInit = isInit
        self.isImport = isImport
        self.isReset = isReset
        canMigrate = self.isInit or self.isImport
        dalArgs = {
            'lazy_tables': not canMigrate,
            'migrate_enabled': canMigrate
        }
        if path is not None:
            dalArgs['folder'] = path
        self.db = DAL(Corpus_DB.CONNECTION, **dalArgs)

    def __enter__(self):
        """Define all tables (resetting statistics if requested); return self."""
        baseTables = (
            self.DefineOptionsTable,
            self.DefineModelsTable,
            self.DefineCorpusTable,
            self.DefineMetadataTables,
        )
        for define in baseTables:
            define()
        if self.isReset:
            self.Reset()
        derivedTables = (
            self.DefineTermStatsTables,
            self.DefineTermCoStatsTables,
            self.DefineSentenceCoStatsTables,
            self.DefineTemporaryTable,
        )
        for define in derivedTables:
            define()
        return self

    def __exit__(self, type, value, traceback):
        """Refresh the text-search table (import mode only) and commit."""
        self.DefineCorpusTextSearch()
        self.db.commit()

    def _ExecuteAll(self, statements):
        """Run each raw SQL statement, in the order given."""
        for statement in statements:
            self.db.executesql(statement)

################################################################################

    def DefineOptionsTable(self):
        """Define the key/value options table; on init, store the defaults."""
        self.db.define_table(
            'options',
            Field('key', 'string', required=True, unique=True),
            Field('value', 'string', required=True),
            migrate=self.isImport)
        if not self.isInit:
            return
        for optionKey, optionValue in Corpus_DB.DEFAULT_OPTIONS.iteritems():
            self.SetOption(optionKey, optionValue)

    def SetOption(self, key, value):
        """Update the option named ``key`` if it exists, otherwise insert it."""
        options = self.db.options
        existing = self.db(options.key == key).select().first()
        if not existing:
            options.insert(key=key, value=value)
        else:
            existing.update_record(value=value)

    def GetOption(self, key):
        """Return the stored value for ``key``, or None when it is absent."""
        options = self.db.options
        found = self.db(options.key == key).select(options.value).first()
        return found.value if found else None

    def DefineModelsTable(self):
        """Define the table listing models by key and description."""
        self.db.define_table(
            'models',
            Field('model_key', 'string', required=True, unique=True),
            Field('model_desc', 'string', required=True),
            migrate=self.isImport)

    def AddModel(self, model_key, model_desc):
        """Insert a model entry, or update its description if the key exists."""
        models = self.db.models
        existing = self.db(models.model_key == model_key).select().first()
        if not existing:
            models.insert(model_key=model_key, model_desc=model_desc)
        else:
            existing.update_record(model_desc=model_desc)

    def GetModels(self):
        """Return the list of all stored model keys."""
        keys = []
        for entry in self.db(self.db.models).select(self.db.models.model_key):
            keys.append(entry.model_key)
        return keys

    def GetModelDescription(self, model_key):
        """Return the description stored for ``model_key``, or None."""
        entry = self.db(self.db.models.model_key == model_key).select().first()
        if not entry:
            return None
        return entry.model_desc

################################################################################

    def DefineCorpusTable(self):
        """Define the corpus table; create its unique indexes when importing."""
        self.db.define_table(
            'corpus',
            Field('doc_index', 'integer', required=True, unique=True, default=-1),
            Field('doc_id', 'string', required=True, unique=True),
            Field('doc_content', 'text', required=True),
            migrate=self.isImport)
        if self.isImport:
            self._ExecuteAll((
                'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_index ON corpus (doc_index);',
                'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_id ON corpus (doc_id);',
            ))

    def DefineMetadataTables(self):
        """Define the ``fields`` and ``metadata`` tables plus their indexes."""
        self.db.define_table(
            'fields',
            Field('field_index', 'integer', required=True, unique=True, default=-1),
            Field('field_name', 'string', required=True, unique=True),
            Field('field_type', 'string', required=True),
            migrate=self.isImport)
        if self.isImport:
            self._ExecuteAll((
                'CREATE UNIQUE INDEX IF NOT EXISTS field_index ON fields (field_index);',
                'CREATE UNIQUE INDEX IF NOT EXISTS field_name ON fields (field_name);',
                'CREATE INDEX IF NOT EXISTS field_type ON fields (field_type);',
            ))

        self.db.define_table(
            'metadata',
            Field('doc_index', 'integer', required=True, default=-1),
            Field('field_index', 'integer', required=True, default=-1),
            Field('value', 'text', required=True),
            migrate=self.isImport)
        if self.isImport:
            self._ExecuteAll((
                'CREATE UNIQUE INDEX IF NOT EXISTS metadata_indexes ON metadata (doc_index, field_index);',
                'CREATE INDEX IF NOT EXISTS metadata_doc_index ON metadata (doc_index);',
                'CREATE INDEX IF NOT EXISTS metadata_field_index ON metadata (field_index);',
            ))

################################################################################

    def DefineCorpusTextSearch(self):
        """When importing, rebuild the fts3 search table from the corpus."""
        if not self.isImport:
            return
        self._ExecuteAll((
            'DROP TABLE IF EXISTS corpus_search;',
            'CREATE VIRTUAL TABLE corpus_search USING fts3 (doc_content TEXT);',
            'INSERT INTO corpus_search (rowid, doc_content) SELECT doc_index, doc_content FROM corpus;',
        ))

################################################################################

    def ImportFromFile(self, filename):
        """
        filename = A plain-text file (utf-8 encoded) containing one document per line

        A line without a tab is the document content and receives a generated
        ID; otherwise the first two tab-separated cells are the ID and content.
        """
        def ReadFile():
            with open(filename, 'r') as handle:
                for doc_index, raw_line in enumerate(handle):
                    cells = raw_line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
                    if len(cells) == 1:
                        doc_id, doc_content = 'doc{}'.format(doc_index + 1), cells[0]
                    else:
                        doc_id, doc_content = cells[0], cells[1]
                    yield {
                        'doc_index': doc_index,
                        'doc_id': doc_id,
                        'doc_content': doc_content.encode('utf-8')
                    }

        self.db.corpus.bulk_insert(ReadFile())

    def ImportFromFolder(self, glob_pattern):
        """
        glob_pattern = A folder of files or a glob pattern for list of files (utf-8 encoded, one document per file)

        Files are taken in sorted order and the file path becomes the document ID.
        """
        def ReadFolder():
            for doc_index, path in enumerate(sorted(glob.glob(glob_pattern))):
                with open(path, 'r') as handle:
                    text = handle.read().decode('utf-8', 'ignore')
                yield {
                    'doc_index': doc_index,
                    'doc_id': path,
                    'doc_content': text.encode('utf-8')
                }

        self.db.corpus.bulk_insert(ReadFolder())

    def ImportFromSpreadsheet(self, filename, id_key=None, content_key=None, is_csv=False):
        """
        filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with a header row containing column names)
        id_key = Name of the column containing unique document IDs
        content_key = Name of the column containing the document contents
        is_csv = Parse as comma-separated (via UnicodeReader) instead of tab-separated

        Every other column is stored as metadata; its type starts as 'integer'
        and is widened to 'double' and then 'string' as values fail to parse.
        """
        doc_id_keys = frozenset(Corpus_DB.DOC_IDS if id_key is None else [id_key])
        doc_content_keys = frozenset(Corpus_DB.DOC_CONTENTS if content_key is None else [content_key])
        field_indexes = []
        field_names = []
        field_types = []
        metadata = []

        def ReadCSV():
            with open(filename, 'r') as handle:
                for cells in UnicodeReader(handle):
                    yield cells

        def ReadTSV():
            with open(filename, 'r') as handle:
                for raw_line in handle:
                    yield raw_line.decode('utf-8', 'ignore').rstrip('\n').split('\t')

        def WidenType(field_type, text):
            # integer -> double -> string, stopping at the first type that parses
            if field_type == 'integer':
                try:
                    int(text)
                except ValueError:
                    field_type = 'double'
            if field_type == 'double':
                try:
                    float(text)
                except ValueError:
                    field_type = 'string'
            return field_type

        def ReadSpreadsheet(reader):
            rows = iter(reader)
            try:
                header = next(rows)
            except StopIteration:
                return

            # Header row: locate the ID/content columns, register the rest
            id_column = None
            content_column = None
            for column, name in enumerate(header):
                if name.lower() in doc_id_keys:
                    id_column = column
                elif name.lower() in doc_content_keys:
                    content_column = column
                else:
                    field_indexes.append(len(field_indexes))
                    field_names.append(name)
                    field_types.append('integer')

            # Data rows: one corpus entry each, plus one metadata entry per extra cell
            for doc_index, cells in enumerate(rows):
                doc_id = 'doc{:d}'.format(doc_index + 1)
                doc_content = ''
                field_index = 0
                for column, cell in enumerate(cells):
                    if id_column == column:
                        doc_id = cell
                    elif content_column == column:
                        doc_content = cell
                    else:
                        metadata.append({
                            'doc_index': doc_index,
                            'field_index': field_index,
                            'value': cell.encode('utf-8')
                        })
                        field_types[field_index] = WidenType(field_types[field_index], cell)
                        field_index += 1
                yield {
                    'doc_index': doc_index,
                    'doc_id': doc_id,
                    'doc_content': doc_content.encode('utf-8')
                }

        def GetFields():
            for index, name, kind in zip(field_indexes, field_names, field_types):
                yield {
                    'field_index': index,
                    'field_name': name,
                    'field_type': kind
                }

        if is_csv:
            reader = ReadCSV()
        else:
            reader = ReadTSV()
        self.db.corpus.bulk_insert(ReadSpreadsheet(reader))
        self.db.fields.bulk_insert(GetFields())
        self.db.metadata.bulk_insert(metadata)

    def ExportToFile(self, filename):
        """
        filename = A tab-delimited file (utf-8 encoded, without header) containing docIDs and document contents

        Runs of whitespace inside a document are collapsed to a single space.
        """
        corpus = self.db.corpus
        rows = self.db().select(corpus.doc_id, corpus.doc_content, orderby=corpus.doc_index)
        whitespace = re.compile(r'\s+')
        with open(filename, 'w') as out:
            for row in rows:
                flattened = whitespace.sub(u' ', row.doc_content.decode('utf-8'))
                out.write(u'{}\t{}\n'.format(row.doc_id, flattened).encode('utf-8'))

    def ExportToSpreadsheet(self, filename, is_csv=False):
        """
        filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with header) containing the text corpus and all metadata
        is_csv = Write comma-separated (via UnicodeWriter) instead of tab-separated
        """
        field_rows = self.db().select(self.db.fields.field_name, orderby=self.db.fields.field_index)
        field_names = [entry.field_name for entry in field_rows]
        field_count = len(field_names)
        all_field_names = ['doc_id', 'doc_content'] + field_names

        def RowValues(row, whitespace):
            # One output record: ID, flattened content, then metadata by field index
            doc_index = row.doc_index
            doc_id = row.doc_id
            doc_content = whitespace.sub(u' ', row.doc_content.decode('utf-8'))
            values = [u''] * field_count
            cells = self.db(self.db.metadata.doc_index == doc_index).select(
                self.db.metadata.field_index, self.db.metadata.value)
            for cell in cells:
                values[cell.field_index] = whitespace.sub(u' ', cell.value.decode('utf-8'))
            return [doc_id, doc_content] + values

        def WriteCSV(rows):
            whitespace = re.compile(r'\s+')
            with open(filename, 'w') as out:
                writer = UnicodeWriter(out)
                writer.writerow(all_field_names)
                for row in rows:
                    writer.writerow(RowValues(row, whitespace))

        def WriteTSV(rows):
            whitespace = re.compile(r'\s+')
            with open(filename, 'w') as out:
                out.write(u'{}\n'.format(u'\t'.join(all_field_names)).encode('utf-8'))
                for row in rows:
                    record = u'\t'.join(RowValues(row, whitespace))
                    out.write(u'{}\n'.format(record).encode('utf-8'))

        corpus = self.db.corpus
        rows = self.db().select(corpus.doc_index, corpus.doc_id, corpus.doc_content,
                                orderby=corpus.doc_index)
        write = WriteCSV if is_csv else WriteTSV
        write(rows)

################################################################################

    def DefineTermStatsTables(self):
        """Define the per-term tables (texts, freqs, probs, doc freqs)."""
        self.db.define_table(
            'term_texts',
            Field('term_index', 'integer', required=True, unique=True, default=-1),
            Field('term_text', 'string', required=True, unique=True),
            migrate=self.isInit)
        if self.isInit:
            self.db.executesql('CREATE INDEX IF NOT EXISTS term_text_value ON term_texts (term_text);')

        # The three numeric tables share one schema and one index naming scheme
        for name in ('term_freqs', 'term_probs', 'term_doc_freqs'):
            self.db.define_table(
                name,
                Field('term_index', 'integer', required=True, unique=True, default=-1),
                Field('value', 'double', required=True),
                Field('rank', 'integer', required=True),
                migrate=self.isInit)
            if self.isInit:
                self._ExecuteAll(
                    'CREATE INDEX IF NOT EXISTS {0}_{1} ON {0} ({1});'.format(name, column)
                    for column in ('value', 'rank'))

    def _DefinePairStatsTable(self, name):
        """Define one (first term, second term, value, rank) table and its indexes."""
        self.db.define_table(
            name,
            Field('first_term_index', 'integer', required=True, default=-1),
            Field('second_term_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit)
        if self.isInit:
            self._ExecuteAll((
                'CREATE UNIQUE INDEX IF NOT EXISTS {0}_indexes ON {0} (first_term_index, second_term_index);'.format(name),
                'CREATE INDEX IF NOT EXISTS {0}_value ON {0} (value);'.format(name),
                'CREATE INDEX IF NOT EXISTS {0}_rank ON {0} (rank);'.format(name),
            ))

    def DefineTermCoStatsTables(self):
        """Define the term pair tables: co-frequencies, co-probabilities, G2."""
        for name in ('term_co_freqs', 'term_co_probs', 'term_g2'):
            self._DefinePairStatsTable(name)

    def DefineSentenceCoStatsTables(self):
        """Define the sentence-level counterparts of the term pair tables."""
        for name in ('sentences_co_freqs', 'sentences_co_probs', 'sentences_g2'):
            self._DefinePairStatsTable(name)

################################################################################

    def Reset(self):
        """Delete every row from the term and sentence statistics tables."""
        statsTables = (
            'term_texts', 'term_freqs', 'term_probs', 'term_doc_freqs',
            'term_co_freqs', 'term_co_probs', 'term_g2',
            'sentences_co_freqs', 'sentences_co_probs', 'sentences_g2',
        )
        self._ExecuteAll('DELETE FROM {};'.format(name) for name in statsTables)

################################################################################

    def DefineTemporaryTable(self):
        """Define the ``vocab`` and ``vocab_text`` tables."""
        self.db.define_table(
            'vocab',
            Field('term_index', 'integer', required=True, unique=True, default=-1),
            Field('term_text', 'string', required=True),
            migrate=self.isInit)
        self.db.define_table(
            'vocab_text',
            Field('term_text', 'string', required=True, unique=True, default=-1),
            migrate=self.isInit)
class Corpus_DB():
    """
    Access layer for the SQLite corpus database (``corpus.db``).

    Meant to be used as a context manager: entering defines the options,
    models, corpus and metadata tables; exiting commits the transaction.
    """

    FILENAME = 'corpus.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)

    # Lower-cased spreadsheet headers recognised as the ID / content columns
    DOC_IDS = ['doc_id', 'docid']
    DOC_CONTENTS = ['doc_content', 'doccontent', 'docbody', 'doc_body']

    # Built-in entry reported for the corpus itself; never read from the models table
    MODEL_KEY = 'corpus'
    MODEL_DESC = 'Text Corpus'
    MODEL_ENTRY = {
        'model_key': MODEL_KEY,
        'model_desc': MODEL_DESC
    }

    # Characters replaced by a space when text is written to a spreadsheet
    LINEBREAKS_TABS = re.compile(r'[\t\r\n\f]')

    DEFAULT_OPTIONS = {
        'token_regex': r'\w{3,}',   # Tokenize a corpus into a bag-of-words language model
        'min_freq': 5,              # Number of times a term must appear in the corpus
        'min_doc_freq': 3           # Number of documents in which a term must appear
    }

    def __init__(self, path=None, isInit=False):
        """
        path   = Folder handed to DAL as ``folder`` (omitted when None)
        isInit = Enable migrations, create indexes and overwrite stored defaults
        """
        self.isInit = isInit
        dalArgs = {
            'lazy_tables': not self.isInit,
            'migrate_enabled': self.isInit
        }
        if path is not None:
            dalArgs['folder'] = path
        self.db = DAL(Corpus_DB.CONNECTION, **dalArgs)

    def __enter__(self):
        """Define all tables and return self."""
        tableDefinitions = (
            self.DefineOptionsTable,
            self.DefineModelsTable,
            self.DefineCorpusTable,
            self.DefineMetadataTables,
        )
        for define in tableDefinitions:
            define()
        return self

    def __exit__(self, type, value, traceback):
        """Run the (currently disabled) text-search step and commit."""
        self.DefineCorpusTextSearch()
        self.db.commit()

    def _ExecuteAll(self, statements):
        """Run each raw SQL statement, in the order given."""
        for statement in statements:
            self.db.executesql(statement)

################################################################################

    def DefineOptionsTable(self):
        """Define the options table and make sure every default is present."""
        self.db.define_table(
            'options',
            Field('key', 'string', required=True, unique=True),
            Field('value', 'string', required=True),
            migrate=self.isInit)
        # Existing values are only replaced by the defaults when initialising
        for optionKey, optionValue in Corpus_DB.DEFAULT_OPTIONS.iteritems():
            self.SetOption(optionKey, optionValue, overwrite=self.isInit)

    def SetOption(self, key, value, overwrite=True):
        """
        Store ``value`` under ``key`` and commit.

        A missing key is always inserted; an existing key is only updated
        when ``overwrite`` is true.
        """
        matching = self.db(self.db.options.key == key)
        if matching.count() <= 0:
            self.db.options.insert(key=key, value=value)
        elif overwrite:
            matching.update(value=value)
        self.db.commit()

    def GetOption(self, key):
        """Return the stored value for ``key``, or None when it is absent."""
        options = self.db.options
        found = self.db(options.key == key).select(options.value).first()
        return found.value if found else None

    def DefineModelsTable(self):
        """Define the table listing models by key and description."""
        self.db.define_table(
            'models',
            Field('model_key', 'string', required=True, unique=True),
            Field('model_desc', 'string', required=True),
            migrate=self.isInit)

    def AddModel(self, model_key, model_desc):
        """Insert a model, or update its description if the key exists; commit."""
        matching = self.db(self.db.models.model_key == model_key)
        if matching.count() <= 0:
            self.db.models.insert(model_key=model_key, model_desc=model_desc)
        else:
            matching.update(model_desc=model_desc)
        self.db.commit()

    def GetModel(self, model_key):
        """Return the {'model_key', 'model_desc'} entry for ``model_key``, or None."""
        # The corpus entry is built in and answered without touching the database
        if model_key == Corpus_DB.MODEL_KEY:
            return Corpus_DB.MODEL_ENTRY
        models = self.db.models
        found = self.db(models.model_key == model_key).select(models.ALL).first()
        if not found:
            return None
        return {
            'model_key': found.model_key,
            'model_desc': found.model_desc
        }

    def GetModels(self):
        """Return the built-in corpus entry followed by all stored models."""
        models = self.db.models
        stored = self.db(models).select(models.model_key, models.model_desc).as_list()
        return [Corpus_DB.MODEL_ENTRY] + stored

################################################################################

    def DefineCorpusTable(self):
        """Define the corpus table; create its unique indexes when initialising."""
        self.db.define_table(
            'corpus',
            Field('doc_index', 'integer', required=True, unique=True, default=-1),
            Field('doc_id', 'string', required=True, unique=True),
            Field('doc_content', 'text', required=True),
            migrate=self.isInit)
        if self.isInit:
            self._ExecuteAll((
                'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_index ON corpus (doc_index);',
                'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_id ON corpus (doc_id);',
            ))

    def DefineMetadataTables(self):
        """Define the ``fields`` and ``metadata`` tables plus their indexes."""
        self.db.define_table(
            'fields',
            Field('field_index', 'integer', required=True, unique=True, default=-1),
            Field('field_name', 'string', required=True, unique=True),
            Field('field_type', 'string', required=True),
            migrate=self.isInit)
        if self.isInit:
            self._ExecuteAll((
                'CREATE UNIQUE INDEX IF NOT EXISTS field_index ON fields (field_index);',
                'CREATE UNIQUE INDEX IF NOT EXISTS field_name ON fields (field_name);',
                'CREATE INDEX IF NOT EXISTS field_type ON fields (field_type);',
            ))

        self.db.define_table(
            'metadata',
            Field('doc_index', 'integer', required=True, default=-1),
            Field('field_index', 'integer', required=True, default=-1),
            Field('value', 'text', required=True),
            migrate=self.isInit)
        if self.isInit:
            self._ExecuteAll((
                'CREATE UNIQUE INDEX IF NOT EXISTS metadata_indexes ON metadata (doc_index, field_index);',
                'CREATE INDEX IF NOT EXISTS metadata_doc_index ON metadata (doc_index);',
                'CREATE INDEX IF NOT EXISTS metadata_field_index ON metadata (field_index);',
            ))

################################################################################

    def DefineCorpusTextSearch(self):
        """Placeholder: building the fts3 ``corpus_search`` table is disabled."""
        if self.isInit:
            # Disabled statements, kept for reference:
            #   DROP TABLE IF EXISTS corpus_search;
            #   CREATE VIRTUAL TABLE corpus_search USING fts3 (doc_content TEXT);
            #   INSERT INTO corpus_search (rowid, doc_content) SELECT doc_index, doc_content FROM corpus;
            pass

################################################################################

    def SanitizeText(self, text):
        """Replace tabs and line breaks with spaces and trim both ends."""
        return Corpus_DB.LINEBREAKS_TABS.sub(u' ', text).strip()

    def ImportFromFile(self, filename):
        """
        filename = A plain-text file (utf-8 encoded) containing one document per line

        A line without a tab is the document content and receives a generated
        ID; otherwise the first two tab-separated cells are the ID and content.
        """
        def ReadFile():
            with open(filename, 'r') as handle:
                for doc_index, raw_line in enumerate(handle):
                    cells = raw_line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
                    if len(cells) == 1:
                        doc_id, doc_content = 'doc{}'.format(doc_index+1), cells[0]
                    else:
                        doc_id, doc_content = cells[0], cells[1]
                    yield {
                        'doc_index': doc_index,
                        'doc_id': doc_id,
                        'doc_content': doc_content.encode('utf-8', 'ignore')
                    }

        self.db.corpus.bulk_insert(ReadFile())

    def ImportFromFolder(self, glob_pattern):
        """
        glob_pattern = A folder of files or a glob pattern for list of files (utf-8 encoded, one document per file)

        Files are taken in sorted order and the file path becomes the document ID.
        """
        def ReadFolder():
            for doc_index, path in enumerate(sorted(glob.glob(glob_pattern))):
                with open(path, 'r') as handle:
                    text = handle.read().decode('utf-8', 'ignore')
                yield {
                    'doc_index': doc_index,
                    'doc_id': path,
                    'doc_content': text.encode('utf-8', 'ignore')
                }

        self.db.corpus.bulk_insert(ReadFolder())

    def ImportFromSpreadsheet(self, filename, id_key=None, content_key=None, is_csv=False):
        """
        filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with a header row containing column names)
        id_key = Name of the column containing unique document IDs
        content_key = Name of the column containing the document contents
        is_csv = Parse as comma-separated (via UnicodeReader) instead of tab-separated

        Every other column is stored as metadata; its type starts as 'integer'
        and is widened to 'double' and then 'string' as values fail to parse.
        """
        doc_id_keys = frozenset(Corpus_DB.DOC_IDS if id_key is None else [id_key])
        doc_content_keys = frozenset(Corpus_DB.DOC_CONTENTS if content_key is None else [content_key])
        field_indexes = []
        field_names = []
        field_types = []
        metadata = []

        def ReadCSV():
            with open(filename, 'r') as handle:
                for cells in UnicodeReader(handle):
                    yield cells

        def ReadTSV():
            with open(filename, 'r') as handle:
                for raw_line in handle:
                    yield raw_line.decode('utf-8', 'ignore').rstrip('\n').split('\t')

        def WidenType(field_type, text):
            # integer -> double -> string, stopping at the first type that parses
            if field_type == 'integer':
                try:
                    int(text)
                except ValueError:
                    field_type = 'double'
            if field_type == 'double':
                try:
                    float(text)
                except ValueError:
                    field_type = 'string'
            return field_type

        def ReadSpreadsheet(reader):
            rows = iter(reader)
            try:
                header = next(rows)
            except StopIteration:
                return

            # Header row: locate the ID/content columns, register the rest
            id_column = None
            content_column = None
            for column, name in enumerate(header):
                if name.lower() in doc_id_keys:
                    id_column = column
                elif name.lower() in doc_content_keys:
                    content_column = column
                else:
                    field_indexes.append(len(field_indexes))
                    field_names.append(name)
                    field_types.append('integer')

            # Data rows: one corpus entry each, plus one metadata entry per extra cell
            for doc_index, cells in enumerate(rows):
                doc_id = 'doc{:d}'.format(doc_index+1)
                doc_content = ''
                field_index = 0
                for column, cell in enumerate(cells):
                    if id_column == column:
                        doc_id = cell
                    elif content_column == column:
                        doc_content = cell
                    else:
                        metadata.append({
                            'doc_index': doc_index,
                            'field_index': field_index,
                            'value': cell.encode('utf-8', 'ignore')
                        })
                        field_types[field_index] = WidenType(field_types[field_index], cell)
                        field_index += 1
                yield {
                    'doc_index': doc_index,
                    'doc_id': doc_id,
                    'doc_content': doc_content.encode('utf-8', 'ignore')
                }

        def GetFields():
            for index, name, kind in zip(field_indexes, field_names, field_types):
                yield {
                    'field_index': index,
                    'field_name': name,
                    'field_type': kind
                }

        if is_csv:
            reader = ReadCSV()
        else:
            reader = ReadTSV()
        self.db.corpus.bulk_insert(ReadSpreadsheet(reader))
        self.db.fields.bulk_insert(GetFields())
        self.db.metadata.bulk_insert(metadata)

    def ExportToFile(self, filename):
        """
        filename = A tab-delimited file (utf-8 encoded, without header) containing docIDs and document contents
        """
        corpus = self.db.corpus
        rows = self.db().select(corpus.doc_id, corpus.doc_content, orderby=corpus.doc_index)
        with open(filename, 'w') as out:
            for row in rows:
                cleaned = self.SanitizeText(row.doc_content.decode('utf-8'))
                out.write(u'{}\t{}\n'.format(row.doc_id, cleaned).encode('utf-8', 'ignore'))

    def ExportToSpreadsheet(self, filename, id_key=None, content_key=None, is_csv=False):
        """
        filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with header) containing the text corpus and all metadata
        id_key = Header to use for the document ID column ('doc_id' when None)
        content_key = Header to use for the document content column ('doc_content' when None)
        is_csv = Write comma-separated (via UnicodeWriter) instead of tab-separated
        """
        field_rows = self.db().select(self.db.fields.field_name, orderby=self.db.fields.field_index)
        field_names = [entry.field_name for entry in field_rows]
        field_count = len(field_names)
        id_header = 'doc_id' if id_key is None else id_key
        content_header = 'doc_content' if content_key is None else content_key
        all_field_names = [id_header, content_header] + field_names

        def RowValues(row):
            # One output record: ID, sanitized content, then metadata by field index
            doc_index = row.doc_index
            doc_id = row.doc_id
            doc_content = self.SanitizeText(row.doc_content.decode('utf-8'))
            values = [u''] * field_count
            cells = self.db(self.db.metadata.doc_index == doc_index).select(
                self.db.metadata.field_index, self.db.metadata.value)
            for cell in cells:
                values[cell.field_index] = self.SanitizeText(cell.value.decode('utf-8'))
            return [doc_id, doc_content] + values

        def WriteCSV(rows):
            with open(filename, 'w') as out:
                writer = UnicodeWriter(out)
                writer.writerow(all_field_names)
                for row in rows:
                    writer.writerow(RowValues(row))

        def WriteTSV(rows):
            with open(filename, 'w') as out:
                out.write(u'{}\n'.format(u'\t'.join(all_field_names)).encode('utf-8', 'ignore'))
                for row in rows:
                    record = u'\t'.join(RowValues(row))
                    out.write(u'{}\n'.format(record).encode('utf-8', 'ignore'))

        corpus = self.db.corpus
        rows = self.db().select(corpus.doc_index, corpus.doc_id, corpus.doc_content,
                                orderby=corpus.doc_index)
        write = WriteCSV if is_csv else WriteTSV
        write(rows)
def test_DAL_person_table():
    """Define a one-column ``person`` table on PostgreSQL and insert a row."""
    # check_reserved makes DAL validate names against PostgreSQL keywords
    database = DAL(
        "postgres://*****:*****@tornado/joseph",
        check_reserved=['postgres'],
    )
    name_field = Field('name')
    database.define_table('person', name_field)
    database.person.insert(name="Alex")
    database.commit()
ename="mysql", description="MySQL", connection="mysql://*****:*****@localhost/database") engine_id["postgres"] = db.engine.insert( ename="postgres", description="PostgreSQL", connection="postgres://*****:*****@localhost/database") engine_id["mssql"] = db.engine.insert( ename="mssql", description="MSSQL", connection="mssql2://username:password@localhost/database") engine_id["google_sql"] = db.engine.insert( ename="google_sql", description="Google SQL", connection="google:sql://project:instance/database") db.commit() table = db.define_table( 'databases', Field('id', readable=False, writable=False), Field('alias', type='string', length=150, label=T('Alias'), notnull=True, required=True, unique=True), Field('engine_id', db.engine, label=T('Engine'), notnull=True, required=True,
class LDA_DB():
    """Context-managed wrapper around the DAL connection to the 'lda.db' SQLite file.

    The database holds an options table, three dimension tables (terms, docs,
    topics), two matrix tables (term_topic_matrix, doc_topic_matrix) and two
    topic statistics tables (topic_cooccurrences, topic_covariance).

    Typical use::

        with LDA_DB(path, isInit=True) as lda_db:
            lda_db.SetOption('max_co_topic_count', 1000)

    Entering the context defines all tables on the connection; leaving it
    commits the transaction.
    """

    FILENAME = 'lda.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)
    # Inserted by DefineOptionsTable when the options table has no rows.
    DEFAULT_OPTIONS = {
        'max_co_topic_count': 40000
    }

    def __init__(self, path=None, isInit=False, isReset=False):
        """Open the DAL connection.

        path    -- folder passed to DAL as ``folder``; omitted when None.
        isInit  -- when True, tables are migrated and defined eagerly
                   (``migrate=True``, ``lazy_tables=False``) and the indexes
                   are created; when False, the opposite flags are passed.
        isReset -- when True, __enter__ calls Reset().
        """
        self.isInit = isInit
        self.isReset = isReset
        if path is not None:
            self.db = DAL(LDA_DB.CONNECTION, lazy_tables=not self.isInit, migrate=self.isInit, folder=path)
        else:
            self.db = DAL(LDA_DB.CONNECTION, lazy_tables=not self.isInit, migrate=self.isInit)

    def __enter__(self):
        """Define every table (optionally clearing the data tables) and return self."""
        self.DefineOptionsTable()
        if self.isReset:
            # Reset issues raw DELETE statements, so the data tables must
            # already exist in the database file at this point.
            # NOTE(review): confirm this holds when isInit and isReset are
            # both set on a brand-new database.
            self.Reset()
        self.DefineDimensionTables()
        self.DefineMatrixTables()
        self.DefineStatsTables()
        return self

    def __exit__(self, type, value, traceback):
        """Commit unconditionally; exceptions from the with-body still propagate."""
        self.db.commit()

################################################################################

    def DefineOptionsTable(self):
        """Define the key/value 'options' table and seed it with DEFAULT_OPTIONS if empty."""
        self.db.define_table('options',
            Field('key', 'string', required=True, unique=True),
            Field('value', 'string', required=True),
            migrate=self.isInit
        )
        if self.db(self.db.options).count() == 0:
            # dict.items() exists on Python 2 and 3; iteritems() is Python 2 only.
            for key, value in LDA_DB.DEFAULT_OPTIONS.items():
                self.db.options.insert(key=key, value=value)

    def SetOption(self, key, value):
        """Store ``value`` under ``key``, updating the existing row if there is one."""
        keyValue = self.db(self.db.options.key == key).select().first()
        if keyValue:
            keyValue.update_record(value=value)
        else:
            self.db.options.insert(key=key, value=value)

    def GetOption(self, key):
        """Return the value stored under ``key``, or None when no row is found."""
        keyValue = self.db(self.db.options.key == key).select(self.db.options.value).first()
        if keyValue:
            return keyValue.value
        else:
            return None

################################################################################

    def DefineDimensionTables(self):
        """Define the 'terms', 'docs' and 'topics' tables; create their indexes when isInit."""
        self.db.define_table('terms',
            Field('term_index', 'integer', required=True, unique=True, default=-1),
            Field('term_text', 'string', required=True, unique=True),
            Field('term_freq', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS terms_index ON terms (term_index);')
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS terms_text ON terms (term_text);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS terms_freq ON terms (term_freq);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS terms_rank ON terms (rank);')

        self.db.define_table('docs',
            Field('doc_index', 'integer', required=True, unique=True, default=-1),
            Field('doc_id', 'string', required=True, unique=True),
            Field('doc_freq', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS docs_index ON docs (doc_index);')
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS docs_id ON docs (doc_id);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS docs_freq ON docs (doc_freq);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS docs_rank ON docs (rank);')

        self.db.define_table('topics',
            Field('topic_index', 'integer', required=True, unique=True, default=-1),
            Field('topic_freq', 'double', required=True),
            Field('topic_label', 'string', required=True),
            Field('topic_desc', 'string', required=True),
            Field('top_terms', 'list:integer', required=True),
            Field('top_docs', 'list:integer', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS topics_index ON topics (topic_index);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS topics_freq ON topics (topic_freq);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS topics_rank ON topics (rank);')

    def DefineMatrixTables(self):
        """Define 'term_topic_matrix' and 'doc_topic_matrix'; create their indexes when isInit."""
        self.db.define_table('term_topic_matrix',
            Field('term_index', 'integer', required=True, default=-1),
            Field('topic_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS term_topic_indexes ON term_topic_matrix (term_index, topic_index);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS term_topic_value ON term_topic_matrix (value);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS term_topic_rank ON term_topic_matrix (rank);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS term_topic_termindex ON term_topic_matrix (term_index);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS term_topic_topicindex ON term_topic_matrix (topic_index);')

        self.db.define_table('doc_topic_matrix',
            Field('doc_index', 'integer', required=True, default=-1),
            Field('topic_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS doc_topic_indexes ON doc_topic_matrix (doc_index, topic_index);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS doc_topic_value ON doc_topic_matrix (value);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS doc_topic_rank ON doc_topic_matrix (rank);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS doc_topic_docindex ON doc_topic_matrix (doc_index);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS doc_topic_topicindex ON doc_topic_matrix (topic_index);')

    def DefineStatsTables(self):
        """Define 'topic_cooccurrences' and 'topic_covariance'; create their indexes when isInit."""
        self.db.define_table('topic_cooccurrences',
            Field('first_topic_index', 'integer', required=True, default=-1),
            Field('second_topic_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS topic_cooccurrences_indexes ON topic_cooccurrences (first_topic_index, second_topic_index);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS topic_cooccurrences_value ON topic_cooccurrences (value);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS topic_cooccurrences_rank ON topic_cooccurrences (rank);')

        self.db.define_table('topic_covariance',
            Field('first_topic_index', 'integer', required=True, default=-1),
            Field('second_topic_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit
        )
        if self.isInit:
            self.db.executesql('CREATE UNIQUE INDEX IF NOT EXISTS topic_covariance_indexes ON topic_covariance (first_topic_index, second_topic_index);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS topic_covariance_value ON topic_covariance (value);')
            self.db.executesql('CREATE INDEX IF NOT EXISTS topic_covariance_rank ON topic_covariance (rank);')

################################################################################

    def Reset(self):
        """Delete every row from the seven data tables; the options table is left untouched."""
        self.db.executesql('DELETE FROM terms;')
        self.db.executesql('DELETE FROM docs;')
        self.db.executesql('DELETE FROM topics;')
        self.db.executesql('DELETE FROM term_topic_matrix;')
        self.db.executesql('DELETE FROM doc_topic_matrix;')
        self.db.executesql('DELETE FROM topic_cooccurrences;')
        self.db.executesql('DELETE FROM topic_covariance;')
class LDA_DB:
    """Context manager that opens "lda.db" through DAL and defines its tables.

    On entry the options, dimension (terms/docs/topics), matrix
    (term_topic_matrix/doc_topic_matrix) and statistics
    (topic_cooccurrences/topic_covariance) tables are defined; on exit the
    connection is committed.
    """

    FILENAME = "lda.db"
    CONNECTION = "sqlite://{}".format(FILENAME)
    # Seed values written by DefineOptionsTable into an empty options table.
    DEFAULT_OPTIONS = {"max_co_topic_count": 40000}

    def __init__(self, path=None, isInit=False, isReset=False):
        """Create the DAL connection.

        path    -- forwarded to DAL as ``folder`` when it is not None.
        isInit  -- forwarded as ``migrate`` and, negated, as ``lazy_tables``;
                   also gates index creation in the Define* methods.
        isReset -- when True, Reset() is called from __enter__.
        """
        self.isInit = isInit
        self.isReset = isReset
        if path is not None:
            self.db = DAL(LDA_DB.CONNECTION, lazy_tables=not self.isInit, migrate=self.isInit, folder=path)
        else:
            self.db = DAL(LDA_DB.CONNECTION, lazy_tables=not self.isInit, migrate=self.isInit)

    def __enter__(self):
        """Define all tables, running Reset() after the options table if requested."""
        self.DefineOptionsTable()
        if self.isReset:
            # Runs before the data tables are defined on the connection; the
            # raw DELETE statements need those tables to exist on disk already.
            # NOTE(review): verify the isInit + isReset case on a new database.
            self.Reset()
        self.DefineDimensionTables()
        self.DefineMatrixTables()
        self.DefineStatsTables()
        return self

    def __exit__(self, type, value, traceback):
        """Commit the connection whether or not the with-body raised."""
        self.db.commit()

    ################################################################################

    def _CreateIndexes(self, statements):
        """Execute the given raw SQL statements in order, only when isInit is set."""
        if self.isInit:
            for statement in statements:
                self.db.executesql(statement)

    ################################################################################

    def DefineOptionsTable(self):
        """Define the "options" key/value table and insert DEFAULT_OPTIONS if it is empty."""
        self.db.define_table(
            "options",
            Field("key", "string", required=True, unique=True),
            Field("value", "string", required=True),
            migrate=self.isInit,
        )
        if self.db(self.db.options).count() == 0:
            # items() instead of the Python-2-only iteritems(), so this also
            # runs on Python 3.
            for key, value in LDA_DB.DEFAULT_OPTIONS.items():
                self.db.options.insert(key=key, value=value)

    def SetOption(self, key, value):
        """Update the row for ``key`` if one exists, otherwise insert it."""
        keyValue = self.db(self.db.options.key == key).select().first()
        if keyValue:
            keyValue.update_record(value=value)
        else:
            self.db.options.insert(key=key, value=value)

    def GetOption(self, key):
        """Return the stored value for ``key``; None when the lookup yields no row."""
        keyValue = self.db(self.db.options.key == key).select(self.db.options.value).first()
        return keyValue.value if keyValue else None

    ################################################################################

    def DefineDimensionTables(self):
        """Define "terms", "docs" and "topics", each followed by its indexes (isInit only)."""
        self.db.define_table(
            "terms",
            Field("term_index", "integer", required=True, unique=True, default=-1),
            Field("term_text", "string", required=True, unique=True),
            Field("term_freq", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes([
            "CREATE UNIQUE INDEX IF NOT EXISTS terms_index ON terms (term_index);",
            "CREATE UNIQUE INDEX IF NOT EXISTS terms_text ON terms (term_text);",
            "CREATE INDEX IF NOT EXISTS terms_freq ON terms (term_freq);",
            "CREATE INDEX IF NOT EXISTS terms_rank ON terms (rank);",
        ])

        self.db.define_table(
            "docs",
            Field("doc_index", "integer", required=True, unique=True, default=-1),
            Field("doc_id", "string", required=True, unique=True),
            Field("doc_freq", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes([
            "CREATE UNIQUE INDEX IF NOT EXISTS docs_index ON docs (doc_index);",
            "CREATE UNIQUE INDEX IF NOT EXISTS docs_id ON docs (doc_id);",
            "CREATE INDEX IF NOT EXISTS docs_freq ON docs (doc_freq);",
            "CREATE INDEX IF NOT EXISTS docs_rank ON docs (rank);",
        ])

        self.db.define_table(
            "topics",
            Field("topic_index", "integer", required=True, unique=True, default=-1),
            Field("topic_freq", "double", required=True),
            Field("topic_label", "string", required=True),
            Field("topic_desc", "string", required=True),
            Field("top_terms", "list:integer", required=True),
            Field("top_docs", "list:integer", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes([
            "CREATE UNIQUE INDEX IF NOT EXISTS topics_index ON topics (topic_index);",
            "CREATE INDEX IF NOT EXISTS topics_freq ON topics (topic_freq);",
            "CREATE INDEX IF NOT EXISTS topics_rank ON topics (rank);",
        ])

    def DefineMatrixTables(self):
        """Define "term_topic_matrix" and "doc_topic_matrix" plus their indexes (isInit only)."""
        self.db.define_table(
            "term_topic_matrix",
            Field("term_index", "integer", required=True, default=-1),
            Field("topic_index", "integer", required=True, default=-1),
            Field("value", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes([
            "CREATE UNIQUE INDEX IF NOT EXISTS term_topic_indexes ON term_topic_matrix (term_index, topic_index);",
            "CREATE INDEX IF NOT EXISTS term_topic_value ON term_topic_matrix (value);",
            "CREATE INDEX IF NOT EXISTS term_topic_rank ON term_topic_matrix (rank);",
            "CREATE INDEX IF NOT EXISTS term_topic_termindex ON term_topic_matrix (term_index);",
            "CREATE INDEX IF NOT EXISTS term_topic_topicindex ON term_topic_matrix (topic_index);",
        ])

        self.db.define_table(
            "doc_topic_matrix",
            Field("doc_index", "integer", required=True, default=-1),
            Field("topic_index", "integer", required=True, default=-1),
            Field("value", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes([
            "CREATE UNIQUE INDEX IF NOT EXISTS doc_topic_indexes ON doc_topic_matrix (doc_index, topic_index);",
            "CREATE INDEX IF NOT EXISTS doc_topic_value ON doc_topic_matrix (value);",
            "CREATE INDEX IF NOT EXISTS doc_topic_rank ON doc_topic_matrix (rank);",
            "CREATE INDEX IF NOT EXISTS doc_topic_docindex ON doc_topic_matrix (doc_index);",
            "CREATE INDEX IF NOT EXISTS doc_topic_topicindex ON doc_topic_matrix (topic_index);",
        ])

    def DefineStatsTables(self):
        """Define "topic_cooccurrences" and "topic_covariance" plus their indexes (isInit only)."""
        self.db.define_table(
            "topic_cooccurrences",
            Field("first_topic_index", "integer", required=True, default=-1),
            Field("second_topic_index", "integer", required=True, default=-1),
            Field("value", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes([
            "CREATE UNIQUE INDEX IF NOT EXISTS topic_cooccurrences_indexes ON topic_cooccurrences (first_topic_index, second_topic_index);",
            "CREATE INDEX IF NOT EXISTS topic_cooccurrences_value ON topic_cooccurrences (value);",
            "CREATE INDEX IF NOT EXISTS topic_cooccurrences_rank ON topic_cooccurrences (rank);",
        ])

        self.db.define_table(
            "topic_covariance",
            Field("first_topic_index", "integer", required=True, default=-1),
            Field("second_topic_index", "integer", required=True, default=-1),
            Field("value", "double", required=True),
            Field("rank", "integer", required=True),
            migrate=self.isInit,
        )
        self._CreateIndexes([
            "CREATE UNIQUE INDEX IF NOT EXISTS topic_covariance_indexes ON topic_covariance (first_topic_index, second_topic_index);",
            "CREATE INDEX IF NOT EXISTS topic_covariance_value ON topic_covariance (value);",
            "CREATE INDEX IF NOT EXISTS topic_covariance_rank ON topic_covariance (rank);",
        ])

    ################################################################################

    def Reset(self):
        """Empty the seven data tables with raw DELETE statements; "options" is not touched."""
        for table_name in (
            "terms",
            "docs",
            "topics",
            "term_topic_matrix",
            "doc_topic_matrix",
            "topic_cooccurrences",
            "topic_covariance",
        ):
            self.db.executesql("DELETE FROM {};".format(table_name))
class BOW_DB():
    """Context-managed wrapper around the DAL connection to the 'bow.db' SQLite file.

    Entering the context defines the options table, the per-term statistics
    tables, the term-pair and sentence-pair co-occurrence tables and the two
    vocabulary tables; leaving it commits the connection.
    """

    FILENAME = 'bow.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)
    DEFAULT_OPTIONS = {
        'max_freq_count': 4000,      # Maximum number of terms to store
        'max_co_freq_count': 100000  # Maximum number of term pairs to store
    }

    def __init__(self, path=None, isInit=False):
        """Open the DAL connection.

        path   -- forwarded to DAL as ``folder`` when it is not None.
        isInit -- forwarded as ``migrate_enabled`` and, negated, as
                  ``lazy_tables``; also gates index creation and whether
                  DEFAULT_OPTIONS overwrite stored option values.
        """
        self.isInit = isInit
        if path is not None:
            self.db = DAL(BOW_DB.CONNECTION, lazy_tables=not self.isInit, migrate_enabled=self.isInit, folder=path)
        else:
            self.db = DAL(BOW_DB.CONNECTION, lazy_tables=not self.isInit, migrate_enabled=self.isInit)

    def __enter__(self):
        """Define every table on the connection and return self."""
        self.DefineOptionsTable()
        self.DefineTermStatsTables()
        self.DefineTermCoStatsTables()
        self.DefineSentenceCoStatsTables()
        self.DefineTemporaryTable()
        return self

    def __exit__(self, type, value, traceback):
        """Commit unconditionally; exceptions from the with-body still propagate."""
        self.db.commit()

################################################################################

    def _ExecuteOnInit(self, statements):
        """Execute the given raw SQL statements in order, only when isInit is set."""
        if self.isInit:
            for statement in statements:
                self.db.executesql(statement)

    def _DefineTermValueTable(self, name):
        """Define a (term_index, value, rank) table called ``name`` with value/rank indexes."""
        self.db.define_table(name,
            Field('term_index', 'integer', required=True, unique=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit)
        self._ExecuteOnInit([
            'CREATE INDEX IF NOT EXISTS {0}_value ON {0} (value);'.format(name),
            'CREATE INDEX IF NOT EXISTS {0}_rank ON {0} (rank);'.format(name),
        ])

    def _DefineTermPairTable(self, name):
        """Define a (first_term_index, second_term_index, value, rank) table called ``name``.

        The index pair is covered by a unique index; value and rank each get a
        plain index.
        """
        self.db.define_table(name,
            Field('first_term_index', 'integer', required=True, default=-1),
            Field('second_term_index', 'integer', required=True, default=-1),
            Field('value', 'double', required=True),
            Field('rank', 'integer', required=True),
            migrate=self.isInit)
        self._ExecuteOnInit([
            'CREATE UNIQUE INDEX IF NOT EXISTS {0}_indexes ON {0} (first_term_index, second_term_index);'.format(name),
            'CREATE INDEX IF NOT EXISTS {0}_value ON {0} (value);'.format(name),
            'CREATE INDEX IF NOT EXISTS {0}_rank ON {0} (rank);'.format(name),
        ])

################################################################################

    def DefineOptionsTable(self):
        """Define the 'options' table and apply DEFAULT_OPTIONS.

        Missing keys are always inserted; existing keys are overwritten only
        when isInit is set.
        """
        self.db.define_table('options',
            Field('key', 'string', required=True, unique=True),
            Field('value', 'string', required=True),
            migrate=self.isInit)
        # items() instead of the Python-2-only iteritems(), so this also runs
        # on Python 3.
        for key, value in BOW_DB.DEFAULT_OPTIONS.items():
            self.SetOption(key, value, overwrite=self.isInit)

    def SetOption(self, key, value, overwrite=True):
        """Store ``value`` under ``key`` and commit.

        An existing row is updated only when ``overwrite`` is true; a missing
        row is always inserted. The commit happens in every case.
        """
        where = self.db.options.key == key
        if self.db(where).count() > 0:
            if overwrite:
                self.db(where).update(value=value)
        else:
            self.db.options.insert(key=key, value=value)
        self.db.commit()

    def GetOption(self, key):
        """Return the value stored under ``key``, or None when no row is found."""
        where = self.db.options.key == key
        keyValue = self.db(where).select(self.db.options.value).first()
        if keyValue:
            return keyValue.value
        else:
            return None

################################################################################

    def DefineTermStatsTables(self):
        """Define 'term_texts' and the per-term 'term_freqs', 'term_probs', 'term_doc_freqs' tables."""
        self.db.define_table('term_texts',
            Field('term_index', 'integer', required=True, unique=True, default=-1),
            Field('term_text', 'string', required=True, unique=True),
            migrate=self.isInit)
        self._ExecuteOnInit([
            'CREATE INDEX IF NOT EXISTS term_text_value ON term_texts (term_text);',
        ])
        self._DefineTermValueTable('term_freqs')
        self._DefineTermValueTable('term_probs')
        self._DefineTermValueTable('term_doc_freqs')

    def DefineTermCoStatsTables(self):
        """Define the term-pair tables 'term_co_freqs', 'term_co_probs' and 'term_g2'."""
        self._DefineTermPairTable('term_co_freqs')
        self._DefineTermPairTable('term_co_probs')
        self._DefineTermPairTable('term_g2')

    def DefineSentenceCoStatsTables(self):
        """Define the pair tables 'sentences_co_freqs', 'sentences_co_probs' and 'sentences_g2'."""
        self._DefineTermPairTable('sentences_co_freqs')
        self._DefineTermPairTable('sentences_co_probs')
        self._DefineTermPairTable('sentences_g2')

################################################################################

    def Reset(self):
        """Delete every row from the ten statistics tables.

        The options, vocab and vocab_text tables are not cleared. Nothing in
        this class calls Reset(); callers invoke it explicitly.
        """
        for table_name in (
            'term_texts',
            'term_freqs',
            'term_probs',
            'term_doc_freqs',
            'term_co_freqs',
            'term_co_probs',
            'term_g2',
            'sentences_co_freqs',
            'sentences_co_probs',
            'sentences_g2',
        ):
            self.db.executesql('DELETE FROM {};'.format(table_name))

################################################################################

    def DefineTemporaryTable(self):
        """Define the 'vocab' and 'vocab_text' tables (no extra indexes are created)."""
        self.db.define_table('vocab',
            Field('term_index', 'integer', required=True, unique=True, default=-1),
            Field('term_text', 'string', required=True),
            migrate=self.isInit)
        # NOTE(review): default=-1 on a string column looks copied from the
        # integer index fields; kept as-is so the schema is unchanged -- confirm.
        self.db.define_table('vocab_text',
            Field('term_text', 'string', required=True, unique=True, default=-1),
            migrate=self.isInit)
class Corpus_DB(): FILENAME = 'corpus.db' CONNECTION = 'sqlite://{}'.format(FILENAME) DOC_IDS = ['doc_id', 'docid'] DOC_CONTENTS = ['doc_content', 'doccontent', 'docbody', 'doc_body'] DEFAULT_OPTIONS = { 'token_regex' : r'\w{3,}', 'min_freq' : 5, 'min_doc_freq' : 3, 'max_freq_count' : 4000, 'max_co_freq_count' : 160000, 'max_g2_count' : 160000 } def __init__(self, path = None, isInit = False, isImport = False, isReset = False): self.isInit = isInit self.isImport = isImport self.isReset = isReset isInitOrImport = self.isInit or self.isImport if path is not None: self.db = DAL(Corpus_DB.CONNECTION, lazy_tables = not isInitOrImport, migrate_enabled = isInitOrImport, folder = path) else: self.db = DAL(Corpus_DB.CONNECTION, lazy_tables = not isInitOrImport, migrate_enabled = isInitOrImport) def __enter__(self): self.DefineOptionsTable() self.DefineModelsTable() self.DefineCorpusTable() self.DefineMetadataTables() if self.isReset: self.Reset() self.DefineTermStatsTables() self.DefineTermCoStatsTables() self.DefineSentenceCoStatsTables() self.DefineTemporaryTable() return self def __exit__(self, type, value, traceback): self.DefineCorpusTextSearch() self.db.commit() ################################################################################ def DefineOptionsTable(self): self.db.define_table( 'options', Field( 'key' , 'string', required = True, unique = True ), Field( 'value', 'string', required = True ), migrate = self.isImport ) if self.isInit: for key, value in Corpus_DB.DEFAULT_OPTIONS.iteritems(): keyValue = self.db( self.db.options.key == key ).select().first() if keyValue: keyValue.update_record( value = value ) else: self.db.options.insert( key = key, value = value ) def SetOption(self, key, value): keyValue = self.db( self.db.options.key == key ).select().first() if keyValue: keyValue.update_record( value = value ) else: self.db.options.insert( key = key, value = value ) def GetOption(self, key): keyValue = self.db( self.db.options.key == key ).select( 
self.db.options.value ).first() if keyValue: return keyValue.value else: return None def DefineModelsTable(self): self.db.define_table( 'models', Field( 'model_key' , 'string', required = True, unique = True ), Field( 'model_desc', 'string', required = True ), migrate = self.isImport ) def AddModel(self, model_key, model_desc): keyDesc = self.db( self.db.models.model_key == model_key ).select().first() if keyDesc: keyDesc.update_record( model_desc = model_desc ) else: self.db.models.insert( model_key = model_key, model_desc = model_desc ) def GetModels(self): models = self.db( self.db.models ).select( self.db.models.model_key ) return [ model.model_key for model in models ] def GetModelDescription(self, model_key): model = self.db( self.db.models.model_key == model_key ).select().first() if model: return model.model_desc else: return None ################################################################################ def DefineCorpusTable(self): self.db.define_table( 'corpus', Field( 'doc_index' , 'integer', required = True, unique = True, default = -1 ), Field( 'doc_id' , 'string' , required = True, unique = True ), Field( 'doc_content', 'text' , required = True ), migrate = self.isImport ) if self.isImport: self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_index ON corpus (doc_index);' ) self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS corpus_doc_id ON corpus (doc_id);' ) def DefineMetadataTables(self): self.db.define_table( 'fields', Field( 'field_index', 'integer', required = True, unique = True, default = -1 ), Field( 'field_name' , 'string' , required = True, unique = True ), Field( 'field_type' , 'string' , required = True ), migrate = self.isImport ) if self.isImport: self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS field_index ON fields (field_index);' ) self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS field_name ON fields (field_name);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS field_type ON fields (field_type);' 
) self.db.define_table( 'metadata', Field( 'doc_index' , 'integer', required = True, default = -1 ), Field( 'field_index', 'integer', required = True, default = -1 ), Field( 'value' , 'text' , required = True ), migrate = self.isImport ) if self.isImport: self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS metadata_indexes ON metadata (doc_index, field_index);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS metadata_doc_index ON metadata (doc_index);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS metadata_field_index ON metadata (field_index);' ) ################################################################################ def DefineCorpusTextSearch(self): if self.isImport: self.db.executesql( 'DROP TABLE IF EXISTS corpus_search;' ) self.db.executesql( 'CREATE VIRTUAL TABLE corpus_search USING fts3 (doc_content TEXT);' ) self.db.executesql( 'INSERT INTO corpus_search (rowid, doc_content) SELECT doc_index, doc_content FROM corpus;' ) ################################################################################ def ImportFromFile(self, filename): """ filename = A plain-text file (utf-8 encoded) containing one document per line """ def ReadFile(): with open(filename, 'r') as f: for index, line in enumerate(f): doc_index = index values = line.decode('utf-8', 'ignore').rstrip('\n').split('\t') if len(values) == 1: doc_id = 'doc{}'.format(doc_index+1) doc_content = values[0] else: doc_id = values[0] doc_content = values[1] yield { 'doc_index' : doc_index, 'doc_id' : doc_id, 'doc_content' : doc_content.encode('utf-8') } self.db.corpus.bulk_insert(ReadFile()) def ImportFromFolder(self, glob_pattern): """ glob_pattern = A folder of files or a glob pattern for list of files (utf-8 encoded, one document per file) """ def ReadFolder(): filenames = sorted(glob.glob(glob_pattern)) for index, filename in enumerate(filenames): doc_index = index doc_id = filename with open(filename, 'r') as f: doc_content = f.read().decode('utf-8', 'ignore') yield { 'doc_index' : 
doc_index, 'doc_id' : doc_id, 'doc_content' : doc_content.encode('utf-8') } self.db.corpus.bulk_insert(ReadFolder()) def ImportFromSpreadsheet(self, filename, id_key = None, content_key = None, is_csv = False): """ filename = A tab- or comman-separated spreadsheet (utf-8 encoded, with a header row containing column names) id_key = Name of the column containing unique document IDs content_key = Name of the column containing the document contents """ doc_id_keys = frozenset([id_key] if id_key is not None else Corpus_DB.DOC_IDS) doc_content_keys = frozenset([content_key] if content_key is not None else Corpus_DB.DOC_CONTENTS) field_indexes = [] field_names = [] field_types = [] metadata = [] def ReadCSV(): with open(filename, 'r') as f: reader = UnicodeReader(f) for row in reader: yield row def ReadTSV(): with open(filename, 'r') as f: for line in f: yield line.decode('utf-8', 'ignore').rstrip('\n').split('\t') def ReadSpreadsheet(reader): field_doc_id = None field_doc_content = None for row_index, values in enumerate(reader): if row_index == 0: for index, field in enumerate(values): if field.lower() in doc_id_keys: field_doc_id = index elif field.lower() in doc_content_keys: field_doc_content = index else: field_index = len(field_indexes) field_indexes.append(field_index) field_names.append(field) field_types.append('integer') else: doc_index = row_index - 1 doc_id = 'doc{:d}'.format(doc_index+1) doc_content = '' field_index = 0 for index, value in enumerate(values): if field_doc_id == index: doc_id = value elif field_doc_content == index: doc_content = value else: metadata.append({ 'doc_index' : doc_index, 'field_index' : field_index, 'value' : value.encode('utf-8') }) # [START] infer field type field_type = field_types[field_index] if field_type == 'integer': try: int(value) except ValueError: field_type = 'double' if field_type == 'double': try: float(value) except ValueError: field_type = 'string' field_types[field_index] = field_type # [END] infer field type 
field_index += 1 yield { 'doc_index' : doc_index, 'doc_id' : doc_id, 'doc_content' : doc_content.encode('utf-8') } def GetFields(): for field_index in field_indexes: field_name = field_names[field_index] field_type = field_types[field_index] yield { 'field_index' : field_index, 'field_name' : field_name, 'field_type' : field_type } reader = ReadCSV() if is_csv else ReadTSV() self.db.corpus.bulk_insert(ReadSpreadsheet(reader)) self.db.fields.bulk_insert(GetFields()) self.db.metadata.bulk_insert(metadata) def ExportToFile(self, filename): """ filename = A tab-delimited file (utf-8 encoded, without header) containing docIDs and document contents """ def WriteFile(rows): m = re.compile(r'\s+') with open(filename, 'w') as f: for row in rows: doc_id = row.doc_id doc_content = m.sub(u' ', row.doc_content.decode('utf-8')) f.write(u'{}\t{}\n'.format(doc_id, doc_content).encode('utf-8')) rows = self.db().select(self.db.corpus.doc_id, self.db.corpus.doc_content, orderby = self.db.corpus.doc_index) WriteFile(rows) def ExportToSpreadsheet(self, filename, is_csv = False): """ filename = A tab- or comma-separated spreadsheet (utf-8 encoded, with header) containing the text corpus and all metadata """ field_names = [ row.field_name for row in self.db().select(self.db.fields.field_name, orderby = self.db.fields.field_index) ] field_count = len(field_names) all_field_names = [ 'doc_id', 'doc_content' ] + field_names def WriteCSV(rows): m = re.compile(r'\s+') with open(filename, 'w') as f: writer = UnicodeWriter(f) writer.writerow(all_field_names) for row in rows: doc_index = row.doc_index doc_id = row.doc_id doc_content = m.sub(u' ', row.doc_content.decode('utf-8')) values = [u''] * field_count for d in self.db(self.db.metadata.doc_index == doc_index).select(self.db.metadata.field_index, self.db.metadata.value): values[d.field_index] = m.sub(u' ', d.value.decode('utf-8')) all_values = [ doc_id, doc_content ] + values writer.writerow(all_values) def WriteTSV(rows): m = 
re.compile(r'\s+') with open(filename, 'w') as f: f.write(u'{}\n'.format(u'\t'.join(all_field_names)).encode('utf-8')) for row in rows: doc_index = row.doc_index doc_id = row.doc_id doc_content = m.sub(u' ', row.doc_content.decode('utf-8')) values = [u''] * field_count for d in self.db(self.db.metadata.doc_index == doc_index).select(self.db.metadata.field_index, self.db.metadata.value): values[d.field_index] = m.sub(u' ', d.value.decode('utf-8')) all_values = [ doc_id, doc_content ] + values f.write(u'{}\n'.format(u'\t'.join(all_values)).encode('utf-8')) rows = self.db().select(self.db.corpus.doc_index, self.db.corpus.doc_id, self.db.corpus.doc_content, orderby = self.db.corpus.doc_index) if is_csv: WriteCSV(rows) else: WriteTSV(rows) ################################################################################ def DefineTermStatsTables(self): self.db.define_table( 'term_texts', Field( 'term_index', 'integer', required = True, unique = True, default = -1 ), Field( 'term_text' , 'string' , required = True, unique = True ), migrate = self.isInit ) if self.isInit: self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_text_value ON term_texts (term_text);' ) self.db.define_table( 'term_freqs', Field( 'term_index', 'integer', required = True, unique = True, default = -1 ), Field( 'value', 'double' , required = True ), Field( 'rank' , 'integer', required = True ), migrate = self.isInit ) if self.isInit: self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_freqs_value ON term_freqs (value);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_freqs_rank ON term_freqs (rank);' ) self.db.define_table( 'term_probs', Field( 'term_index', 'integer', required = True, unique = True, default = -1 ), Field( 'value', 'double' , required = True ), Field( 'rank' , 'integer', required = True ), migrate = self.isInit ) if self.isInit: self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_probs_value ON term_probs (value);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS 
term_probs_rank ON term_probs (rank);' ) self.db.define_table( 'term_doc_freqs', Field( 'term_index', 'integer', required = True, unique = True, default = -1 ), Field( 'value', 'double' , required = True ), Field( 'rank' , 'integer', required = True ), migrate = self.isInit ) if self.isInit: self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_doc_freqs_value ON term_doc_freqs (value);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_doc_freqs_rank ON term_doc_freqs (rank);' ) def DefineTermCoStatsTables(self): self.db.define_table( 'term_co_freqs', Field( 'first_term_index' , 'integer', required = True, default = -1 ), Field( 'second_term_index', 'integer', required = True, default = -1 ), Field( 'value', 'double' , required = True ), Field( 'rank' , 'integer', required = True ), migrate = self.isInit ) if self.isInit: self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS term_co_freqs_indexes ON term_co_freqs (first_term_index, second_term_index);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_co_freqs_value ON term_co_freqs (value);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_co_freqs_rank ON term_co_freqs (rank);' ) self.db.define_table( 'term_co_probs', Field( 'first_term_index' , 'integer', required = True, default = -1 ), Field( 'second_term_index', 'integer', required = True, default = -1 ), Field( 'value', 'double' , required = True ), Field( 'rank' , 'integer', required = True ), migrate = self.isInit ) if self.isInit: self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS term_co_probs_indexes ON term_co_probs (first_term_index, second_term_index);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_co_probs_value ON term_co_probs (value);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_co_probs_rank ON term_co_probs (rank);' ) self.db.define_table( 'term_g2', Field( 'first_term_index' , 'integer', required = True, default = -1 ), Field( 'second_term_index', 'integer', required = True, default = -1 ), Field( 
'value', 'double' , required = True ), Field( 'rank' , 'integer', required = True ), migrate = self.isInit ) if self.isInit: self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS term_g2_indexes ON term_g2 (first_term_index, second_term_index);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_g2_value ON term_g2 (value);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS term_g2_rank ON term_g2 (rank);' ) def DefineSentenceCoStatsTables(self): self.db.define_table( 'sentences_co_freqs', Field( 'first_term_index' , 'integer', required = True, default = -1 ), Field( 'second_term_index', 'integer', required = True, default = -1 ), Field( 'value', 'double' , required = True ), Field( 'rank' , 'integer', required = True ), migrate = self.isInit ) if self.isInit: self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS sentences_co_freqs_indexes ON sentences_co_freqs (first_term_index, second_term_index);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS sentences_co_freqs_value ON sentences_co_freqs (value);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS sentences_co_freqs_rank ON sentences_co_freqs (rank);' ) self.db.define_table( 'sentences_co_probs', Field( 'first_term_index' , 'integer', required = True, default = -1 ), Field( 'second_term_index', 'integer', required = True, default = -1 ), Field( 'value', 'double' , required = True ), Field( 'rank' , 'integer', required = True ), migrate = self.isInit ) if self.isInit: self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS sentences_co_probs_indexes ON sentences_co_probs (first_term_index, second_term_index);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS sentences_co_probs_value ON sentences_co_probs (value);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS sentences_co_probs_rank ON sentences_co_probs (rank);' ) self.db.define_table( 'sentences_g2', Field( 'first_term_index', 'integer', required = True, default = -1 ), Field( 'second_term_index', 'integer', required = True, default = -1 ), Field( 
'value', 'double' , required = True ), Field( 'rank' , 'integer', required = True ), migrate = self.isInit ) if self.isInit: self.db.executesql( 'CREATE UNIQUE INDEX IF NOT EXISTS sentences_g2_indexes ON sentences_g2 (first_term_index, second_term_index);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS sentences_g2_value ON sentences_g2 (value);' ) self.db.executesql( 'CREATE INDEX IF NOT EXISTS sentences_g2_rank ON sentences_g2 (rank);' ) ################################################################################ def Reset(self): self.db.executesql( 'DELETE FROM term_texts;' ) self.db.executesql( 'DELETE FROM term_freqs;' ) self.db.executesql( 'DELETE FROM term_probs;' ) self.db.executesql( 'DELETE FROM term_doc_freqs;' ) self.db.executesql( 'DELETE FROM term_co_freqs;' ) self.db.executesql( 'DELETE FROM term_co_probs;' ) self.db.executesql( 'DELETE FROM term_g2;' ) self.db.executesql( 'DELETE FROM sentences_co_freqs;' ) self.db.executesql( 'DELETE FROM sentences_co_probs;' ) self.db.executesql( 'DELETE FROM sentences_g2;' ) ################################################################################ def DefineTemporaryTable(self): self.db.define_table( 'vocab', Field( 'term_index', 'integer', required = True, unique = True, default = -1 ), Field( 'term_text', 'string', required = True ), migrate = self.isInit ) self.db.define_table( 'vocab_text', Field( 'term_text', 'string', required = True, unique = True, default = -1 ), migrate = self.isInit )
class BOW_DB():
    """
    Context-managed wrapper around the SQLite database file 'bow.db', accessed
    through a DAL connection.

    Entering the context defines the 'options' table, the per-term statistics
    tables, the term-pair statistics tables (both the 'term_*' and the
    'sentences_*' families) and the temporary vocabulary tables.  Leaving the
    context commits the connection.

    Typical use:

        with BOW_DB(path, isInit = True) as bow_db:
            bow_db.SetOption('max_freq_count', 2000)
    """

    FILENAME = 'bow.db'
    CONNECTION = 'sqlite://{}'.format(FILENAME)
    DEFAULT_OPTIONS = {
        'max_freq_count' : 4000,        # Maximum number of terms to store
        'max_co_freq_count' : 100000    # Maximum number of term pairs to store
    }

    # Tables that share the (term_index, value, rank) layout.
    _TERM_STATS_TABLES = ( 'term_freqs', 'term_probs', 'term_doc_freqs' )
    # Tables that share the (first_term_index, second_term_index, value, rank) layout.
    _TERM_CO_STATS_TABLES = ( 'term_co_freqs', 'term_co_probs', 'term_g2' )
    _SENTENCE_CO_STATS_TABLES = ( 'sentences_co_freqs', 'sentences_co_probs', 'sentences_g2' )
    # Tables emptied by Reset(), in the order the DELETE statements are issued.
    _RESET_TABLES = ( 'term_texts', ) + _TERM_STATS_TABLES + _TERM_CO_STATS_TABLES + _SENTENCE_CO_STATS_TABLES

    def __init__(self, path = None, isInit = False):
        """
        Open the database connection.

        path   = Folder handed to DAL as 'folder'; when None, no folder
                 argument is passed and DAL uses its own default.
        isInit = When True, tables are defined eagerly (lazy_tables is off),
                 migration is enabled, indexes are created, and the default
                 options overwrite any stored values.
        """
        self.isInit = isInit
        dal_options = {
            'lazy_tables' : not self.isInit,
            'migrate_enabled' : self.isInit
        }
        # Only forward 'folder' when the caller supplied one.
        if path is not None:
            dal_options['folder'] = path
        self.db = DAL(BOW_DB.CONNECTION, **dal_options)

    def __enter__(self):
        """Define every table on the connection and return this object."""
        self.DefineOptionsTable()
        self.DefineTermStatsTables()
        self.DefineTermCoStatsTables()
        self.DefineSentenceCoStatsTables()
        self.DefineTemporaryTable()
        return self

    def __exit__(self, type, value, traceback):
        """
        Commit the connection.

        The commit is unconditional: it also runs when the with-body raised.
        Nothing is returned, so such an exception still propagates.
        """
        self.db.commit()

################################################################################

    def DefineOptionsTable(self):
        """
        Define the 'options' key/value table and store DEFAULT_OPTIONS.

        Stored values are overwritten by the defaults only when isInit is set;
        otherwise a default is inserted only if its key is not present yet.
        """
        self.db.define_table( 'options',
            Field( 'key'  , 'string', required = True, unique = True ),
            Field( 'value', 'string', required = True ),
            migrate = self.isInit
        )
        # items() rather than iteritems(): same behavior on Python 2, and it
        # also exists on Python 3.
        for key, value in BOW_DB.DEFAULT_OPTIONS.items():
            self.SetOption( key, value, overwrite = self.isInit )

    def SetOption(self, key, value, overwrite = True):
        """
        Store an option and commit.

        key       = Option name.
        value     = Option value (the column is declared as 'string').
        overwrite = When the key already exists, update it only if this is
                    true; a missing key is always inserted.
        """
        where = self.db.options.key == key
        if self.db( where ).count() > 0:
            if overwrite:
                self.db( where ).update( value = value )
        else:
            self.db.options.insert( key = key, value = value )
        self.db.commit()

    def GetOption(self, key):
        """
        Return the stored value for 'key', or None when no row matches.

        NOTE(review): the column is declared as 'string', so numeric defaults
        presumably come back as strings -- confirm against callers.
        """
        where = self.db.options.key == key
        keyValue = self.db( where ).select( self.db.options.value ).first()
        return keyValue.value if keyValue else None

################################################################################

    def _create_indexes(self, *statements):
        # Run the given CREATE INDEX statements, but only when initializing.
        if self.isInit:
            for statement in statements:
                self.db.executesql( statement )

    def _define_term_stats_table(self, name):
        # Define a (term_index, value, rank) table plus its value/rank indexes.
        self.db.define_table( name,
            Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
            Field( 'value', 'double' , required = True ),
            Field( 'rank' , 'integer', required = True ),
            migrate = self.isInit
        )
        self._create_indexes(
            'CREATE INDEX IF NOT EXISTS {0}_value ON {0} (value);'.format(name),
            'CREATE INDEX IF NOT EXISTS {0}_rank ON {0} (rank);'.format(name)
        )

    def _define_term_pair_stats_table(self, name):
        # Define a (first_term_index, second_term_index, value, rank) table
        # plus its unique pair index and its value/rank indexes.
        self.db.define_table( name,
            Field( 'first_term_index' , 'integer', required = True, default = -1 ),
            Field( 'second_term_index', 'integer', required = True, default = -1 ),
            Field( 'value', 'double' , required = True ),
            Field( 'rank' , 'integer', required = True ),
            migrate = self.isInit
        )
        self._create_indexes(
            'CREATE UNIQUE INDEX IF NOT EXISTS {0}_indexes ON {0} (first_term_index, second_term_index);'.format(name),
            'CREATE INDEX IF NOT EXISTS {0}_value ON {0} (value);'.format(name),
            'CREATE INDEX IF NOT EXISTS {0}_rank ON {0} (rank);'.format(name)
        )

    def DefineTermStatsTables(self):
        """
        Define 'term_texts' (term_index <-> term_text) and the per-term
        statistics tables 'term_freqs', 'term_probs' and 'term_doc_freqs'.
        Indexes are created only when isInit is set.
        """
        self.db.define_table( 'term_texts',
            Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
            Field( 'term_text' , 'string' , required = True, unique = True ),
            migrate = self.isInit
        )
        # The index name is kept verbatim ('term_text_value'): existing
        # databases were created with it.
        self._create_indexes(
            'CREATE INDEX IF NOT EXISTS term_text_value ON term_texts (term_text);'
        )
        for name in BOW_DB._TERM_STATS_TABLES:
            self._define_term_stats_table( name )

    def DefineTermCoStatsTables(self):
        """
        Define the term-pair tables 'term_co_freqs', 'term_co_probs' and
        'term_g2'.  Indexes are created only when isInit is set.
        """
        for name in BOW_DB._TERM_CO_STATS_TABLES:
            self._define_term_pair_stats_table( name )

    def DefineSentenceCoStatsTables(self):
        """
        Define the term-pair tables 'sentences_co_freqs', 'sentences_co_probs'
        and 'sentences_g2'.  Indexes are created only when isInit is set.
        """
        for name in BOW_DB._SENTENCE_CO_STATS_TABLES:
            self._define_term_pair_stats_table( name )

################################################################################

    def Reset(self):
        """
        Delete every row from the term and term-pair statistics tables.

        The 'options', 'vocab' and 'vocab_text' tables are left untouched, and
        no commit is issued here.
        """
        for name in BOW_DB._RESET_TABLES:
            self.db.executesql( 'DELETE FROM {};'.format(name) )

################################################################################

    def DefineTemporaryTable(self):
        """Define the 'vocab' and 'vocab_text' tables."""
        self.db.define_table( 'vocab',
            Field( 'term_index', 'integer', required = True, unique = True, default = -1 ),
            Field( 'term_text', 'string', required = True ),
            migrate = self.isInit
        )
        # NOTE(review): default = -1 on a string column looks like a copy-paste
        # from the integer index columns; kept unchanged -- confirm before
        # altering the schema.
        self.db.define_table( 'vocab_text',
            Field( 'term_text', 'string', required = True, unique = True, default = -1 ),
            migrate = self.isInit
        )