def create_maindb(dbId, dbBaseDir):
    """Build the article database for `dbId` from a directory tree.

    Walk `dbBaseDir` recursively and treat every directory whose name starts
    with '#' as one article: parse its ./meta.xml, attach the contents of
    ./fulltext.txt as the article body, and insert the resulting article into
    the database file named by common.dbFile('gensim', dbId).

    A directory for which any of these steps raises is logged as a warning
    and skipped; the walk then continues with the next directory.

    dbId -- database identifier; selects the database file and is used as the
        leading path component of every article's `id_int`.
    dbBaseDir -- root of the directory tree to scan.

    NOTE(review): the database is opened with autocommit=False and nothing in
    this function commits explicitly -- confirm that ArticleDB persists the
    inserts some other way, or add the commit here.
    """
    dbFname = common.dbFile('gensim', dbId)
    logging.info("opening database %s", dbFname)
    db = ArticleDB.ArticleDB(dbFname, mode='override', autocommit=False)

    proc_total = 0
    logging.info("processing database %s, directory %s", dbId, dbBaseDir)
    for root, _dirs, _files in os.walk(dbBaseDir):
        root = os.path.normpath(root)
        if not os.path.basename(root).startswith('#'):
            continue
        proc_total += 1
        try:
            meta = utils_iddb.parseMeta(os.path.join(root, 'meta.xml'))

            # `root` has been normalised but `dbBaseDir` has not, so slicing
            # by len(dbBaseDir) truncates the id whenever the caller passes a
            # trailing slash, '.', or doubled separators. relpath is immune.
            relDir = os.path.relpath(root, dbBaseDir)
            if relDir == os.curdir:
                # the base directory itself is an article directory
                relDir = ''
            meta['id_int'] = os.path.join(dbId, relDir)

            # Close the file deterministically; a large tree would otherwise
            # leak one descriptor per article.
            with open(os.path.join(root, 'fulltext.txt'), 'r') as fin:
                raw = fin.read()
            # Decode leniently (undecodable bytes are dropped) and re-encode,
            # so the stored body is always valid UTF-8.
            meta['body'] = unicode(raw, 'utf8', 'ignore').encode('utf8')
            meta['references'] = None  # TODO add

            art = Article.Article(record=meta)
            db.insertArticle(art)
        except Exception as e:
            # Best effort: one broken article must not abort the whole import.
            logging.warning('invalid entries in %s; ignoring article (%s)',
                            root, e)
    logging.info("found %i article directories in %s", proc_total, dbBaseDir)
def createMscsDb():
    """Create MSC database of all languages.

    Walk the directory returned by common.inputPath('') recursively and treat
    every directory whose name starts with '#' as one article: parse its
    ./meta.xml, attach the contents of ./fulltext.txt as the body (None when
    the file cannot be read), and insert the article into the database opened
    at ARTS_FILE.

    A directory whose meta.xml cannot be parsed, or whose article cannot be
    built or inserted, is logged as a warning and skipped.

    NOTE(review): the database is opened with autocommit=False and nothing in
    this function commits explicitly -- confirm that ArticleDB persists the
    inserts some other way, or add the commit here.
    """
    db = ArticleDB.ArticleDB(ARTS_FILE, mode='override', autocommit=False)
    baseDir = ''
    inputDir = common.inputPath(baseDir)

    proc_total = 0
    logging.info("processing directory %s", inputDir)
    for root, _dirs, _files in os.walk(inputDir):
        root = os.path.normpath(root)
        if not os.path.basename(root).startswith('#'):
            continue
        proc_total += 1
        try:
            meta = utils_iddb.parseMeta(os.path.join(root, 'meta.xml'))
            try:
                # Close the file deterministically; a large tree would
                # otherwise leak one descriptor per article.
                with open(os.path.join(root, 'fulltext.txt')) as fin:
                    meta['body'] = fin.read()
            except Exception:
                # Deliberate best effort: an article without readable
                # fulltext is still stored, just with an empty body.
                meta['body'] = None

            # `root` has been normalised; slicing it by len(INPUT_PATH) would
            # truncate the id if INPUT_PATH carries a trailing slash or other
            # redundant components. relpath is immune to that.
            # NOTE(review): assumes, like the original slice did, that the
            # walked directories live under common.INPUT_PATH.
            relDir = os.path.relpath(root, common.INPUT_PATH)
            if relDir == os.curdir:
                # the input directory itself is an article directory
                relDir = ''
            meta['id_int'] = relDir
            meta['references'] = None  # TODO add

            art = Article.Article(record=meta)
            db.insertArticle(art)
        except Exception as e:
            # Best effort: one broken article must not abort the whole import.
            logging.warning('invalid entries in %s; ignoring article (%s)',
                            root, e)
    logging.info("found %i article directories in %s", proc_total, inputDir)
def create_maindb(dbId, dbBaseDir):
    """Build the article database for `dbId` from a directory tree.

    Walk `dbBaseDir` recursively and treat every directory whose name starts
    with '#' as one article: parse its ./meta.xml, attach the contents of
    ./fulltext.txt as the article body, and insert the resulting article into
    the database file named by common.dbFile('gensim', dbId).

    A directory for which any of these steps raises is logged as a warning
    and skipped; the walk then continues with the next directory.

    dbId -- database identifier; selects the database file and is used as the
        leading path component of every article's `id_int`.
    dbBaseDir -- root of the directory tree to scan.

    NOTE(review): the database is opened with autocommit=False and nothing in
    this function commits explicitly -- confirm that ArticleDB persists the
    inserts some other way, or add the commit here.
    """
    dbFname = common.dbFile('gensim', dbId)
    logging.info("opening database %s", dbFname)
    db = ArticleDB.ArticleDB(dbFname, mode='override', autocommit=False)

    proc_total = 0
    logging.info("processing database %s, directory %s", dbId, dbBaseDir)
    for root, _dirs, _files in os.walk(dbBaseDir):
        root = os.path.normpath(root)
        if not os.path.basename(root).startswith('#'):
            continue
        proc_total += 1
        try:
            meta = utils_iddb.parseMeta(os.path.join(root, 'meta.xml'))

            # `root` has been normalised but `dbBaseDir` has not, so slicing
            # by len(dbBaseDir) truncates the id whenever the caller passes a
            # trailing slash, '.', or doubled separators. relpath is immune.
            relDir = os.path.relpath(root, dbBaseDir)
            if relDir == os.curdir:
                # the base directory itself is an article directory
                relDir = ''
            meta['id_int'] = os.path.join(dbId, relDir)

            # Close the file deterministically; a large tree would otherwise
            # leak one descriptor per article.
            with open(os.path.join(root, 'fulltext.txt'), 'r') as fin:
                raw = fin.read()
            # Decode leniently (undecodable bytes are dropped) and re-encode,
            # so the stored body is always valid UTF-8.
            meta['body'] = unicode(raw, 'utf8', 'ignore').encode('utf8')
            meta['references'] = None  # TODO add

            art = Article.Article(record=meta)
            db.insertArticle(art)
        except Exception as e:
            # Best effort: one broken article must not abort the whole import.
            logging.warning('invalid entries in %s; ignoring article (%s)',
                            root, e)
    logging.info("found %i article directories in %s", proc_total, dbBaseDir)
def createMscsDb():
    """Create MSC database of all languages.

    Walk the directory returned by common.inputPath("") recursively and treat
    every directory whose name starts with "#" as one article: parse its
    ./meta.xml, attach the contents of ./fulltext.txt as the body (None when
    the file cannot be read), and insert the article into the database opened
    at ARTS_FILE.

    A directory whose meta.xml cannot be parsed, or whose article cannot be
    built or inserted, is logged as a warning and skipped.

    NOTE(review): the database is opened with autocommit=False and nothing in
    this function commits explicitly -- confirm that ArticleDB persists the
    inserts some other way, or add the commit here.
    """
    db = ArticleDB.ArticleDB(ARTS_FILE, mode="override", autocommit=False)
    baseDir = ""
    inputDir = common.inputPath(baseDir)

    proc_total = 0
    logging.info("processing directory %s", inputDir)
    for root, _dirs, _files in os.walk(inputDir):
        root = os.path.normpath(root)
        if not os.path.basename(root).startswith("#"):
            continue
        proc_total += 1
        try:
            meta = utils_iddb.parseMeta(os.path.join(root, "meta.xml"))
            try:
                # Close the file deterministically; a large tree would
                # otherwise leak one descriptor per article.
                with open(os.path.join(root, "fulltext.txt")) as fin:
                    meta["body"] = fin.read()
            except Exception:
                # Deliberate best effort: an article without readable
                # fulltext is still stored, just with an empty body.
                meta["body"] = None

            # `root` has been normalised; slicing it by len(INPUT_PATH) would
            # truncate the id if INPUT_PATH carries a trailing slash or other
            # redundant components. relpath is immune to that.
            # NOTE(review): assumes, like the original slice did, that the
            # walked directories live under common.INPUT_PATH.
            relDir = os.path.relpath(root, common.INPUT_PATH)
            if relDir == os.curdir:
                # the input directory itself is an article directory
                relDir = ""
            meta["id_int"] = relDir
            meta["references"] = None  # TODO add

            art = Article.Article(record=meta)
            db.insertArticle(art)
        except Exception as e:
            # Best effort: one broken article must not abort the whole import.
            logging.warning("invalid entries in %s; ignoring article (%s)", root, e)
    logging.info("found %i article directories in %s", proc_total, inputDir)