コード例 #1
0
def create_maindb(dbId, dbBaseDir):
    """Build the article database `dbId` from a directory tree.

    Look for any subdirs (arbitrary level under dbBaseDir) whose name starts
    with '#'. From each such subdir, load ./meta.xml (via
    `utils_iddb.parseMeta`) and ./fulltext.txt, create an article and insert
    it into the database opened at `common.dbFile('gensim', dbId)`.

    A subdir whose files are missing or cannot be processed is logged as a
    warning and skipped; it does not abort the run.

    Args:
        dbId: database identifier; used to derive the database file name and
            as the leading component of every article's 'id_int'.
        dbBaseDir: root directory to scan for '#'-prefixed article subdirs.

    NOTE(review): the database is opened with autocommit=False and no commit
    is visible in this block -- confirm where the data gets persisted.
    """
    dbFname = common.dbFile('gensim', dbId)
    logging.info("opening database %s", dbFname)
    db = ArticleDB.ArticleDB(dbFname, mode='override', autocommit=False)

    # `root` is normalized inside the loop, so the prefix that is stripped
    # from it must be measured on the normalized base directory as well;
    # otherwise a trailing separator in dbBaseDir shifts the slice by one.
    normBaseDir = os.path.normpath(dbBaseDir)

    proc_total = 0  # number of '#'-prefixed directories encountered
    logging.info("processing database %s, directory %s", dbId, dbBaseDir)
    for root, dirs, files in os.walk(dbBaseDir):
        root = os.path.normpath(root)
        if os.path.basename(root).startswith('#'):
            proc_total += 1
            try:
                meta = utils_iddb.parseMeta(os.path.join(root, 'meta.xml'))
                # article id = dbId + path of the article dir relative to the base
                meta['id_int'] = os.path.join(dbId, root[len(normBaseDir) + 1:])
                # close the file deterministically instead of leaking one
                # handle per article until garbage collection
                with open(os.path.join(root, 'fulltext.txt'), 'r') as fin:
                    raw = fin.read()
                # drop undecodable bytes so the stored body is valid UTF-8
                meta['body'] = unicode(raw, 'utf8', 'ignore').encode('utf8')
                meta['references'] = None  # TODO add
                art = Article.Article(record=meta)
                db.insertArticle(art)
            except Exception as e:
                logging.warning('invalid entries in %s; ignoring article (%s)', root, e)
                continue
コード例 #2
0
def createMscsDb():
    """Create MSC database of all languages.

    Walk the directory returned by `common.inputPath('')`; every directory
    whose name starts with '#' is treated as one article. Its ./meta.xml is
    parsed with `utils_iddb.parseMeta` and the resulting article is inserted
    into the database opened at `ARTS_FILE`.

    ./fulltext.txt is optional: if it cannot be read, the article is still
    inserted with 'body' set to None. Any other failure (e.g. in meta.xml)
    is logged as a warning and the article is skipped.

    NOTE(review): the database is opened with autocommit=False and no commit
    is visible in this block -- confirm where the data gets persisted.
    """
    db = ArticleDB.ArticleDB(ARTS_FILE, mode='override', autocommit=False)
    baseDir = ''
    inputDir = common.inputPath(baseDir)  # resolved once, used for log + walk

    proc_total = 0  # number of '#'-prefixed directories encountered
    logging.info("processing directory %s", inputDir)
    for root, dirs, files in os.walk(inputDir):
        root = os.path.normpath(root)
        if os.path.basename(root).startswith('#'):
            proc_total += 1
            try:
                meta = utils_iddb.parseMeta(os.path.join(root, 'meta.xml'))
                try:
                    # `with` closes the handle even if read() fails
                    with open(os.path.join(root, 'fulltext.txt')) as fin:
                        meta['body'] = fin.read()
                except Exception:
                    # best effort: a missing/unreadable fulltext is tolerated
                    meta['body'] = None
                # article id = path of the article dir below common.INPUT_PATH
                meta['id_int'] = root[len(common.INPUT_PATH) + 1:]
                meta['references'] = None  # TODO add
                art = Article.Article(record=meta)
                db.insertArticle(art)
            except Exception as e:
                logging.warning(
                    'invalid entries in %s; ignoring article (%s)', root, e)
                continue
コード例 #3
0
def create_maindb(dbId, dbBaseDir):
    """Populate the article database `dbId` from the tree under `dbBaseDir`.

    Every directory (at any depth below dbBaseDir) whose name starts with
    '#' is taken to hold one article: ./meta.xml is parsed with
    `utils_iddb.parseMeta`, ./fulltext.txt supplies the body, and the
    resulting article is inserted into the database opened at
    `common.dbFile('gensim', dbId)`.

    Directories with missing or unprocessable files are reported with a
    warning and skipped.

    Args:
        dbId: database identifier; determines the database file name and
            prefixes each article's 'id_int'.
        dbBaseDir: directory to scan recursively.

    NOTE(review): autocommit is disabled and this block never commits --
    confirm where the database is actually written out.
    """
    dbFname = common.dbFile('gensim', dbId)
    logging.info("opening database %s", dbFname)
    db = ArticleDB.ArticleDB(dbFname, mode='override', autocommit=False)

    # Length of the normalized base path plus one separator; computed from
    # the normalized form because `root` is normalized before slicing, so a
    # trailing slash in dbBaseDir must not change the offset.
    prefixLen = len(os.path.normpath(dbBaseDir)) + 1

    proc_total = 0  # number of '#'-prefixed directories encountered
    logging.info("processing database %s, directory %s", dbId, dbBaseDir)
    for root, dirs, files in os.walk(dbBaseDir):
        root = os.path.normpath(root)
        if os.path.basename(root).startswith('#'):
            proc_total += 1
            try:
                meta = utils_iddb.parseMeta(os.path.join(root, 'meta.xml'))
                meta['id_int'] = os.path.join(dbId, root[prefixLen:])
                # context manager guarantees the handle is released per article
                with open(os.path.join(root, 'fulltext.txt'), 'r') as fin:
                    raw = fin.read()
                # round-trip through unicode to discard invalid UTF-8 bytes
                meta['body'] = unicode(raw, 'utf8', 'ignore').encode('utf8')
                meta['references'] = None  # TODO add
                art = Article.Article(record=meta)
                db.insertArticle(art)
            except Exception as e:
                logging.warning(
                    'invalid entries in %s; ignoring article (%s)', root, e)
                continue
コード例 #4
0
def createMscsDb():
    """Create MSC database of all languages.

    Scans the directory given by `common.inputPath("")` recursively. Each
    directory whose name starts with "#" yields one article built from its
    ./meta.xml (parsed by `utils_iddb.parseMeta`) and inserted into the
    database opened at `ARTS_FILE`.

    The full text is optional: when ./fulltext.txt cannot be read the
    article's "body" is None. Any other error causes the article to be
    skipped with a logged warning.

    NOTE(review): autocommit is disabled and this block never commits --
    confirm where the database is actually written out.
    """
    db = ArticleDB.ArticleDB(ARTS_FILE, mode="override", autocommit=False)
    baseDir = ""
    inputDir = common.inputPath(baseDir)  # resolve once for logging and walking

    proc_total = 0  # number of "#"-prefixed directories encountered
    logging.info("processing directory %s", inputDir)
    for root, dirs, files in os.walk(inputDir):
        root = os.path.normpath(root)
        if os.path.basename(root).startswith("#"):
            proc_total += 1
            try:
                meta = utils_iddb.parseMeta(os.path.join(root, "meta.xml"))
                try:
                    # release the file handle promptly, even on a read error
                    with open(os.path.join(root, "fulltext.txt")) as fin:
                        meta["body"] = fin.read()
                except Exception:
                    # deliberate best effort: keep the article without a body
                    meta["body"] = None
                # id is the article directory's path below common.INPUT_PATH
                meta["id_int"] = root[len(common.INPUT_PATH) + 1 :]
                meta["references"] = None  # TODO add
                art = Article.Article(record=meta)
                db.insertArticle(art)
            except Exception as e:
                logging.warning("invalid entries in %s; ignoring article (%s)", root, e)
                continue