コード例 #1
0
ファイル: extractor.py プロジェクト: HelderSi/pypln
def scan_gridfs(db,host):
    """
    Scan the GridFS collection of a database and group its files by mimetype.

    :param db: name of the database; used to index the connection and also
        passed to ``FS``.
    :param host: accepted for interface compatibility but not used here; the
        connection below is hard-coded to 127.0.0.1.
    :return: ``defaultdict`` mapping each guessed mimetype to the list of
        ``md5`` values of the matching stored files. Files whose type cannot
        be guessed, or that have no filename, are grouped under ``None``.
    """
    #TODO: maybe it's better to identify files by ID in both these scan functions.
    # NOTE(review): `host` is ignored in favour of the literal address below;
    # confirm whether it should be honoured (FS would need the same host).
    docdict = defaultdict(list)
    files = Connection('127.0.0.1')[db].fs.files
    fs = FS(db, True)
    for f in files.find():
        # classify documents by mimetype: guess_type() wants the file name
        # (not the whole document) and returns a (type, encoding) tuple.
        filename = f.get('filename')
        mt = mimetypes.guess_type(filename)[0] if filename else None
        # NOTE(review): scan_dir reads stored files through ``fs.fs.get``;
        # confirm that FS also exposes ``get`` directly.
        doc = fs.get(f['_id'])
        docdict[mt].append(doc.md5)
    return docdict
コード例 #2
0
def scan_gridfs(db, host):
    """
    Scan the GridFS collection of a database and group its files by mimetype.

    :param db: name of the database; used to index the connection and also
        passed to ``FS``.
    :param host: accepted for interface compatibility but not used here; the
        connection below is hard-coded to 127.0.0.1.
    :return: ``defaultdict`` mapping each guessed mimetype to the list of
        ``md5`` values of the matching stored files. Files whose type cannot
        be guessed, or that have no filename, are grouped under ``None``.
    """
    #TODO: maybe it's better to identify files by ID in both these scan functions.
    # NOTE(review): `host` is ignored in favour of the literal address below;
    # confirm whether it should be honoured (FS would need the same host).
    docdict = defaultdict(list)
    files = Connection('127.0.0.1')[db].fs.files
    fs = FS(db, True)
    cursor = files.find()
    for f in cursor:
        # classify documents by mimetype: the guess is made from the stored
        # file name, and only the type half of the (type, encoding) result
        # is kept.
        filename = f.get('filename')
        if filename:
            mt = mimetypes.guess_type(filename)[0]
        else:
            mt = None
        doc = fs.get(f['_id'])
        docdict[mt].append(doc.md5)
    return docdict
コード例 #3
0
ファイル: extractor.py プロジェクト: HelderSi/pypln
def scan_dir(path, db, recurse=False):
    """
    Scans a directory, adds files to the GridFS and returns
    dictionary of files by mimetype
    """
    fs = FS(db,True)
    docdict = defaultdict(lambda:[])
    for p, dirs, files in os.walk(path):
        if not recurse:
            dirs = []
        for f in files:
            mt = mimetypes.guess_type(f)[0]
            #classify documents by mimetype
            try:
                fullpath = os.path.join(os.getcwd(),os.path.join(p, f).decode('utf8'))
            except UnicodeDecodeError:
                print "skipping: ",f
                continue
            fid = fs.add_file(fullpath)
            if fid != None:
                doc = fs.fs.get(fid)
                docdict[mt].append(doc.md5)
    return docdict
コード例 #4
0
def scan_dir(path, db, recurse=False):
    """
    Scans a directory, adds files to the GridFS and returns
    dictionary of files by mimetype
    """
    fs = FS(db, True)
    docdict = defaultdict(lambda: [])
    for p, dirs, files in os.walk(path):
        if not recurse:
            dirs = []
        for f in files:
            mt = mimetypes.guess_type(f)[0]
            #classify documents by mimetype
            try:
                fullpath = os.path.join(os.getcwd(),
                                        os.path.join(p, f).decode('utf8'))
            except UnicodeDecodeError:
                print "skipping: ", f
                continue
            fid = fs.add_file(fullpath)
            if fid != None:
                doc = fs.fs.get(fid)
                docdict[mt].append(doc.md5)
    return docdict