Example #1
0
def updateCollectionIndex(collectionName,incertpath,certpath, verbose=False):
    """Incrementally update the query database.

        For a given collection with name NAME, this creates (or updates) the
        associated collection __NAME__SLICES__.

        Documents in this slice collection have the following keys:
            'q' = list of pymongo simple queries (with keys in collection.sliceCols) corresponding to an identical slice
            'h' = hash of concatenated record ID lists of all records in the slice
            'v' = versionNumber of the document
            'o' = originalVersion where this slice appears
            'd' = whether this slice is deleted after this version

        The incrementation logic is:
            Add "new" queries -- as measured by having __originalIndex__ > atVersion,
            where atVersion = maximum existing version of a slice (fast to evaluate)
                -- and set the resulting records to have 'v' key and 'o' = currentNumber

            Set delete keys on all deleted queries, where "deleted" is measured by:
                -- being in version < current, and NOT having a __retained__ key
                -- not actually existing in the current version (this needs to be
                   checked to handle the situation of something that has been
                   deleted and then re-added)
    """

    # Ask Solr for the highest versionNumber already indexed for this
    # collection; an empty result means nothing has been indexed yet (-1).
    S = ourSolr.query('collectionName:' + collectionName,fl = 'versionNumber',sort='versionNumber desc',wt = 'json')
    existing_slice = ast.literal_eval(S)['response']['docs']

    if len(existing_slice) > 0:
        atVersion = existing_slice[0]['versionNumber'][0]
    else:
        atVersion = -1

    collection = Collection(collectionName,attachMetadata=True)
    currentVersion = collection.currentVersion
    sliceDB = collection.slices
    # Count only the slices created since the last indexed version.
    slicecount = sliceDB.find({'original':{'$gt':atVersion},'version':currentVersion}).count()
    block_size = 50000
    MakeDir(certpath)

    if slicecount < block_size:
        # Small enough to index in-process in a single pass.
        add_slices(collection,collectionName,currentVersion,atVersion,0,None)
    else:
        try:
            import starflow.grid as grid
        except ImportError:
            # No grid available: fall back to a single in-process pass.
            # BUG FIX: the original referenced the misspelled name
            # 'currentVerison', which raised NameError on this path.
            add_slices(collection,collectionName,currentVersion,atVersion,0,None)
        else:
            # Fan the work out across the grid in block_size-sized chunks.
            num_blocks = int(math.ceil(float(slicecount)/block_size))
            jobdescrs = [{'argstr': "import backend.indexing as I; I.add_slices(" + ", ".join([repr(x) for x in [collectionName, currentVersion,atVersion, block_size*i, block_size]]) + ")",'outfile': certpath + str(i),'name': 'Index' + collectionName + '_' + str(i)} for i in range(num_blocks)]
            retvals = grid.submitJobs(jobdescrs)

    delete_slices(sliceDB,currentVersion,atVersion)

    createCertificate(certpath + 'final.txt','Collection ' + collectionName + ' indexed.')
Example #2
0
def download_check(download_dir, incremental, certpath):
    """Verify every downloaded path has been pre-parsed, then write a certificate.

    Raises AssertionError when any checked directory lacks a '__PARSE__' entry.
    """
    # Incremental downloads live in per-increment subdirectories; a full
    # download is checked as one directory.
    if incremental:
        paths_to_check = get_increment_paths(download_dir)
    else:
        paths_to_check = [download_dir]

    for path in paths_to_check:
        assert '__PARSE__' in listdir(path)

    createCertificate(certpath,'Collection properly downloaded and pre-parsed.')
Example #3
0
def updateCollection(download_dir,collectionName,parserClass,checkpath,certpath,parserArgs=None,parserKwargs=None,incremental=False):
    
    connection =  pm.Connection(document_class=pm.son.SON)
    
    source_metadata = get_source_data(collectionName)
    
    db = connection['govdata']
    assert not '__' in collectionName, 'collectionName must not contain consecutive underscores'
    metaCollectionName = '__' + collectionName + '__'
    versionName = '__' + collectionName + '__VERSIONS__'
    sliceDBName =  '__' + collectionName + '__SLICES__'
    
    collection = db[collectionName]
    metacollection = db[metaCollectionName]
    versions = db[versionName]     
    sliceDB = db[sliceDBName]
            
    if incremental:     
        if versionName not in db.collection_names():
            startInc = 0
        else:
            startInc = get_max_increment_fromDB(versions) + 1
        endInc = get_max_increment(download_dir)
        sources = [increment_format(download_dir,i) for i in range(startInc,endInc + 1)]
    else:
        sources = [download_dir]
        startInc = endInc = None
        
    if parserArgs == None:
        parserArgs = ()
    if parserKwargs == None:
        parserKwargs = {}
        
    if sources:
        iterator = parserClass(sources[0],*parserArgs,**parserKwargs)
        iterator.set_source_metadata(source_metadata)
    
        uniqueIndexes = iterator.uniqueIndexes
        ColumnGroups = iterator.columnGroups
        
        sliceColTuples = getSliceColTuples(iterator.sliceCols)
        sliceColTuplesFlat = uniqify([tuple(sorted(uniqify(Flatten(sct)))) for sct in sliceColTuples])
      
        sliceColList = uniqify(Flatten(ListUnion(sliceColTuples)))
        ContentCols = set(sliceColList + getContentCols(iterator))
            
        if hasattr(iterator,'dateFormat'):
            TimeFormatter = td.mongotimeformatter(iterator.dateFormat)
            
    
        if collectionName in db.collection_names():
            versionNumber = max(versions.distinct('versionNumber')) + 1
            storedAllMetadata = metacollection.find_one({'name':'','versionNumber':versionNumber-1})
            totalVariables = storedAllMetadata['columns']
            VarMap = dict(zip(totalVariables,[str(x) for x in range(len(totalVariables))]))   
            
            #check things are the same 
            #and check consistent  do so for all soruces
            
        else:
            versionNumber = 0
            IndexCols = uniqify([x for x in ['subcollections'] + sliceColList + ListUnion([ColGroupsFlatten(ColumnGroups,k) for k in ['indexColumns','labelColumns','timeColumns','spaceColumns']]) if x not in uniqueIndexes])
            
            totalVariables = SPECIAL_KEYS + uniqueIndexes + IndexCols
            
            assert not any(['.' in x or ('__' in x and x not in SPECIAL_KEYS) or x in ColumnGroups.keys() for x in totalVariables])
            
            VarMap = dict(zip(totalVariables,map(str,range(len(totalVariables)))))  
            
            cols = zip([VarMap[c] for c in uniqueIndexes + ['__versionNumber__']],[pm.DESCENDING]*(len(uniqueIndexes) + 1))
            collection.ensure_index(cols,unique=True,dropDups=True)
    
            for col in IndexCols:
                collection.ensure_index(VarMap[col])
            
            sliceDB.ensure_index('slice',unique=True,dropDups=True)
                            
        vNInd = VarMap['__versionNumber__']
        retInd = VarMap['__retained__']
        
        specialKeyInds = [VarMap[k] for k in SPECIAL_KEYS]
    
        if 'timeColumns' in iterator.columnGroups.keys():
            tcs = iterator.columnGroups['timeColumns']
        else:
            tcs = []
        
        if 'spaceColumns' in iterator.columnGroups.keys():
            spcs = iterator.columnGroups['spaceColumns']
        else:
            spcs = []
                  
        toParse = ListUnion([RecursiveFileList(source + '__PARSE__') for source in sources])
            
        oldc = None
        SpaceCache = {}    
        volumes = {'':0} 
        dimensions = {'':[]}
        times = {'':[]}
        locations = {'':[]}
        varFormats = {}
        for file in toParse:
            iterator.refresh(file)
            checkMetadata(iterator)
            tcs = iterator.columnGroups.get('timeColumns',[])
            spcs = iterator.columnGroups.get('spaceColumns',[])
            index = 0
            for c in iterator: 
                newVars = [x for x in c.keys() if not x in totalVariables]
                assert not any (['__' in x or '.' in x or x in ColumnGroups.keys() for x in newVars]) , '__ and . must not appear in key names.'     
                totalVariables += newVars
                VarMap.update(dict(zip(newVars,map(str,range(len(totalVariables) - len(newVars),len(totalVariables))))))
                
                for tc in tcs:   #time handling 
                    if tc in c.keys():
                        c[tc] = TimeFormatter(c[tc])
                if COMPLETE_SPACE:        
                    for spc in spcs:
                        if spc in c.keys():   #space
                            t = getT(c[spc])
                            if t in SpaceCache.keys():
                                c[spc] = SpaceCache[t].copy()
                            else:
                                c[spc] = loc.SpaceComplete(c[spc])
                                SpaceCache[t] = c[spc].copy()      
                if index % 100 == 0:
                    print 'At', index
                index += 1
                sctf = processSct(sliceColTuplesFlat,oldc,c)
                processRecord(c,collection,VarMap,totalVariables,uniqueIndexes,versionNumber,specialKeyInds,incremental,sliceDB,sctf,ContentCols)
                incrementThings(c,volumes,dimensions,times,locations,varFormats,tcs,spcs)
                
                oldc = c
                
        any_deleted = False                
        if incremental:
            collection.update({vNInd:{'$lte': versionNumber - 1}}, {'$set':{vNInd:versionNumber}})                    
            sliceDB.update({},{'$set':{'version':versionNumber}})  
   
        else:
            deleted = collection.find({vNInd:versionNumber - 1, retInd : {'$exists':False}})
            for d in deleted:
                any_deleted = True
                sliceDelete(d,collection,sliceColTuples,VarMap,sliceDB,version)
                                   
        if any_deleted:
            subColInd = str(totalVariables.index('Subcollections'))
            subcols = [''] + uniqify(ListUnion(collection.distinct(subColInd)))
            for sc in subcols:
                volumes[sc] = collection.find({subColInd:sc}).count()
                dimensions[sc] = [k for k in totalVariables if collection.find_one({subColInd:sc,str(totalVariables.index(k)):{'$exists':True}})]
                times[sc] = ListUnion([collection.find({subColInd:sc}).distinct(t) for t in tcs])
                locations[sc] = ListUnion([collection.find({subColInd:sc}).distinct(t) for t in spcs])
                
               
        updateMetacollection(iterator,metacollection,incremental,versionNumber,totalVariables,tcs,spcs,volumes,dimensions,times,locations,varFormats)
        
        updateAssociatedFiles(sources,collection)
        
        updateVersionHistory(versionNumber,versions,startInc,endInc)
    
    updateSourceDBFromCollections(collectionNames = [collectionName])
    connection.disconnect()
    createCertificate(certpath,'Collection ' + collectionName + ' written to DB.')