def updateCollectionIndex(collectionName,incertpath,certpath,verbose=False):
    """Incrementally update the query database.

    For a given collection with name NAME, this creates (or updates) the
    associated collection __NAME__SLICES__.

    Documents in this slice collection have the following keys:
        'q' = list of pymongo simple queries (with keys in collection.sliceCols)
              corresponding to an identical slice
        'h' = hash of concatenated record ID lists of all records in the slice
        'v' = versionNumber of the document
        'o' = originalVersion where this slice appears
        'd' = whether this slice is deleted after this version

    The incrementation logic is:

    Add "new" queries -- as measured by having __originalIndex__ > atVersion,
    where atVersion = maximum existing version of a slice (fast to evaluate) --
    and set the resulting records to have the 'v' key and 'o' = currentNumber.

    Set delete keys on all deleted queries, where "deleted" is measured by:
        -- being in version < current, and NOT having a __retained__ key
        -- not actually existing in the current version (this needs to be
           checked to handle the situation of something that has been deleted
           and then re-added)
    """
    S = ourSolr.query('collectionName:' + collectionName,fl='versionNumber',sort='versionNumber desc',wt='json')
    existing_slice = ast.literal_eval(S)['response']['docs']
    if len(existing_slice) > 0:
        atVersion = existing_slice[0]['versionNumber'][0]
    else:
        atVersion = -1

    collection = Collection(collectionName,attachMetadata=True)
    currentVersion = collection.currentVersion
    sliceDB = collection.slices

    slicecount = sliceDB.find({'original':{'$gt':atVersion},'version':currentVersion}).count()
    block_size = 50000

    MakeDir(certpath)
    if slicecount < block_size:
        add_slices(collection,collectionName,currentVersion,atVersion,0,None)
    else:
        try:
            import starflow.grid as grid
        except ImportError:
            add_slices(collection,collectionName,currentVersion,atVersion,0,None)
        else:
            num_blocks = int(math.ceil(float(slicecount)/block_size))
            jobdescrs = [{'argstr': "import backend.indexing as I; I.add_slices(" +
                                    ", ".join([repr(x) for x in [collectionName, currentVersion,
                                                                 atVersion, block_size*i, block_size]]) + ")",
                          'outfile': certpath + str(i),
                          'name': 'Index' + collectionName + '_' + str(i)} for i in range(num_blocks)]
            retvals = grid.submitJobs(jobdescrs)

    delete_slices(sliceDB,currentVersion,atVersion)
    createCertificate(certpath + 'final.txt','Collection ' + collectionName + ' indexed.')
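
# Illustrative only: a sketch of what a single document in a __NAME__SLICES__
# collection looks like, following the key descriptions in the docstring of
# updateCollectionIndex above.  The query dicts, hash string, and version
# numbers below are made-up example values, not real data, and the concrete
# key names stored by add_slices may differ from this shorthand.
EXAMPLE_SLICE_DOCUMENT = {
    'q': [{'State': 'NY'}, {'State': 'NY', 'Year': '2009'}],   # simple queries picking out the same slice
    'h': 'd41d8cd98f00b204e9800998ecf8427e',                   # hash of the concatenated record ID lists
    'v': 3,                                                    # versionNumber of the document
    'o': 1,                                                    # originalVersion where this slice appears
    'd': False                                                 # whether the slice is deleted after this version
}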
def download_check(download_dir, incremental, certpath):
    if not incremental:
        check_list = [download_dir]
    else:
        check_list = get_increment_paths(download_dir)
    assert all(['__PARSE__' in listdir(p) for p in check_list])
    createCertificate(certpath,'Collection properly downloaded and pre-parsed.')
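
# A minimal sketch -- an assumption, not the actual implementation -- of what
# get_increment_paths is expected to return, inferred from how increment_format
# and get_max_increment are used in updateCollection below: one subdirectory
# per downloaded increment of download_dir, each of which must contain a
# __PARSE__ directory for download_check to pass.
def get_increment_paths_sketch(download_dir):
    return [increment_format(download_dir, i) for i in range(get_max_increment(download_dir) + 1)]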
def updateCollection(download_dir,collectionName,parserClass,checkpath,certpath,parserArgs=None,parserKwargs=None,incremental=False):
    connection = pm.Connection(document_class=pm.son.SON)
    source_metadata = get_source_data(collectionName)
    db = connection['govdata']
    assert not '__' in collectionName, 'collectionName must not contain consecutive underscores'
    metaCollectionName = '__' + collectionName + '__'
    versionName = '__' + collectionName + '__VERSIONS__'
    sliceDBName = '__' + collectionName + '__SLICES__'
    collection = db[collectionName]
    metacollection = db[metaCollectionName]
    versions = db[versionName]
    sliceDB = db[sliceDBName]

    if incremental:
        if versionName not in db.collection_names():
            startInc = 0
        else:
            startInc = get_max_increment_fromDB(versions) + 1
        endInc = get_max_increment(download_dir)
        sources = [increment_format(download_dir,i) for i in range(startInc,endInc + 1)]
    else:
        sources = [download_dir]
        startInc = endInc = None

    if parserArgs is None:
        parserArgs = ()
    if parserKwargs is None:
        parserKwargs = {}

    if sources:
        iterator = parserClass(sources[0],*parserArgs,**parserKwargs)
        iterator.set_source_metadata(source_metadata)
        uniqueIndexes = iterator.uniqueIndexes
        ColumnGroups = iterator.columnGroups
        sliceColTuples = getSliceColTuples(iterator.sliceCols)
        sliceColTuplesFlat = uniqify([tuple(sorted(uniqify(Flatten(sct)))) for sct in sliceColTuples])
        sliceColList = uniqify(Flatten(ListUnion(sliceColTuples)))
        ContentCols = set(sliceColList + getContentCols(iterator))
        if hasattr(iterator,'dateFormat'):
            TimeFormatter = td.mongotimeformatter(iterator.dateFormat)

        if collectionName in db.collection_names():
            versionNumber = max(versions.distinct('versionNumber')) + 1
            storedAllMetadata = metacollection.find_one({'name':'','versionNumber':versionNumber - 1})
            totalVariables = storedAllMetadata['columns']
            VarMap = dict(zip(totalVariables,[str(x) for x in range(len(totalVariables))]))
            # check that the stored metadata matches, and check consistency across all sources
        else:
            versionNumber = 0
            IndexCols = uniqify([x for x in ['subcollections'] + sliceColList +
                                 ListUnion([ColGroupsFlatten(ColumnGroups,k) for k in
                                            ['indexColumns','labelColumns','timeColumns','spaceColumns']])
                                 if x not in uniqueIndexes])
            totalVariables = SPECIAL_KEYS + uniqueIndexes + IndexCols
            assert not any(['.' in x or ('__' in x and x not in SPECIAL_KEYS) or x in ColumnGroups.keys() for x in totalVariables])
            VarMap = dict(zip(totalVariables,map(str,range(len(totalVariables)))))
            cols = zip([VarMap[c] for c in uniqueIndexes + ['__versionNumber__']],[pm.DESCENDING]*(len(uniqueIndexes) + 1))
            collection.ensure_index(cols,unique=True,dropDups=True)
            for col in IndexCols:
                collection.ensure_index(VarMap[col])
            sliceDB.ensure_index('slice',unique=True,dropDups=True)

        vNInd = VarMap['__versionNumber__']
        retInd = VarMap['__retained__']
        specialKeyInds = [VarMap[k] for k in SPECIAL_KEYS]

        if 'timeColumns' in iterator.columnGroups.keys():
            tcs = iterator.columnGroups['timeColumns']
        else:
            tcs = []
        if 'spaceColumns' in iterator.columnGroups.keys():
            spcs = iterator.columnGroups['spaceColumns']
        else:
            spcs = []

        toParse = ListUnion([RecursiveFileList(source + '__PARSE__') for source in sources])

        oldc = None
        SpaceCache = {}
        volumes = {'':0}
        dimensions = {'':[]}
        times = {'':[]}
        locations = {'':[]}
        varFormats = {}
        for file in toParse:
            iterator.refresh(file)
            checkMetadata(iterator)
            tcs = iterator.columnGroups.get('timeColumns',[])
            spcs = iterator.columnGroups.get('spaceColumns',[])
            index = 0
            for c in iterator:
                newVars = [x for x in c.keys() if not x in totalVariables]
                assert not any(['__' in x or '.' in x or x in ColumnGroups.keys() for x in newVars]), '__ and . must not appear in key names.'
                totalVariables += newVars
                VarMap.update(dict(zip(newVars,map(str,range(len(totalVariables) - len(newVars),len(totalVariables))))))
                for tc in tcs:          # time handling
                    if tc in c.keys():
                        c[tc] = TimeFormatter(c[tc])
                if COMPLETE_SPACE:
                    for spc in spcs:    # space handling
                        if spc in c.keys():
                            t = getT(c[spc])
                            if t in SpaceCache.keys():
                                c[spc] = SpaceCache[t].copy()
                            else:
                                c[spc] = loc.SpaceComplete(c[spc])
                                SpaceCache[t] = c[spc].copy()
                if index % 100 == 0:
                    print 'At', index
                index += 1
                sctf = processSct(sliceColTuplesFlat,oldc,c)
                processRecord(c,collection,VarMap,totalVariables,uniqueIndexes,versionNumber,specialKeyInds,incremental,sliceDB,sctf,ContentCols)
                incrementThings(c,volumes,dimensions,times,locations,varFormats,tcs,spcs)
                oldc = c

        any_deleted = False
        if incremental:
            collection.update({vNInd:{'$lte': versionNumber - 1}},{'$set':{vNInd:versionNumber}})
            sliceDB.update({},{'$set':{'version':versionNumber}})
        else:
            deleted = collection.find({vNInd:versionNumber - 1, retInd:{'$exists':False}})
            for d in deleted:
                any_deleted = True
                sliceDelete(d,collection,sliceColTuples,VarMap,sliceDB,versionNumber)

        if any_deleted:
            subColInd = str(totalVariables.index('subcollections'))
            subcols = [''] + uniqify(ListUnion(collection.distinct(subColInd)))
            for sc in subcols:
                volumes[sc] = collection.find({subColInd:sc}).count()
                dimensions[sc] = [k for k in totalVariables if collection.find_one({subColInd:sc,str(totalVariables.index(k)):{'$exists':True}})]
                times[sc] = ListUnion([collection.find({subColInd:sc}).distinct(t) for t in tcs])
                locations[sc] = ListUnion([collection.find({subColInd:sc}).distinct(t) for t in spcs])

        updateMetacollection(iterator,metacollection,incremental,versionNumber,totalVariables,tcs,spcs,volumes,dimensions,times,locations,varFormats)
        updateAssociatedFiles(sources,collection)
        updateVersionHistory(versionNumber,versions,startInc,endInc)
        updateSourceDBFromCollections(collectionNames=[collectionName])

    connection.disconnect()
    createCertificate(certpath,'Collection ' + collectionName + ' written to DB.')
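
# Illustrative usage only.  The parser class and paths below are hypothetical
# placeholders, not part of this module; a real parser passed to
# updateCollection must provide the attributes used above (uniqueIndexes,
# columnGroups, sliceCols, set_source_metadata, refresh) and yield record
# dictionaries when iterated.
#
#   from parsers.myagency import MyAgencyParser          # hypothetical parser module
#   updateCollection('../Data/MyAgency/',                # download_dir containing a __PARSE__ subdirectory
#                    'MyAgencyData',                      # collection name (no double underscores)
#                    MyAgencyParser,
#                    '../Checkpoints/MyAgencyData.txt',   # checkpath
#                    '../Certificates/MyAgencyData_db.txt',
#                    incremental=False)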