def getCommonDatesLocations(iterator, metadata, times, locations, dimensions, k):
    vNInd = '0'
    overallDateFormat = iterator.overallDateFormat if hasattr(iterator, 'overallDateFormat') else ''
    dateFormat = iterator.dateFormat if hasattr(iterator, 'dateFormat') else ''
    overallDate = iterator.overallDate if hasattr(iterator, 'overallDate') else ''

    # dates
    if overallDateFormat or dateFormat:
        DF = overallDateFormat + dateFormat
        F = td.mongotimeformatter(DF)
        T1 = [F(overallDate + x) for x in iterator.columnGroups['timeColNames'] if x in dimensions[k]]
        if overallDateFormat:
            reverseF = td.reverse(dateFormat)
            T2 = [F(overallDate + y) for y in map(reverseF, times[k])]
        else:
            T2 = times[k]
        mindate = min(T1 + T2)
        maxdate = max(T1 + T2)
        divisions = uniqify(ListUnion([td.getLowest(t) for t in T1 + T2]))
        metadata[k]['beginDate'] = mindate
        metadata[k]['endDate'] = maxdate
        metadata[k]['dateDivisions'] = divisions

    # locations
    if locations[k]:
        if hasattr(iterator, 'overallLocation'):
            locs = [loc.integrate(iterator.overallLocation, l) for l in locations[k]]
        else:
            locs = locations[k]
        locs = locListUniqify(locs)
        metadata[k]['spatialDivisions'] = uniqify(ListUnion([loc.divisions(x) for x in locs]))
        metadata[k]['commonLocation'] = reduce(loc.intersect, locs)

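# A minimal standalone sketch (not part of the pipeline) of the commonLocation
# fold above, under the assumption that loc.intersect keeps the fields two
# location dicts agree on.  Flat dicts only; the real loc.intersect also
# understands nested FIPS ('f') entries.
def _common_location_sketch(locs):
    return reduce(lambda a, b: dict(set(a.items()) & set(b.items())), locs)

# _common_location_sketch([{'s': 'NY', 'c': 'Albany'}, {'s': 'NY'}])
#   -> {'s': 'NY'}
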
def getSliceColTuples(collection):
    sliceColList = collection.sliceCols
    sliceColU = uniqify(ListUnion(sliceColList))
    sliceColInds = getStrs(collection, sliceColU)
    OK = dict([(x, x in collection.columnGroups.keys() or MoreThanOne(collection, y))
               for (y, x) in zip(sliceColInds, sliceColU)])
    sliceColList = [tuple([x for x in sliceColU if x in sc and OK[x]]) for sc in sliceColList]
    sliceColTuples = uniqify(ListUnion([subTuples(sc) for sc in sliceColList]))
    return sliceColTuples

def queryToSolr(timeQuery):
    """Converts a timeQuery for use in find, i.e. for Solr."""
    # TODO: a query between, e.g., March and December fails; handle formats
    # that don't start with the year.
    F = mongotimeformatter(timeQuery['format'])
    for k in timeQuery.keys():
        if k != 'format':
            timeQuery[k] = F(timeQuery[k])
    if timeQuery.keys() == ['format']:
        divisions = [TIME_DIVISIONS[x] for x in uniqify(timeQuery['format'])]
        fq = 'dateDivisions:' + (divisions[0] if len(divisions) == 1 else '(' + ' AND '.join(divisions) + ')')
    else:
        if 'on' in timeQuery.keys():
            start = timeQuery['on']
            end = timeQuery['on']
        else:
            start = timeQuery['start'] if 'start' in timeQuery.keys() else None
            end = timeQuery['end'] if 'end' in timeQuery.keys() else None
        start = convertToSolrDT(start, convertMode='High') if start else None
        end = convertToSolrDT(end) if end else None
        fq = []
        if start:
            fq.append('beginDate:[* TO ' + start + ']')
        if end:
            fq.append('endDate:[' + end + ' TO *]')
    return fq

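# Illustrative usage of queryToSolr (hypothetical values).  A format-only
# query is answered with a dateDivisions filter; a ranged query becomes
# beginDate/endDate range filters via convertToSolrDT.
def _example_queryToSolr():
    q = {'format': 'YYYYmmdd', 'start': '20080101', 'end': '20081231'}
    return queryToSolr(q)   # -> ['beginDate:[* TO ...]', 'endDate:[... TO *]']
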
def regionsGuts(g, level_code):
    D = {'method': 'filter'}
    if 'return' in g:
        D['return'] = g['return']
    D['field'] = 'geom'
    if 'bounds' in g:
        w, s, e, n = g['bounds'].split(',')
        if 'type' in g:
            D['type'] = g['type']
        else:
            D['type'] = 'intersects'
        D['query'] = 'POLYGON((' + ', '.join([w + ' ' + n, w + ' ' + s, e + ' ' + s, e + ' ' + n, w + ' ' + n]) + '))'
    elif 'radius' in g and 'center' in g:
        D['radius'] = g['radius']
        x, y = g['center'].split(',')
        D['type'] = 'distance_lte'
        D['query'] = 'POINT(' + x + ' ' + y + ')'
        if 'units' in g:
            D['units'] = str(g['units'])
    R = uniqify([tuple(x.items()) for x in geodbGuts(D, level_code)])
    R = [dict([(k, v) for (k, v) in r if v != 'None']) for r in R]
    return R

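# Minimal standalone sketch of the bounds -> WKT conversion used above,
# assuming bounds is given as 'west,south,east,north' in lon/lat:
def _bounds_to_wkt(bounds):
    w, s, e, n = bounds.split(',')
    ring = [(w, n), (w, s), (e, s), (e, n), (w, n)]   # closed ring
    return 'POLYGON((' + ', '.join([a + ' ' + b for (a, b) in ring]) + '))'

# _bounds_to_wkt('-74,40,-73,41')
#   -> 'POLYGON((-74 41, -74 40, -73 40, -73 41, -74 41))'
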
def locListUniqify(D):
    # make the dicts hashable: convert each to a tuple of items, with any
    # nested 'f' (FIPS) dict itself converted to an item tuple
    D = [tuple([(y, z) if y != 'f' else (y, tuple(z.items())) for (y, z) in x.items()]) for x in D]
    D = uniqify(D)
    D = [dict(d) for d in D]
    for d in D:
        if 'f' in d:
            d['f'] = dict(d['f'])
    return D

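# Usage sketch (hypothetical location dicts): duplicates are removed even
# though the nested 'f' (FIPS) dict is unhashable.
def _example_locListUniqify():
    locs = [{'s': 'New York', 'f': {'s': '36'}},
            {'s': 'New York', 'f': {'s': '36'}}]
    return locListUniqify(locs)   # -> one dict, with 'f' restored to a dict
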
def sliceInsert(c, collection, sliceColTuples, VarMap, sliceDB, version):
    dontcheck = []
    for sct in sliceColTuples:
        if all([VarMap[k] in c.keys() for k in sct]):
            slice = pm.son.SON([(k, c[VarMap[k]]) for k in sct if VarMap[k] in c.keys()])
            dc = sct in dontcheck
            if dc or not sliceDB.find_one({'slice': slice, 'version': version}):
                if not dc:
                    # this slice was absent, so every superset slice is absent
                    # too and can skip the find_one round trip
                    SCT = set(sct)
                    dontcheck = uniqify(dontcheck + [ss for ss in sliceColTuples if SCT <= set(ss)])
                sliceDB.update({'slice': slice}, {'$set': {'version': version, 'original': version}}, upsert=True)

def generateQueries(DateFormat, timeQuery):
    """Converts a friendly DateFormat string and a simple time query into the
    corresponding (icky) mongo query.

    DateFormat : string, e.g. 'YYYYmmdd'
    timeQuery : dict with optional keys 'format', 'begin', 'end', 'on'
    """
    timeQueryFormat = timeQuery['format'] if 'format' in timeQuery.keys() else DateFormat
    tQFset = set(timeQueryFormat)
    tFset = set(DateFormat)
    if tQFset <= tFset:
        tQHier = getHierarchy(tQFset)
        Hier = getHierarchy(DateFormat)
        mergedTimeFormat = ''.join(tFset.difference(tQFset)) + timeQueryFormat
        timeFormatter = mongotimeformatter(mergedTimeFormat)
        zeroLen = len(tFset.difference(tQFset))
        tQHier0 = [x[0] for x in tQHier]
        Hier0 = [x[0] for x in Hier]
        basePathDict = dict([(m, getPathsTo(m, Hier)) for m in tQHier0])
        belowPathDict = dict([(m, getPathsBelow(m, Hier)) for m in tQHier0])
        Q = {}
        for (k, op) in [('begin', '$gte'), ('end', '$lt')]:
            if k in timeQuery.keys():
                timeObj = timeFormatter('X' * zeroLen + timeQuery[k])
                for m in basePathDict.keys():
                    for p in basePathDict[m]:
                        if p in Q.keys():
                            Q[p][op] = rgetattr(timeObj, p)
                        else:
                            Q[p] = {op: rgetattr(timeObj, p)}
        if 'on' in timeQuery.keys():
            timeObj = timeFormatter('X' * zeroLen + timeQuery['on'])
            paths = uniqify(ListUnion([getPathsTo(m, Hier) for m in tQFset]))
            for p in paths:
                p = p + ('',)
                Q[p] = rgetattr(timeObj, p)
        if not set(timeQuery.keys()).intersection(['begin', 'end', 'on']):
            for m in set(tQHier0):
                for p in basePathDict[m]:
                    p = p + ('',)
                    Q[p] = {'$exists': True}
                for p in belowPathDict[m]:
                    p = p + ('',)
                    Q[p] = {'$exists': False}
        return Q

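# Usage sketch for the time version of generateQueries (hypothetical format
# and dates): begin/end produce $gte/$lt constraints on the paths of the
# queried fields, while 'on' pins those paths to exact values.
def _example_time_generateQueries():
    return generateQueries('YYYYmmdd',
                           {'format': 'YYYYmm', 'begin': '200801', 'end': '200806'})
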
def generateQueries(spaceQuery):
    """Generates space queries for get in mongo."""
    Q = {}
    if isinstance(spaceQuery, (list, tuple)):
        for x in uniqify(spaceQuery):
            Q[tuple(x.split('.'))] = {'$exists': True}
    elif hasattr(spaceQuery, 'keys'):
        spaceQuery = convertSQ(spaceQuery)
        for x in spaceQuery.keys():
            Q[tuple(x.split('.'))] = spaceQuery[x]
    return Q

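# Usage sketch for the space version of generateQueries: the list form only
# asserts that the given location paths exist; the dict form is first run
# through convertSQ.  ('s' for state is an assumed field name here.)
def _example_space_generateQueries():
    return generateQueries(['s'])   # -> {('s',): {'$exists': True}}
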
def SpaceComplete(x):
    """FIPS -> names, and upwards, when possible."""
    if 'f' in x.keys():
        x = x.copy()
        iFIPS = ListUnion([SPACE_HIERARCHY_R[c] for c in x['f'].keys()])
        iFIPS = [c for c in iFIPS if c not in x.keys()]
        Cset = [c + '=' + x['f'][c] for c in x['f'].keys() if uniqify(x['f'][c]) != ['0']]
        if iFIPS and Cset:
            X = eval(urllib2.urlopen('http://localhost:8000/geo/fips/?' + '&'.join(Cset)).read())
            if len(X) == 1:
                X = convertToCodes(X[0])
                x['f'] = X['f']
                for c in X.keys():
                    if c not in x.keys():
                        x[c] = X[c]
    return x

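# Usage sketch (requires the local geo FIPS service at localhost:8000 that
# SpaceComplete queries): a record carrying only a state FIPS code comes back
# with its name fields filled in when the lookup is unambiguous.
def _example_SpaceComplete():
    return SpaceComplete({'f': {'s': '36'}})
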
def getQueryList(collection, keys, atVersion, toVersion, slicesCorrespondToIndexes):
    totalVariables = collection.columns
    VarMap = dict(zip(totalVariables, [str(x) for x in range(len(totalVariables))]))
    origInd = VarMap['__originalVersion__']
    retInd = VarMap['__retained__']
    vNInd = VarMap['__versionNumber__']
    keys = [str(x) for x in keys]
    existence = [(k, {'$exists': True, '$ne': ''}) for k in keys]
    if keys:
        Q1 = processArg(dict([(origInd, {'$gt': atVersion}), (vNInd, toVersion)] + existence), collection)
        Q2 = processArg(dict([(retInd, {'$exists': False}), (vNInd, {'$lt': toVersion, '$gte': atVersion})] + existence), collection)
        Q3 = processArg(dict([(retInd, True), (vNInd, {'$lt': toVersion, '$gte': atVersion}), (origInd, {'$lte': atVersion})] + existence), collection)
        colnames = [k for k in keys if k.split('.')[0] in collection.columns]
        colgroups = [k for k in keys if k in collection.columnGroups]
        T = ListUnion([collection.columnGroups[k] for k in colgroups])
        kInds = getStrs(collection, colnames + T)
        R = list(collection.find(Q1, fields=kInds)) + list(collection.find(Q2, fields=kInds)) + (list(collection.find(Q3, fields=kInds)) if not slicesCorrespondToIndexes else [])
        R = [son.SON([(collection.columns[int(k)], r[k]) for k in r.keys() if k.isdigit() and r[k]]) for r in R]
        R = [[(k, rgetattr(r, k.split('.'))) for k in keys if rhasattr(r, k.split('.')) and k not in T] +
             [(g, [r[k] for k in collection.columnGroups[g] if k in r.keys() and r[k]]) for g in colgroups]
             for r in R]
        return uniqify(ListUnion([expand(r) for r in R]))
    else:
        return [()]

def initialize_argdict(collection):
    d = {}
    ArgDict = {}

    sliceCols = uniqify(Flatten(collection.sliceCols))
    sliceColList = ListUnion([[x] if x.split('.')[0] in collection.columns else collection.columnGroups.get(x, []) for x in sliceCols])
    if hasattr(collection, 'contentCols'):
        contentColList = ListUnion([[x] if x.split('.')[0] in collection.columns else collection.columnGroups.get(x, []) for x in collection.contentCols])
        contentCols = uniqify(contentColList + sliceColList)
    else:
        contentCols = sliceColList
    contentColNums = getStrs(collection, contentCols)
    ArgDict['contentColNums'] = contentColNums

    if hasattr(collection, 'dateFormat'):
        dateFormat = collection.dateFormat
        ArgDict['overallDateFormat'] = dateFormat
        timeFormatter = td.mongotimeformatter(dateFormat)
        ArgDict['timeFormatter'] = timeFormatter
    else:
        dateFormat = ''

    if hasattr(collection, 'overallDate'):
        od = collection.overallDate['date']
        odf = collection.overallDate['format']
        ArgDict['overallDate'] = od
        overallDateFormat = odf + dateFormat
        ArgDict['overallDateFormat'] = overallDateFormat
        timeFormatter = td.mongotimeformatter(overallDateFormat)
        ArgDict['timeFormatter'] = timeFormatter
        OD = timeFormatter(od + 'X' * len(dateFormat))
        ArgDict['dateDivisions'] = td.getLowest(OD)
        ArgDict['datePhrases'] = [td.phrase(OD)]
        ArgDict['mindate'] = OD
        ArgDict['maxdate'] = OD
        if dateFormat:
            reverseTimeFormatter = td.reverse(dateFormat)
            ArgDict['reverseTimeFormatter'] = reverseTimeFormatter
    else:
        od = ''

    if 'timeColNames' in collection.columnGroups.keys():
        timeColNamesInd = getNums(collection, collection.columnGroups['timeColNames'])
        tcs = [timeFormatter(od + t) for t in collection.columnGroups['timeColNames']]
        ArgDict['timeColNames'] = tcs
        ArgDict['timeColNameInds'] = timeColNamesInd
        ArgDict['timeColNameDivisions'] = [[td.TIME_DIVISIONS[x] for x in td.getLowest(tc)] for tc in tcs]
        ArgDict['timeColNamePhrases'] = [td.phrase(t) for t in tcs]

    if 'timeColumns' in collection.columnGroups.keys():
        ArgDict['timeColInds'] = getNums(collection, collection.columnGroups['timeColumns'])

    # overall location
    if hasattr(collection, 'overallLocation'):
        ol = collection.overallLocation
        ArgDict['overallLocation'] = ol
    else:
        ol = None

    # get divisions and phrases from overallLocation and spaceColNames
    if 'spaceColNames' in collection.columnGroups.keys():
        spaceColNames = collection.columnGroups['spaceColNames']
        ArgDict['spaceColNames'] = [loc.integrate(ol, x) for x in spaceColNames]

    if 'spaceColumns' in collection.columnGroups.keys():
        ArgDict['spaceColInds'] = getNums(collection, collection.columnGroups['spaceColumns'])

    Source = collection.source
    SourceNameDict = son.SON([(k, Source[k]['name'] if isinstance(Source[k], dict) else Source[k]) for k in Source.keys()])
    SourceAbbrevDict = dict([(k, Source[k]['shortName']) for k in Source.keys() if isinstance(Source[k], dict) and 'shortName' in Source[k].keys()])
    d['sourceSpec'] = json.dumps(SourceNameDict, default=ju.default)
    d['agency'] = SourceNameDict['agency']
    d['subagency'] = SourceNameDict['subagency']
    d['dataset'] = SourceNameDict['dataset']
    for k in SourceNameDict.keys():
        d['source_' + str(k).lower()] = SourceNameDict[k]
    for k in SourceAbbrevDict.keys():
        d['source_' + str(k).lower() + '_acronym'] = SourceAbbrevDict[k]
    d['source'] = ' '.join(SourceNameDict.values() + SourceAbbrevDict.values())

    if 'subcollections' in collection.columns:
        ArgDict['subColInd'] = collection.columns.index('subcollections')

    value_processor_instructions = stringifyDictElements(collection.valueProcessors)
    vpcontext = commonjs.translatorContext(value_processor_instructions)
    ArgDict['valueProcessors'], ArgDict['valueProcessorsKey'] = get_processors(value_processor_instructions, collection, vpcontext, commonjs.js_call)

    return d, ArgDict

def largeAdd(d, query, collection, contentColNums, timeColInds, timeColNames,
             timeColNameInds, timeColNameDivisions, timeColNamePhrases,
             overallDate, overallDateFormat, timeFormatter, reverseTimeFormatter,
             dateDivisions, datePhrases, mindate, maxdate, overallLocation,
             spaceColNames, spaceColInds, subColInd, valueProcessors):
    print '0'
    exists = []
    check = range(len(collection.columns))
    while check:
        k = check.pop(0)
        rec = collection.find_one(dict(query.items() + [(str(k), {'$exists': True})]))
        if rec:
            rec.pop('_id')
            new = map(int, rec.keys())
            check = list(set(check).difference(new))
            check.sort()
            exists += [(pos, collection.columns[pos]) for pos in new]
    print '1'
    exists = [e for e in exists if e[1] not in SPECIAL_KEYS]
    (colnums, colnames) = zip(*exists)
    d['columnNames'] = colnames
    d['dimension'] = len(d['columnNames'])

    if overallDateFormat:
        d['dateFormat'] = overallDateFormat
    if timeColInds:
        dateColVals = ListUnion([collection.find(query).distinct(str(t)) for t in timeColInds if t in colnums])
        if overallDate:
            dateColVals = [timeFormatter(overallDate + reverseTimeFormatter(time)) for time in dateColVals]
        dateDivisions += uniqify(ListUnion(map(td.getLowest, dateColVals)))
        datePhrases += uniqify(map(td.phrase, dateColVals))
        mindate = td.makemin(mindate, min(dateColVals))
        maxdate = td.makemax(maxdate, max(dateColVals))
    if timeColNameInds:
        K = [k for (k, j) in enumerate(timeColNameInds) if j in colnums]
        dateDivisions += uniqify(ListUnion([timeColNameDivisions[k] for k in K]))
        mindate = td.makemin(mindate, min([timeColNames[k] for k in K]))
        maxdate = td.makemax(maxdate, max([timeColNames[k] for k in K]))
        datePhrases += [timeColNamePhrases[k] for k in K]
    dateDivisions = uniqify(dateDivisions)
    datePhrases = uniqify(datePhrases)
    d['beginDate'] = td.convertToDT(mindate)
    d['endDate'] = td.convertToDT(maxdate, convertMode='High')
    d['dateDivisions'] = uniqify(dateDivisions)
    d['datePhrases'] = datePhrases
    print '2'

    if spaceColInds:
        spaceColVals = ListUnion([collection.find(query).distinct(str(t)) for t in spaceColInds if t in colnums])
        spaceColVals = [loc.integrate(overallLocation, scv) for scv in spaceColVals]
    else:
        spaceColVals = []
    spaceVals = spaceColNames + spaceColVals
    if spaceVals:
        d['spatialDivisions'] = uniqify(ListUnion(map(loc.divisions, spaceVals)))
        d['spatialDivisionsTight'] = uniqify(ListUnion(map(loc.divisions2, spaceVals)))
        d['spatialPhrases'] = uniqify(map(loc.phrase, spaceVals))
        d['spatialPhrasesTight'] = uniqify(map(loc.phrase2, spaceVals))
        commonLocation = overallLocation
        for sv in spaceVals:
            commonLocation = loc.intersect(commonLocation, sv)
            if not commonLocation:
                break
        if commonLocation:
            d['commonLocation'] = loc.phrase(commonLocation)
    print '3'

    d['sliceContents'] = ' '.join(uniqify(ListUnion([translate_list(valueProcessors.get(x, None), map(decode_obj, collection.find(query).distinct(x))) for x in contentColNums])))
    return d

def smallAdd(d, query, collection, contentColNums, timeColInds, timeColNames,
             timeColNameInds, timeColNameDivisions, timeColNamePhrases,
             overallDate, overallDateFormat, timeFormatter, reverseTimeFormatter,
             dateDivisions, datePhrases, mindate, maxdate, overallLocation,
             spaceColNames, spaceColInds, subColInd, valueProcessors, slicecount):
    R = collection.find(query, timeout=False)
    colnames = []
    d['sliceContents'] = []
    Subcollections = []
    spaceVals = spaceColNames[:]   # copy so the caller's list is not mutated
    commonLocation = overallLocation
    for sv in spaceColNames:
        commonLocation = loc.intersect(commonLocation, sv)
        if not commonLocation:
            break
    for (i, r) in enumerate(R):
        d['sliceContents'].append(' '.join([translate(valueProcessors.get(x, None), decode_obj(rgetattr(r, x.split('.')))) if rhasattr(r, x.split('.')) else '' for x in contentColNums]))
        colnames = uniqify(colnames + r.keys())
        if subColInd:
            Subcollections += r[str(subColInd)]
        if timeColInds:
            for x in timeColInds:
                if str(x) in r.keys():
                    time = r[str(x)]
                    if overallDate:
                        time = timeFormatter(overallDate + reverseTimeFormatter(time))
                    dateDivisions += td.getLowest(time)
                    datePhrases.append(td.phrase(time))
                    mindate = td.makemin(mindate, time)
                    maxdate = td.makemax(maxdate, time)
        if spaceColInds:
            for x in spaceColInds:
                if str(x) in r.keys():
                    location = loc.integrate(overallLocation, r[str(x)])
                    commonLocation = loc.intersect(commonLocation, r[str(x)]) if commonLocation is not None else None
                    spaceVals.append(location)

    d['sliceContents'] = ' '.join(d['sliceContents'])
    Subcollections = uniqify(Subcollections)
    d['columnNames'] = [collection.columns[int(x)] for x in colnames if x.isdigit()]
    d['dimension'] = len(d['columnNames'])

    # time/date
    if overallDateFormat:
        d['dateFormat'] = overallDateFormat
    if 'timeColNames' in collection.columnGroups.keys():
        K = [k for (k, j) in enumerate(timeColNameInds) if str(j) in colnames]
        dateDivisions += uniqify(ListUnion([timeColNameDivisions[k] for k in K]))
        mindate = td.makemin(mindate, min([timeColNames[k] for k in K]))
        maxdate = td.makemax(maxdate, max([timeColNames[k] for k in K]))
        datePhrases += uniqify([timeColNamePhrases[k] for k in K])
    d['beginDate'] = td.convertToDT(mindate)
    d['endDate'] = td.convertToDT(maxdate, convertMode='High')
    d['dateDivisions'] = uniqify(dateDivisions)
    d['datePhrases'] = datePhrases if d['volume'] < 10000 else uniqify(datePhrases)

    if spaceVals:
        d['spatialDivisions'] = uniqify(ListUnion(map(loc.divisions, spaceVals)))
        d['spatialDivisionsTight'] = uniqify(ListUnion(map(loc.divisions2, spaceVals)))
        d['spatialPhrases'] = uniqify(map(loc.phrase, spaceVals))
        d['spatialPhrasesTight'] = uniqify(map(loc.phrase2, spaceVals))
    return d

def addToIndex(q, d, collection, solr_interface, slicecount, contentColNums=None,
               timeColInds=None, timeColNames=None, timeColNameInds=None,
               timeColNameDivisions=None, timeColNamePhrases=None,
               overallDate='', overallDateFormat='', timeFormatter=None,
               reverseTimeFormatter=None, dateDivisions=None, datePhrases=None,
               mindate=None, maxdate=None, overallLocation=None,
               spaceColNames=None, spaceColInds=None, subColInd=None,
               Return=False, valueProcessors=None, valueProcessorsKey=None):
    q['__versionNumber__'] = collection.currentVersion
    query = processArg(q, collection)
    q.pop('__versionNumber__')
    d['collectionName'] = collection.name
    d['query'] = json.dumps(q, default=ju.default)
    d['mongoID'] = mongoID(q, collection.name)
    d['mongoText'] = queryToText(q, valueProcessorsKey)
    d['sliceValues'] = queryValues(q, valueProcessorsKey)
    d['sliceKeys'] = queryKeys(q, valueProcessorsKey)
    d['versionNumber'] = collection.currentVersion

    if dateDivisions is None:
        dateDivisions = []
    else:
        dateDivisions = dateDivisions[:]
    if datePhrases is None:
        datePhrases = []
    else:
        datePhrases = datePhrases[:]
    if spaceColNames is None:
        spaceColNames = []

    # stats
    d['volume'] = collection.find(query).count()
    contentColNums = [i for i in contentColNums if i not in query.keys()]
    if d['volume'] > 0:
        if d['volume'] < 5000:
            smallAdd(d, query, collection, contentColNums, timeColInds, timeColNames, timeColNameInds, timeColNameDivisions, timeColNamePhrases, overallDate, overallDateFormat, timeFormatter, reverseTimeFormatter, dateDivisions, datePhrases, mindate, maxdate, overallLocation, spaceColNames, spaceColInds, subColInd, valueProcessors, slicecount)
        else:
            largeAdd(d, query, collection, contentColNums, timeColInds, timeColNames, timeColNameInds, timeColNameDivisions, timeColNamePhrases, overallDate, overallDateFormat, timeFormatter, reverseTimeFormatter, dateDivisions, datePhrases, mindate, maxdate, overallLocation, spaceColNames, spaceColInds, subColInd, valueProcessors)

    Subcollections = uniqify(ListUnion(collection.find(query).distinct(str(subColInd))))
    metadata = collection.metadata['']
    for sc in Subcollections:
        metadata.update(collection.metadata.get(sc, {}))
    for k in metadata.keys():
        if k in STANDARD_META:
            if k in STANDARD_META_FORMATS.keys():
                val = coerceToFormat(metadata[k], STANDARD_META_FORMATS[k])
                if val:
                    d[str(k)] = val
            else:
                d[str(k)] = str(metadata[k])

    if Return:
        return d
    else:
        solr_interface.add(**d)

def getSliceColTuples(sliceCols):
    sliceColList = sliceCols
    sliceColTuples = uniqify(ListUnion([subTuples(tuple(sc)) for sc in sliceColList]))
    return sliceColTuples

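# Sketch of what subTuples is assumed to do for a single slice-column tuple:
# enumerate its nonempty subtuples, so every coarser slicing of the same
# columns is also covered.
def _subTuples_sketch(sc):
    from itertools import combinations
    out = []
    for n in range(1, len(sc) + 1):
        out.extend(combinations(sc, n))
    return out

# _subTuples_sketch(('country', 'year'))
#   -> [('country',), ('year',), ('country', 'year')]
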
def updateCollection(download_dir, collectionName, parserClass, checkpath, certpath, parserArgs=None, parserKwargs=None, incremental=False):
    connection = pm.Connection(document_class=pm.son.SON)
    source_metadata = get_source_data(collectionName)
    db = connection['govdata']
    assert '__' not in collectionName, 'collectionName must not contain consecutive underscores'
    metaCollectionName = '__' + collectionName + '__'
    versionName = '__' + collectionName + '__VERSIONS__'
    sliceDBName = '__' + collectionName + '__SLICES__'
    collection = db[collectionName]
    metacollection = db[metaCollectionName]
    versions = db[versionName]
    sliceDB = db[sliceDBName]

    if incremental:
        if versionName not in db.collection_names():
            startInc = 0
        else:
            startInc = get_max_increment_fromDB(versions) + 1
        endInc = get_max_increment(download_dir)
        sources = [increment_format(download_dir, i) for i in range(startInc, endInc + 1)]
    else:
        sources = [download_dir]
        startInc = endInc = None

    if parserArgs is None:
        parserArgs = ()
    if parserKwargs is None:
        parserKwargs = {}

    if sources:
        iterator = parserClass(sources[0], *parserArgs, **parserKwargs)
        iterator.set_source_metadata(source_metadata)
        uniqueIndexes = iterator.uniqueIndexes
        ColumnGroups = iterator.columnGroups
        sliceColTuples = getSliceColTuples(iterator.sliceCols)
        sliceColTuplesFlat = uniqify([tuple(sorted(uniqify(Flatten(sct)))) for sct in sliceColTuples])
        sliceColList = uniqify(Flatten(ListUnion(sliceColTuples)))
        ContentCols = set(sliceColList + getContentCols(iterator))
        if hasattr(iterator, 'dateFormat'):
            TimeFormatter = td.mongotimeformatter(iterator.dateFormat)

        if collectionName in db.collection_names():
            versionNumber = max(versions.distinct('versionNumber')) + 1
            storedAllMetadata = metacollection.find_one({'name': '', 'versionNumber': versionNumber - 1})
            totalVariables = storedAllMetadata['columns']
            VarMap = dict(zip(totalVariables, [str(x) for x in range(len(totalVariables))]))
            # TODO: check that things are the same, and check consistency, for all sources
        else:
            versionNumber = 0
            IndexCols = uniqify([x for x in ['subcollections'] + sliceColList + ListUnion([ColGroupsFlatten(ColumnGroups, k) for k in ['indexColumns', 'labelColumns', 'timeColumns', 'spaceColumns']]) if x not in uniqueIndexes])
            totalVariables = SPECIAL_KEYS + uniqueIndexes + IndexCols
            assert not any(['.' in x or ('__' in x and x not in SPECIAL_KEYS) or x in ColumnGroups.keys() for x in totalVariables])
            VarMap = dict(zip(totalVariables, map(str, range(len(totalVariables)))))
            cols = zip([VarMap[c] for c in uniqueIndexes + ['__versionNumber__']], [pm.DESCENDING] * (len(uniqueIndexes) + 1))
            collection.ensure_index(cols, unique=True, dropDups=True)
            for col in IndexCols:
                collection.ensure_index(VarMap[col])
            sliceDB.ensure_index('slice', unique=True, dropDups=True)

        vNInd = VarMap['__versionNumber__']
        retInd = VarMap['__retained__']
        specialKeyInds = [VarMap[k] for k in SPECIAL_KEYS]

        if 'timeColumns' in iterator.columnGroups.keys():
            tcs = iterator.columnGroups['timeColumns']
        else:
            tcs = []
        if 'spaceColumns' in iterator.columnGroups.keys():
            spcs = iterator.columnGroups['spaceColumns']
        else:
            spcs = []

        toParse = ListUnion([RecursiveFileList(source + '__PARSE__') for source in sources])

        oldc = None
        SpaceCache = {}
        volumes = {'': 0}
        dimensions = {'': []}
        times = {'': []}
        locations = {'': []}
        varFormats = {}
        for file in toParse:
            iterator.refresh(file)
            checkMetadata(iterator)
            tcs = iterator.columnGroups.get('timeColumns', [])
            spcs = iterator.columnGroups.get('spaceColumns', [])
            index = 0
            for c in iterator:
                newVars = [x for x in c.keys() if not x in totalVariables]
                assert not any(['__' in x or '.' in x or x in ColumnGroups.keys() for x in newVars]), '__ and . must not appear in key names.'
                totalVariables += newVars
                VarMap.update(dict(zip(newVars, map(str, range(len(totalVariables) - len(newVars), len(totalVariables))))))
                # time handling
                for tc in tcs:
                    if tc in c.keys():
                        c[tc] = TimeFormatter(c[tc])
                # space handling
                if COMPLETE_SPACE:
                    for spc in spcs:
                        if spc in c.keys():
                            t = getT(c[spc])
                            if t in SpaceCache.keys():
                                c[spc] = SpaceCache[t].copy()
                            else:
                                c[spc] = loc.SpaceComplete(c[spc])
                                SpaceCache[t] = c[spc].copy()
                if index % 100 == 0:
                    print 'At', index
                index += 1
                sctf = processSct(sliceColTuplesFlat, oldc, c)
                processRecord(c, collection, VarMap, totalVariables, uniqueIndexes, versionNumber, specialKeyInds, incremental, sliceDB, sctf, ContentCols)
                incrementThings(c, volumes, dimensions, times, locations, varFormats, tcs, spcs)
                oldc = c

        any_deleted = False
        if incremental:
            collection.update({vNInd: {'$lte': versionNumber - 1}}, {'$set': {vNInd: versionNumber}})
            sliceDB.update({}, {'$set': {'version': versionNumber}})
        else:
            deleted = collection.find({vNInd: versionNumber - 1, retInd: {'$exists': False}})
            for d in deleted:
                any_deleted = True
                sliceDelete(d, collection, sliceColTuples, VarMap, sliceDB, versionNumber)

        if any_deleted:
            subColInd = str(totalVariables.index('subcollections'))
            subcols = [''] + uniqify(ListUnion(collection.distinct(subColInd)))
            for sc in subcols:
                volumes[sc] = collection.find({subColInd: sc}).count()
                dimensions[sc] = [k for k in totalVariables if collection.find_one({subColInd: sc, str(totalVariables.index(k)): {'$exists': True}})]
                times[sc] = ListUnion([collection.find({subColInd: sc}).distinct(t) for t in tcs])
                locations[sc] = ListUnion([collection.find({subColInd: sc}).distinct(t) for t in spcs])

        updateMetacollection(iterator, metacollection, incremental, versionNumber, totalVariables, tcs, spcs, volumes, dimensions, times, locations, varFormats)
        updateAssociatedFiles(sources, collection)
        updateVersionHistory(versionNumber, versions, startInc, endInc)
        updateSourceDBFromCollections(collectionNames=[collectionName])

    connection.disconnect()
    createCertificate(certpath, 'Collection ' + collectionName + ' written to DB.')
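
# Typical invocation sketch (hypothetical paths and parser class):
#   updateCollection('../Data/Downloads/BEA/', 'BEA', BEAParser,
#                    'checkpoints/BEA.txt', 'certificates/BEA.txt',
#                    incremental=True)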