Example #1
def getCommonDatesLocations(iterator,metadata,times,locations,dimensions,k):
    # Fill in date-range/date-division and common-location metadata for
    # subcollection k from its accumulated times and locations.
    vNInd = '0'
    overallDateFormat = iterator.overallDateFormat if hasattr(iterator,'overallDateFormat') else ''
    dateFormat = iterator.dateFormat if hasattr(iterator,'dateFormat') else ''
    overallDate = iterator.overallDate if hasattr(iterator,'overallDate') else ''
    if overallDateFormat or dateFormat:
        DF = overallDateFormat + dateFormat
        F = td.mongotimeformatter(DF)
        T1 = [F(overallDate + x) for x in iterator.columnGroups['timeColNames'] if x in dimensions[k]]
        if overallDateFormat:
            reverseF = td.reverse(dateFormat)
            T2 = [F(overallDate + y) for y in map(reverseF,times[k])]
        else:
            T2 = times[k]
        mindate = min(T1 + T2)
        maxdate = max(T1 + T2)
        divisions = uniqify(ListUnion([td.getLowest(t) for t in T1 + T2]))
        metadata[k]['beginDate'] = mindate
        metadata[k]['endDate'] = maxdate
        metadata[k]['dateDivisions'] = divisions
    #locations
    if locations[k]:
        if hasattr(iterator,'overallLocation'):
            locs = [loc.integrate(iterator.overallLocation,l) for l in locations[k]]
        else:
            locs = locations[k]
        locs = locListUniqify(locs)
        metadata[k]['spatialDivisions'] = uniqify(ListUnion([loc.divisions(x) for x in locs]))
        metadata[k]['commonLocation'] = reduce(loc.intersect,locs)
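Every example on this page leans on a uniqify helper that is never shown. Judging from its call sites, which always hand it hashable items (strings, tuples) and rely on the result order, it behaves like order-preserving deduplication. A minimal sketch under that assumption (the implementation below is mine, not the library's):

def uniqify(seq):
    # Assumed behavior: drop duplicates while preserving first-seen order
    seen = set()
    out = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            out.append(item)
    return out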
Example #2
def getSliceColTuples(collection):
    sliceColList = collection.sliceCols
    sliceColU = uniqify(ListUnion(sliceColList))
    sliceColInds = getStrs(collection,sliceColU)
    OK = dict([(x,x in collection.columnGroups.keys() or MoreThanOne(collection,y)) for (y,x) in zip(sliceColInds,sliceColU)])
    sliceColList = [tuple([x for x in sliceColU if x in sc and OK[x]]) for sc in sliceColList]
    sliceColTuples = uniqify(ListUnion([subTuples(sc) for sc in sliceColList]))
    
    return sliceColTuples
Example #3
def queryToSolr(timeQuery):
    """Convert a timeQuery dict into a Solr filter query (fq) for use in find()."""
    #TODO: a query between, e.g., March and December fails; handle formats that don't start with the year
    F = mongotimeformatter(timeQuery['format'])
    for k in timeQuery.keys():
        if k != 'format':
            timeQuery[k] = F(timeQuery[k])
        
    if timeQuery.keys() == ['format']:
        divisions = [TIME_DIVISIONS[x] for x in uniqify(timeQuery['format'])]
        
        fq = 'dateDivisions:' + (divisions[0] if len(divisions) == 1 else '(' + ' AND '.join(divisions) + ')')
    
    else:
        if 'on' in timeQuery.keys():
            start = timeQuery['on']
            end = timeQuery['on']
        else:
            start = timeQuery['start'] if 'start' in timeQuery.keys() else None
            end = timeQuery['end'] if 'end' in timeQuery.keys() else None
            
        start = convertToSolrDT(start,convertMode='High') if start else None
        end = convertToSolrDT(end) if end else None

        fq = []
        if start:
            fq.append('beginDate:[* TO ' + start + ']')
        if end:
            fq.append('endDate:[' + end + ' TO *]')
            
    return fq
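Note that the two branches return different shapes: the format-only branch returns a single fq string, while the range branch returns a list of fq strings. A hedged illustration of the output (the TIME_DIVISIONS values and the datetime format produced by convertToSolrDT are assumptions):

# format-only query, e.g. {'format': 'YYYYmm'}:
#     'dateDivisions:(Year AND Month)'
# 'on' query, e.g. {'format': 'YYYYmm', 'on': '200103'}:
#     ['beginDate:[* TO 2001-03-31T23:59:59Z]',
#      'endDate:[2001-03-01T00:00:00Z TO *]']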
Example #4
def regionsGuts(g,level_code):
    # Build a geometry filter query D from the request parameters g.
    D = {'method':'filter'}
    if 'return' in g:
        D['return'] = g['return']
    D['field'] = 'geom'

    if 'bounds' in g:
        # g['bounds'] is 'west,south,east,north'; the WKT ring closes on the NW corner
        w,s,e,n = g['bounds'].split(',')
        if 'type' in g:
            D['type'] = g['type']
        else:
            D['type'] = 'intersects'

        D['query'] = 'POLYGON((' + ', '.join([w + ' ' + n, w + ' ' + s, e + ' ' + s, e + ' ' + n, w + ' ' + n]) + '))'

    elif 'radius' in g and 'center' in g:
        D['radius'] = g['radius']
        x,y = g['center'].split(',')
        D['type'] = 'distance_lte'
        D['query'] = 'POINT(' + x + ' ' + y + ')'
        if 'units' in g:
            D['units'] = str(g['units'])

    # dicts are unhashable: freeze records to item tuples, dedupe, thaw,
    # and drop 'None' string values
    R = uniqify([tuple(x.items()) for x in geodbGuts(D,level_code)])
    R = [dict([(k,v) for (k,v) in r if v != 'None']) for r in R]

    return R
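The bounds branch turns a 'west,south,east,north' string into a closed WKT polygon ring. That conversion is easy to exercise standalone; the function name and sample coordinates below are mine:

def bbox_to_wkt(bounds):
    # The ring runs NW -> SW -> SE -> NE and closes back on the NW corner
    w,s,e,n = bounds.split(',')
    ring = [w + ' ' + n, w + ' ' + s, e + ' ' + s, e + ' ' + n, w + ' ' + n]
    return 'POLYGON((' + ', '.join(ring) + '))'

print(bbox_to_wkt('-84.8,38.4,-80.5,42.0'))
# POLYGON((-84.8 42.0, -84.8 38.4, -80.5 38.4, -80.5 42.0, -84.8 42.0))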
Example #5
def locListUniqify(D):
    # Dicts are unhashable, so freeze each location dict into a tuple of
    # items (freezing the nested FIPS dict under 'f' too), dedupe with
    # uniqify, then thaw back into dicts.
    D = [tuple([(y,z) if y != 'f' else (y,tuple(z.items())) for (y,z) in x.items()]) for x in D]
    D = uniqify(D)
    D = [dict(d) for d in D]
    for d in D:
        if 'f' in d:
            d['f'] = dict(d['f'])
    return D
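Assuming the uniqify sketch after Example #1, a quick usage check with hypothetical location dicts (FIPS codes live under 'f'):

locs = [{'s': 'Ohio', 'f': {'s': '39'}},
        {'s': 'Ohio', 'f': {'s': '39'}},
        {'s': 'Kentucky', 'f': {'s': '21'}}]
print(locListUniqify(locs))
# -> two entries, one for Ohio and one for Kentucky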
Example #6
def sliceInsert(c,collection,sliceColTuples,VarMap,sliceDB,version):
    # Upsert every slice of record c into sliceDB. Once a slice is found to
    # exist, every slice tuple containing it goes into dontcheck so the
    # existence query can be skipped for those supersets.
    dontcheck = []
    for sct in sliceColTuples:
        if all([VarMap[k] in c.keys() for k in sct]):
            slice = pm.son.SON([(k,c[VarMap[k]]) for k in sct if VarMap[k] in c.keys()])
            dc = sct in dontcheck
            if dc or not sliceDB.find_one({'slice':slice,'version':version}):
                if not dc:
                    SCT = set(sct)
                    dontcheck = uniqify(dontcheck + [ss for ss in sliceColTuples if SCT <= set(ss)])
                sliceDB.update({'slice':slice},{'$set':{'version':version,'original':version}},upsert=True)
Example #7
def generateQueries(DateFormat,timeQuery):
    """Convert a readable DateFormat string and a simple time query into the
    corresponding (messy) Mongo query.
    DateFormat : string, e.g. 'YYYYmmdd'
    timeQuery : dict with optional keys 'format', 'begin', 'end', 'on'
    """
    timeQueryFormat = timeQuery['format'] if 'format' in timeQuery.keys() else DateFormat
    tQFset = set(timeQueryFormat)
    tFset = set(DateFormat)

    
    if tQFset <= tFset:
        tQHier = getHierarchy(tQFset)
        Hier = getHierarchy(DateFormat)
        
        mergedTimeFormat = ''.join(tFset.difference(tQFset)) + timeQueryFormat
        timeFormatter = mongotimeformatter(mergedTimeFormat)
        zeroLen = len(tFset.difference(tQFset))
        
        tQHier0 = [x[0] for x in tQHier]
        Hier0 = [x[0] for x in Hier]
        basePathDict = dict([(m,getPathsTo(m,Hier)) for m in tQHier0])
        belowPathDict = dict([(m,getPathsBelow(m,Hier)) for m in tQHier0])
        
        Q = {}
        for (k,op) in [('begin','$gte'),('end','$lt')]:
            if k in timeQuery.keys():
                timeObj = timeFormatter('X'*zeroLen + timeQuery[k])
                for m in basePathDict.keys():
                    for p in basePathDict[m]:
                        if p in Q.keys():
                            Q[p][op] = rgetattr(timeObj,p)
                        else:
                            Q[p] = {op: rgetattr(timeObj,p)}
                                                
        if 'on' in timeQuery.keys():
            timeObj = timeFormatter('X'*zeroLen + timeQuery['on'])
            paths = uniqify(ListUnion([getPathsTo(m,Hier) for m in tQFset]))
            for p in paths:
                p = p + ('',)
                Q[p] = rgetattr(timeObj,p)
        
        if not set(timeQuery.keys()).intersection(['begin','end','on']):
            for m in set(tQHier0):
                for p in basePathDict[m]:
                    p = p + ('',)
                    Q[p] = {'$exists':True}
                for p in belowPathDict[m]:
                    p = p + ('',)
                    Q[p] = {'$exists':False}

        
        return Q
Example #8
def generateQueries(spaceQuery):    
    """Generates queries for get in mongo"""
    Q = {}
    if isinstance(spaceQuery,list) or isinstance(spaceQuery,tuple):
        for x in uniqify(spaceQuery):
            Q[tuple(x.split('.'))] = {'$exists':True}
    
    elif hasattr(spaceQuery,'keys'):
        spaceQuery = convertSQ(spaceQuery)
        for x in spaceQuery.keys():
            Q[tuple(x.split('.'))] = spaceQuery[x]
            
    return Q
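Given the uniqify sketch after Example #1, the list branch can be exercised directly; the dotted field names here are hypothetical:

print(generateQueries(['location.state', 'location.state', 'location.county']))
# {('location', 'state'): {'$exists': True}, ('location', 'county'): {'$exists': True}}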
Example #9
def SpaceComplete(x):
    """FIPS -> names and upwards when possible"""
    if 'f' in x.keys():
        x = x.copy()
        iFIPS = ListUnion([SPACE_HIERARCHY_R[c] for c in x['f'].keys()])
        iFIPS = [c for c in iFIPS if c not in x.keys()]
        # keep only FIPS codes that aren't all zeros
        Cset = [c + '=' + x['f'][c] for c in x['f'].keys() if uniqify(x['f'][c]) != ['0']]
        if iFIPS and Cset:
            # NOTE: eval() of an HTTP response; the geo service is assumed
            # local and trusted here (json.loads would be safer)
            X = eval(urllib2.urlopen('http://localhost:8000/geo/fips/?' + '&'.join(Cset)).read())
            if len(X) == 1:
                X = convertToCodes(X[0])
                x['f'] = X['f']
                for c in X.keys():
                    if c not in x.keys():
                        x[c] = X[c]
            
    return x
Example #10
def getQueryList(collection,keys,atVersion,toVersion,slicesCorrespondToIndexes):
    totalVariables = collection.columns
    VarMap = dict(zip(totalVariables,[str(x) for x in range(len(totalVariables))]))
    origInd = VarMap['__originalVersion__'] ; retInd = VarMap['__retained__'] ; vNInd = VarMap['__versionNumber__']
    keys = [str(x) for x in keys]
    existence = [(k,{'$exists':True,'$ne':''}) for k in keys]
    if keys:
        Q1 = processArg(dict([(origInd,{'$gt':atVersion}),(vNInd,toVersion)] + existence),collection)
        Q2 = processArg(dict([(retInd,{'$exists':False}),(vNInd,{'$lt':toVersion,'$gte':atVersion})] + existence),collection)
        Q3 = processArg(dict([(retInd,True),(vNInd,{'$lt':toVersion,'$gte':atVersion}),(origInd,{'$lte':atVersion})] + existence),collection)
        colnames = [k for k in keys if k.split('.')[0] in collection.columns]
        colgroups = [k for k in keys if k in collection.columnGroups]
        T= ListUnion([collection.columnGroups[k] for k in colgroups])
        kInds = getStrs(collection,colnames + T)
        R = list(collection.find(Q1,fields = kInds)) + list(collection.find(Q2,fields = kInds)) + (list(collection.find(Q3,fields = kInds)) if not slicesCorrespondToIndexes else [])
        R = [son.SON([(collection.columns[int(k)],r[k]) for k in r.keys() if k.isdigit() and r[k]]) for r in R]
        R = [[(k,rgetattr(r,k.split('.'))) for k in keys if rhasattr(r,k.split('.')) and k not in T] + [(g,[r[k] for k in collection.columnGroups[g] if k in r.keys() and r[k]]) for g in colgroups] for r in R]
        return uniqify(ListUnion([expand(r) for r in R]))
    else:
        return [()]
Example #11
def initialize_argdict(collection):

    d = {} ; ArgDict = {}
    
    sliceCols = uniqify(Flatten(collection.sliceCols))
    sliceColList = ListUnion([[x] if x.split('.')[0] in collection.columns else collection.columnGroups.get(x,[]) for x in sliceCols])
    
    if hasattr(collection,'contentCols'):
        contentColList = ListUnion([[x] if x.split('.')[0] in collection.columns else collection.columnGroups.get(x,[]) for x in collection.contentCols])
        contentCols = uniqify(contentColList + sliceColList)
    else:
        contentCols = sliceColList
    contentColNums = getStrs(collection,contentCols)
    ArgDict['contentColNums'] = contentColNums
    
    if hasattr(collection,'dateFormat'):
        dateFormat = collection.dateFormat
        ArgDict['overallDateFormat'] = dateFormat
        timeFormatter = td.mongotimeformatter(dateFormat)
        ArgDict['timeFormatter'] = timeFormatter
    else:
        dateFormat = ''
        
    if hasattr(collection,'overallDate'):
        od = collection.overallDate['date']
        odf = collection.overallDate['format']
        ArgDict['overallDate'] = od
        overallDateFormat = odf + dateFormat
        ArgDict['overallDateFormat'] = overallDateFormat
        timeFormatter = td.mongotimeformatter(overallDateFormat)
        ArgDict['timeFormatter'] = timeFormatter

        OD = timeFormatter(od + 'X'*len(dateFormat))
        ArgDict['dateDivisions'] = td.getLowest(OD)
        ArgDict['datePhrases'] = [td.phrase(OD)]
        ArgDict['mindate'] = OD
        ArgDict['maxdate'] = OD             
        
        if dateFormat:
            reverseTimeFormatter = td.reverse(dateFormat)
            ArgDict['reverseTimeFormatter'] = reverseTimeFormatter
            
    else:
        od = ''
                    
    if 'timeColNames' in collection.columnGroups.keys():
        timeColNamesInd = getNums(collection,collection.columnGroups['timeColNames'])
        tcs = [timeFormatter(od + t) for t in collection.columnGroups['timeColNames']]
        ArgDict['timeColNames'] = tcs 
        ArgDict['timeColNameInds'] = timeColNamesInd
        ArgDict['timeColNameDivisions'] = [[td.TIME_DIVISIONS[x] for x in td.getLowest(tc)] for tc in tcs] 
        ArgDict['timeColNamePhrases'] = [td.phrase(t) for t in tcs]

    if 'timeColumns' in collection.columnGroups.keys():
        ArgDict['timeColInds'] = getNums(collection,collection.columnGroups['timeColumns'])
            
    #overall location
    if hasattr(collection,'overallLocation'):
        ol = collection.overallLocation
        ArgDict['overallLocation'] = ol
    else:
        ol = None
        
    #get divisions and phrases from OverallLocation and SpaceColNames
    if 'spaceColNames' in collection.columnGroups.keys():
        spaceColNames = collection.columnGroups['spaceColNames']
        ArgDict['spaceColNames'] = [loc.integrate(ol,x) for x in spaceColNames]

        
    if 'spaceColumns' in collection.columnGroups.keys():
        ArgDict['spaceColInds'] = getNums(collection,collection.columnGroups['spaceColumns'])

    Source = collection.source
    SourceNameDict = son.SON([(k,Source[k]['name'] if isinstance(Source[k],dict) else Source[k]) for k in Source.keys()])
    SourceAbbrevDict = dict([(k,Source[k]['shortName']) for k in Source.keys() if isinstance(Source[k],dict) and 'shortName' in Source[k].keys() ])
    d['sourceSpec'] = json.dumps(SourceNameDict,default=ju.default)
    d['agency'] = SourceNameDict['agency']
    d['subagency'] = SourceNameDict['subagency']
    d['dataset'] = SourceNameDict['dataset']
    for k in SourceNameDict.keys():
        d['source_' + str(k).lower()] = SourceNameDict[k]
    for k in SourceAbbrevDict.keys():
        d['source_' + str(k).lower() + '_acronym'] = SourceAbbrevDict[k]
    d['source'] = ' '.join(SourceNameDict.values() + SourceAbbrevDict.values())
        
    if 'subcollections' in collection.columns:
        ArgDict['subColInd'] = collection.columns.index('subcollections')
     
    value_processor_instructions = stringifyDictElements(collection.valueProcessors)
    vpcontext = commonjs.translatorContext(value_processor_instructions)
    ArgDict['valueProcessors'],ArgDict['valueProcessorsKey'] = get_processors(value_processor_instructions,collection, vpcontext ,commonjs.js_call)
    
                                    
    return d, ArgDict
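The source-metadata flattening near the end of initialize_argdict is worth seeing in isolation. A miniature with SON swapped for a plain dict and hypothetical values:

Source = {'agency': {'name': 'Department of Labor', 'shortName': 'DOL'},
          'dataset': 'Average Price Data'}
names = dict((k, v['name'] if isinstance(v, dict) else v) for (k, v) in Source.items())
abbrevs = dict((k, v['shortName']) for (k, v) in Source.items()
               if isinstance(v, dict) and 'shortName' in v)
d = {}
for k in names:
    d['source_' + str(k).lower()] = names[k]
for k in abbrevs:
    d['source_' + str(k).lower() + '_acronym'] = abbrevs[k]
# d -> {'source_agency': 'Department of Labor',
#       'source_dataset': 'Average Price Data',
#       'source_agency_acronym': 'DOL'}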
Example #12
def largeAdd(d,query,collection,contentColNums, timeColInds ,timeColNames , timeColNameInds ,timeColNameDivisions ,timeColNamePhrases ,overallDate , overallDateFormat, timeFormatter ,reverseTimeFormatter ,dateDivisions ,datePhrases ,mindate ,maxdate ,overallLocation , spaceColNames, spaceColInds, subColInd, valueProcessors):

    print '0'
    exists = []
    check = range(len(collection.columns))
    while check:
        k = check.pop(0)
        rec = collection.find_one( dict(query.items() + [(str(k),{'$exists':True})]))
        if rec:
            rec.pop('_id')
            new = map(int,rec.keys()) 
            check = list(set(check).difference(new))
            check.sort()
            exists += [(pos,collection.columns[pos]) for pos in new]
    
    print '1'
    exists = [e for e in exists if e[1] not in SPECIAL_KEYS] 
    (colnums,colnames) = zip(*exists)
    
    d['columnNames'] = colnames
    d['dimension'] = len(d['columnNames'])
       
    if overallDateFormat:
        d['dateFormat'] = overallDateFormat
        
        if timeColInds:
            dateColVals = ListUnion([collection.find(query).distinct(str(t)) for t in timeColInds if t in colnums])
            if overallDate:
                dateColVals = [timeFormatter(overallDate + reverseTimeFormatter(time)) for time in dateColVals]
        
            dateDivisions += uniqify(ListUnion(map(td.getLowest,dateColVals)))
            datePhrases += uniqify(map(td.phrase, dateColVals))
            mindate = td.makemin(mindate,min(dateColVals),)
            maxdate = td.makemax(maxdate,max(dateColVals),)
      
  
        if timeColNameInds:
            K = [k for (k,j) in enumerate(timeColNameInds) if j in colnums]   # j is the column number
            dateDivisions += uniqify(ListUnion([timeColNameDivisions[k] for k in K]))
            mindate = td.makemin(mindate,min([timeColNames[k] for k in K]),)
            maxdate = td.makemax(maxdate,max([timeColNames[k] for k in K]),)
            datePhrases += [timeColNamePhrases[k] for k in K]

        dateDivisions = uniqify(dateDivisions)
        datePhrases = uniqify(datePhrases)
        
        d['beginDate'] = td.convertToDT(mindate)
        d['endDate'] = td.convertToDT(maxdate,convertMode='High')
        d['dateDivisions'] = uniqify(dateDivisions)
        d['datePhrases'] = datePhrases
    
    print '2'    
    if spaceColInds:
        spaceColVals = ListUnion([collection.find(query).distinct(str(t)) for t in spaceColInds if t in colnums])
        spaceColVals = [loc.integrate(overallLocation,scv) for scv in spaceColVals]   
    else:
        spaceColVals = []
    spaceVals = spaceColNames + spaceColVals 
    if spaceVals:
        d['spatialDivisions'] = uniqify(ListUnion(map(loc.divisions,spaceVals)))
        d['spatialDivisionsTight'] = uniqify(ListUnion(map(loc.divisions2,spaceVals)))
        d['spatialPhrases'] = uniqify(map(loc.phrase,spaceVals))
        d['spatialPhrasesTight'] = uniqify(map(loc.phrase2,spaceVals))
    commonLocation = overallLocation
    for sv in spaceVals:
        commonLocation = loc.intersect(commonLocation,sv)
        if not commonLocation:
            break 
    if commonLocation:
        d['commonLocation'] = loc.phrase(commonLocation)
                
    print '3'    

    d['sliceContents'] = ' '.join(uniqify(ListUnion([translate_list(valueProcessors.get(x,None) ,map(decode_obj,collection.find(query).distinct(x))) for x in contentColNums])))

    return d
Example #13
def smallAdd(d,query,collection,contentColNums, timeColInds ,timeColNames , timeColNameInds ,timeColNameDivisions ,timeColNamePhrases ,overallDate, overallDateFormat, timeFormatter ,reverseTimeFormatter ,dateDivisions ,datePhrases ,mindate ,maxdate ,overallLocation , spaceColNames , spaceColInds ,subColInd, valueProcessors,slicecount):

    R = collection.find(query,timeout=False)
    colnames = []
    d['sliceContents'] = []
    Subcollections = []
    
    spaceVals = spaceColNames[:]   # copy: appended to in the loop below
    commonLocation = overallLocation    
    for sv in spaceColNames:
        commonLocation = loc.intersect(commonLocation,sv)
        if not commonLocation:
            break     
  
    for (i,r) in enumerate(R):
        d['sliceContents'].append(' '.join([translate(valueProcessors.get(x,None),decode_obj(rgetattr(r,x.split('.')))) if rhasattr(r,x.split('.')) else '' for x in contentColNums]))
                      
        colnames  = uniqify(colnames + r.keys())
        
        if subColInd:
            Subcollections += r[str(subColInd)]
                
        if timeColInds:
            for x in timeColInds:
                if str(x) in r.keys():
                    time = r[str(x)]
                    if overallDate:
                        time = timeFormatter(overallDate + reverseTimeFormatter(time))
                    dateDivisions += td.getLowest(time)
                    datePhrases.append(td.phrase(time))     
                    mindate = td.makemin(mindate,time)
                    maxdate = td.makemax(maxdate,time)
        if spaceColInds:
            for x in spaceColInds:
                if str(x) in r.keys():
                    location = loc.integrate(overallLocation,r[str(x)])
                    commonLocation = loc.intersect(commonLocation,r[str(x)]) if commonLocation != None else None
                    spaceVals.append(location)
    
    d['sliceContents'] = ' '.join(d['sliceContents'])
    Subcollections = uniqify(Subcollections)
    d['columnNames'] = [collection.columns[int(x)] for x in colnames if x.isdigit()]
    d['dimension'] = len(d['columnNames'])
    #time/date
        
    if overallDateFormat:
        d['dateFormat'] = overallDateFormat
        
        if 'timeColNames' in collection.columnGroups.keys():
            K = [k for (k,j) in enumerate(timeColNameInds) if str(j) in colnames]
            dateDivisions += uniqify(ListUnion([timeColNameDivisions[k] for k in K]))
            mindate = td.makemin(mindate,min([timeColNames[k] for k in K]))
            maxdate = td.makemax(maxdate,max([timeColNames[k] for k in K]))         
            datePhrases += uniqify([timeColNamePhrases[k] for k in K])
        
        d['beginDate'] = td.convertToDT(mindate)
        d['endDate'] = td.convertToDT(maxdate,convertMode='High')
        d['dateDivisions'] = uniqify(dateDivisions)
        d['datePhrases'] = datePhrases if d['volume'] < 10000 else uniqify(datePhrases)

    if spaceVals:
        d['spatialDivisions'] = uniqify(ListUnion(map(loc.divisions,spaceVals)))
        d['spatialDivisionsTight'] = uniqify(ListUnion(map(loc.divisions2,spaceVals)))
        d['spatialPhrases'] = uniqify(map(loc.phrase,spaceVals))
        d['spatialPhrasesTight'] = uniqify(map(loc.phrase2,spaceVals))
        
    return d
Example #14
def addToIndex(q,d,collection,solr_interface,slicecount,contentColNums = None, timeColInds=None,timeColNames=None, timeColNameInds = None,timeColNameDivisions = None,timeColNamePhrases=None,overallDate = '', overallDateFormat = '', timeFormatter = None,reverseTimeFormatter = None,dateDivisions=None,datePhrases=None,mindate = None,maxdate = None,overallLocation = None, spaceColNames = None, spaceColInds = None,subColInd = None,Return=False,valueProcessors=None,valueProcessorsKey=None):


    q['__versionNumber__'] = collection.currentVersion
    query = processArg(q,collection)
    q.pop('__versionNumber__') 
    
    d['collectionName'] = collection.name
    
    d['query'] = json.dumps(q,default=ju.default)
    
    d['mongoID'] = mongoID(q,collection.name)
    
    d['mongoText'] = queryToText(q,valueProcessorsKey)    
    
    d['sliceValues'] = queryValues(q,valueProcessorsKey)
    
    d['sliceKeys'] = queryKeys(q,valueProcessorsKey)

    d['versionNumber'] = collection.currentVersion

    if dateDivisions is None:
        dateDivisions = []
    else:
        dateDivisions = dateDivisions[:]
    if datePhrases is None:
        datePhrases = []
    else:
        datePhrases = datePhrases[:]
    if spaceColNames is None:
        spaceColNames = []
    if contentColNums is None:
        contentColNums = []
    
        
    #stats
    d['volume'] = collection.find(query).count()
    
    contentColNums = [i for i in contentColNums if i not in query.keys()]
    
    if d['volume'] > 0:
        if d['volume'] < 5000:
            smallAdd(d,query,collection,contentColNums, timeColInds ,timeColNames , timeColNameInds ,timeColNameDivisions ,timeColNamePhrases ,overallDate , overallDateFormat, timeFormatter ,reverseTimeFormatter ,dateDivisions ,datePhrases ,mindate ,maxdate ,overallLocation , spaceColNames , spaceColInds ,subColInd, valueProcessors,slicecount)
        else:
            largeAdd(d,query,collection,contentColNums,  timeColInds ,timeColNames , timeColNameInds ,timeColNameDivisions ,timeColNamePhrases ,overallDate, overallDateFormat, timeFormatter ,reverseTimeFormatter ,dateDivisions ,datePhrases ,mindate ,maxdate ,overallLocation , spaceColNames , spaceColInds ,subColInd, valueProcessors)
    
        Subcollections = uniqify(ListUnion(collection.find(query).distinct(str(subColInd))))
        metadata = collection.metadata['']
        for sc in Subcollections:
            metadata.update(collection.metadata.get(sc,{}))
        for k in metadata.keys():
            if k in STANDARD_META:
                if k in STANDARD_META_FORMATS.keys():
                    val = coerceToFormat(metadata[k],STANDARD_META_FORMATS[k])
                    if val:
                        d[str(k)] = val
                else:
                    d[str(k)] = str(metadata[k])
    
        if Return:
            return d
        else:
            solr_interface.add(**d)
Example #15
def getSliceColTuples(sliceCols):
    sliceColList = sliceCols
    sliceColTuples = uniqify(ListUnion([subTuples(tuple(sc)) for sc in sliceColList]))
    return sliceColTuples
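subTuples is another helper that never appears on this page. From its use (expanding each slice-column tuple into the candidate slice keys) it plausibly enumerates all non-empty, order-preserving sub-tuples; that reading is a guess. A hypothetical sketch:

from itertools import combinations

def subTuples(t):
    # Guessed behavior: every non-empty sub-tuple of t, order preserved
    return [c for r in range(1, len(t) + 1) for c in combinations(t, r)]

print(subTuples(('state', 'year')))
# [('state',), ('year',), ('state', 'year')]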
Example #16
def updateCollection(download_dir,collectionName,parserClass,checkpath,certpath,parserArgs=None,parserKwargs=None,incremental=False):
    
    connection =  pm.Connection(document_class=pm.son.SON)
    
    source_metadata = get_source_data(collectionName)
    
    db = connection['govdata']
    assert not '__' in collectionName, 'collectionName must not contain consecutive underscores'
    metaCollectionName = '__' + collectionName + '__'
    versionName = '__' + collectionName + '__VERSIONS__'
    sliceDBName =  '__' + collectionName + '__SLICES__'
    
    collection = db[collectionName]
    metacollection = db[metaCollectionName]
    versions = db[versionName]     
    sliceDB = db[sliceDBName]
            
    if incremental:     
        if versionName not in db.collection_names():
            startInc = 0
        else:
            startInc = get_max_increment_fromDB(versions) + 1
        endInc = get_max_increment(download_dir)
        sources = [increment_format(download_dir,i) for i in range(startInc,endInc + 1)]
    else:
        sources = [download_dir]
        startInc = endInc = None
        
    if parserArgs is None:
        parserArgs = ()
    if parserKwargs is None:
        parserKwargs = {}
        
    if sources:
        iterator = parserClass(sources[0],*parserArgs,**parserKwargs)
        iterator.set_source_metadata(source_metadata)
    
        uniqueIndexes = iterator.uniqueIndexes
        ColumnGroups = iterator.columnGroups
        
        sliceColTuples = getSliceColTuples(iterator.sliceCols)
        sliceColTuplesFlat = uniqify([tuple(sorted(uniqify(Flatten(sct)))) for sct in sliceColTuples])
      
        sliceColList = uniqify(Flatten(ListUnion(sliceColTuples)))
        ContentCols = set(sliceColList + getContentCols(iterator))
            
        if hasattr(iterator,'dateFormat'):
            TimeFormatter = td.mongotimeformatter(iterator.dateFormat)
            
    
        if collectionName in db.collection_names():
            versionNumber = max(versions.distinct('versionNumber')) + 1
            storedAllMetadata = metacollection.find_one({'name':'','versionNumber':versionNumber-1})
            totalVariables = storedAllMetadata['columns']
            VarMap = dict(zip(totalVariables,[str(x) for x in range(len(totalVariables))]))   
            
            #check that things are the same
            #and check consistency, for all sources
            
        else:
            versionNumber = 0
            IndexCols = uniqify([x for x in ['subcollections'] + sliceColList + ListUnion([ColGroupsFlatten(ColumnGroups,k) for k in ['indexColumns','labelColumns','timeColumns','spaceColumns']]) if x not in uniqueIndexes])
            
            totalVariables = SPECIAL_KEYS + uniqueIndexes + IndexCols
            
            assert not any(['.' in x or ('__' in x and x not in SPECIAL_KEYS) or x in ColumnGroups.keys() for x in totalVariables])
            
            VarMap = dict(zip(totalVariables,map(str,range(len(totalVariables)))))  
            
            cols = zip([VarMap[c] for c in uniqueIndexes + ['__versionNumber__']],[pm.DESCENDING]*(len(uniqueIndexes) + 1))
            collection.ensure_index(cols,unique=True,dropDups=True)
    
            for col in IndexCols:
                collection.ensure_index(VarMap[col])
            
            sliceDB.ensure_index('slice',unique=True,dropDups=True)
                            
        vNInd = VarMap['__versionNumber__']
        retInd = VarMap['__retained__']
        
        specialKeyInds = [VarMap[k] for k in SPECIAL_KEYS]
    
        if 'timeColumns' in iterator.columnGroups.keys():
            tcs = iterator.columnGroups['timeColumns']
        else:
            tcs = []
        
        if 'spaceColumns' in iterator.columnGroups.keys():
            spcs = iterator.columnGroups['spaceColumns']
        else:
            spcs = []
                  
        toParse = ListUnion([RecursiveFileList(source + '__PARSE__') for source in sources])
            
        oldc = None
        SpaceCache = {}    
        volumes = {'':0} 
        dimensions = {'':[]}
        times = {'':[]}
        locations = {'':[]}
        varFormats = {}
        for file in toParse:
            iterator.refresh(file)
            checkMetadata(iterator)
            tcs = iterator.columnGroups.get('timeColumns',[])
            spcs = iterator.columnGroups.get('spaceColumns',[])
            index = 0
            for c in iterator: 
                newVars = [x for x in c.keys() if x not in totalVariables]
                assert not any(['__' in x or '.' in x or x in ColumnGroups.keys() for x in newVars]), '__ and . must not appear in key names.'
                totalVariables += newVars
                VarMap.update(dict(zip(newVars,map(str,range(len(totalVariables) - len(newVars),len(totalVariables))))))
                
                for tc in tcs:   #time handling 
                    if tc in c.keys():
                        c[tc] = TimeFormatter(c[tc])
                if COMPLETE_SPACE:        
                    for spc in spcs:
                        if spc in c.keys():   #space
                            t = getT(c[spc])
                            if t in SpaceCache.keys():
                                c[spc] = SpaceCache[t].copy()
                            else:
                                c[spc] = loc.SpaceComplete(c[spc])
                                SpaceCache[t] = c[spc].copy()      
                if index % 100 == 0:
                    print 'At', index
                index += 1
                sctf = processSct(sliceColTuplesFlat,oldc,c)
                processRecord(c,collection,VarMap,totalVariables,uniqueIndexes,versionNumber,specialKeyInds,incremental,sliceDB,sctf,ContentCols)
                incrementThings(c,volumes,dimensions,times,locations,varFormats,tcs,spcs)
                
                oldc = c
                
        any_deleted = False                
        if incremental:
            collection.update({vNInd:{'$lte': versionNumber - 1}}, {'$set':{vNInd:versionNumber}})                    
            sliceDB.update({},{'$set':{'version':versionNumber}})  
   
        else:
            deleted = collection.find({vNInd:versionNumber - 1, retInd : {'$exists':False}})
            for d in deleted:
                any_deleted = True
                sliceDelete(d,collection,sliceColTuples,VarMap,sliceDB,versionNumber)
                                   
        if any_deleted:
            subColInd = str(totalVariables.index('subcollections'))
            subcols = [''] + uniqify(ListUnion(collection.distinct(subColInd)))
            for sc in subcols:
                volumes[sc] = collection.find({subColInd:sc}).count()
                dimensions[sc] = [k for k in totalVariables if collection.find_one({subColInd:sc,str(totalVariables.index(k)):{'$exists':True}})]
                times[sc] = ListUnion([collection.find({subColInd:sc}).distinct(t) for t in tcs])
                locations[sc] = ListUnion([collection.find({subColInd:sc}).distinct(t) for t in spcs])
                
               
        updateMetacollection(iterator,metacollection,incremental,versionNumber,totalVariables,tcs,spcs,volumes,dimensions,times,locations,varFormats)
        
        updateAssociatedFiles(sources,collection)
        
        updateVersionHistory(versionNumber,versions,startInc,endInc)
    
    updateSourceDBFromCollections(collectionNames = [collectionName])
    connection.disconnect()
    createCertificate(certpath,'Collection ' + collectionName + ' written to DB.')