def run(options, inputData):
    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['cassandra']['local']['username']
    password = config['cassandra']['local']['password']
    keyspace = config['cassandra']['local']['keyspace']
    hostname = config['cassandra']['local']['hostname']

    db = {'username': username,
          'password': password,
          'keyspace': keyspace,
          'hostname': hostname}

    # Get n lightcurves. Consider doing this in parallel for a proper test.
    # As an initial test, run it single threaded.

    # We have the inputData; get a random subset.
    subset = inputData
    if len(inputData) > int(options.number):
        subset = random.sample(inputData, int(options.number))

    if int(options.nprocesses) > 1 and len(subset) > 1:
        # Do it in parallel!
        currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
        (year, month, day, hour, min, sec) = currentDate.split(':')
        dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

        nProcessors, listChunks = splitList(subset, bins=int(options.nprocesses), preserveOrder=True)

        print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess(db, dateAndTime, nProcessors, listChunks, worker, miscParameters=[options], drainQueues=False)
        print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
    else:
        cluster = Cluster(db['hostname'])
        session = cluster.connect()
        session.row_factory = dict_factory
        session.set_keyspace(db['keyspace'])

        lightcurves = getLCByObject(options, session, subset)
        # for k, v in lightcurves.items():
        #     print(k, v)

        cluster.shutdown()
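# NOTE: splitList and parallelProcess are helpers imported from elsewhere in
# this codebase; their real implementations are not shown here. As a reading
# aid for the calls above, the sketch below captures the contract this module
# assumes: splitList divides a list into at most `bins` chunks and returns
# (numberOfChunks, chunks), which the callers unpack as (nProcessors,
# listChunks) before parallelProcess farms the chunks out to worker
# processes. This is an illustrative assumption, not the library code.
def _splitListSketch(inputList, bins=8, preserveOrder=False):
    # Hypothetical stand-in for the imported splitList; illustrative only.
    if not inputList:
        return 0, []
    bins = max(1, min(bins, len(inputList)))
    if preserveOrder:
        # Contiguous slices keep the original ordering within and across chunks.
        size = -(-len(inputList) // bins)  # ceiling division
        chunks = [inputList[i:i + size] for i in range(0, len(inputList), size)]
    else:
        # Round-robin assignment balances the chunk sizes.
        chunks = [c for c in (inputList[i::bins] for i in range(bins)) if c]
    return len(chunks), chunks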
def ingestDataMultiprocess(options):
    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    nProcessors, fileSublist = splitList(options.inputFile, bins=int(options.nprocesses), preserveOrder=True)

    print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
    parallelProcess([], dateAndTime, nProcessors, fileSublist, workerIngest, miscParameters=[options], drainQueues=False)
    print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
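# Aside: the strftime-split-rejoin idiom above builds a "YYYYMMDD_HHMMSS" run
# identifier; datetime.now().strftime("%Y%m%d_%H%M%S") would produce the same
# string in one step. The multi-step form is kept because every entry point
# in this module uses the same idiom.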
def ingestDataMultiprocess(options, fkDict=None):
    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    # Read the contents of the input file(s) to get the filenames to process.
    files = options.inputFile
    if options.fileoffiles:
        files = []
        for f in options.inputFile:
            with open(f) as fp:
                content = fp.readlines()
            content = [filename.strip() for filename in content]
            files += content
        print(files)

    nProcessors, fileSublist = splitList(files, bins=int(options.nfileprocesses), preserveOrder=True)

    print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
    parallelProcess([], dateAndTime, nProcessors, fileSublist, workerIngest, miscParameters=[options, fkDict], drainQueues=False)
    print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
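# Example (hypothetical paths): with --fileoffiles set, each entry in
# options.inputFile is itself a manifest listing one data file per line, e.g.
#
#   /data/ztf/batch_001.csv
#   /data/ztf/batch_002.csv.gz
#
# and the expanded list is what gets chunked across the ingest workers.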
def ingestData(options, inputFiles):
    generateHtmidBulk = which('generate_htmid_bulk')
    if generateHtmidBulk is None:
        sys.stderr.write("Can't find the generate_htmid_bulk executable, so cannot continue.\n")
        exit(1)

    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    db = {'username': username,
          'password': password,
          'database': database,
          'hostname': hostname}

    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    for inputFile in inputFiles:
        print("Ingesting %s" % inputFile)

        if '.gz' in inputFile:
            # It's probably gzipped.
            f = gzip.open(inputFile, 'rb')
            print(type(f).__name__)
        else:
            f = inputFile

        data = readGenericDataFile(f, delimiter=',', useOrderedDict=True)

        pid = os.getpid()
        tempRADecFile = '/tmp/' + os.path.basename(inputFile) + 'radec_' + str(pid)
        tempLoadFile = '/tmp/' + os.path.basename(inputFile) + '_' + str(pid) + '.csv'

        # Write text, not bytes, and don't clobber the input file handle above.
        with open(tempRADecFile, 'w') as radecFile:
            for row in data:
                radecFile.write('%s %s\n' % (row['ra'], row['dec']))

        htm10IDs = calculate_htm_ids_bulk(generateHtmidBulk, 10, tempRADecFile)
        htm13IDs = calculate_htm_ids_bulk(generateHtmidBulk, 13, tempRADecFile)
        htm16IDs = calculate_htm_ids_bulk(generateHtmidBulk, 16, tempRADecFile)

        os.remove(tempRADecFile)

        for i in range(len(data)):
            # Add the HTM IDs to the data.
            data[i]['htm10ID'] = htm10IDs[i]
            data[i]['htm13ID'] = htm13IDs[i]
            data[i]['htm16ID'] = htm16IDs[i]

        nprocesses = int(options.nprocesses)

        if len(data) > 0:
            nProcessors, listChunks = splitList(data, bins=nprocesses, preserveOrder=True)

            print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerInsert, miscParameters=[options], drainQueues=False)
            print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
def ingestData(options, inputFiles, fkDict=None):
    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['cassandra']['local']['username']
    password = config['cassandra']['local']['password']
    keyspace = config['cassandra']['local']['keyspace']
    hostname = config['cassandra']['local']['hostname']

    db = {'username': username,
          'password': password,
          'keyspace': keyspace,
          'hostname': hostname}

    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    delimiter = options.tableDelimiter
    if delimiter == '\\s':
        delimiter = ' '
    if delimiter == '\\t':
        delimiter = '\t'

    for inputFile in inputFiles:
        print("Ingesting %s" % inputFile)

        if '.gz' in inputFile:
            # It's probably gzipped.
            f = gzip.open(inputFile, 'rb')
            print(type(f).__name__)
        else:
            f = inputFile

        if 'avro' in inputFile:
            # Data is in Avro packets, with schema. Let's hard-wire to the ZTF schema for the time being.
            avroData = readZTFAvroPacket(f, addhtm16=True)
            if 'noncandidates' in options.table:
                data = avroData['noncandidates']
            elif 'candidates' in options.table:
                data = avroData['candidates']
            else:
                print("Error: incorrect table definition for Avro packets. Must contain candidates or noncandidates.")
                exit(1)
        else:
            # Data is in a plain text file. No schema present, so we will need
            # to provide the column types.
            data = readGenericDataFile(f, delimiter=delimiter, useOrderedDict=True)

        # 2021-07-29 KWS This is a bit inefficient, but trim the data down to the specified columns if they are present.
        if options.columns:
            trimmedData = []
            for row in data:
                trimmedRow = {key: row[key] for key in options.columns.split(',')}
                trimmedData.append(trimmedRow)
            data = trimmedData

        foreignKey = options.fkfrominputdata
        if foreignKey == 'filename':
            foreignKey = os.path.basename(inputFile).split('.')[0]

        if fkDict:
            for i in range(len(data)):
                try:
                    if options.fktablecols:
                        # Just pick out the specified keys.
                        keys = options.fktablecols.split(',')
                        for k in keys:
                            data[i][k] = fkDict[foreignKey][k]
                    else:
                        # Use all the keys by default.
                        for k, v in fkDict[foreignKey].items():
                            data[i][k] = v
                except KeyError as e:
                    pass

        #print(data[0])
        pid = os.getpid()

        if not options.skiphtm:
            coords = []
            for row in data:
                coords.append([float(row[options.racol]), float(row[options.deccol])])
            htm16Names = htmNameBulk(16, coords)

            # For Cassandra, we're going to split the HTM name across several columns.
            # Furthermore, we only need to do this once, for the deepest HTM level,
            # because the shallower names are always prefixes of it. Hence we only
            # need to store the tail end of the HTM name in the actual HTM 16 column.
            # So... we store the full HTM 10 name as the first 12 characters of the
            # HTM 16 one, then the next 3 characters in the HTM 13 column, then the
            # next 3 characters (i.e. the last few) in the HTM 16 column.
            # e.g.:
            # ra, dec = 288.70392, 9.99498
            # HTM 10 = N02323033011
            # HTM 13 = N02323033011 211
            # HTM 16 = N02323033011 211 311
            # (See the standalone demo after this function.)
            # Incidentally, this hierarchy also works in binary, and we should
            # seriously reconsider how we are currently using HTMs.
            # HTM10 ID = 13349829    = 11 00 10 11 10 11 00 11 11 00 01 01
            # HTM13 ID = 854389093   = 11 00 10 11 10 11 00 11 11 00 01 01 10 01 01
            # HTM16 ID = 54680902005 = 11 00 10 11 10 11 00 11 11 00 01 01 10 01 01 11 01 01

            for i in range(len(data)):
                # Add the HTM IDs to the data.
                data[i]['htm10'] = htm16Names[i][0:12]
                data[i]['htm13'] = htm16Names[i][12:15]
                data[i]['htm16'] = htm16Names[i][15:18]

        nprocesses = int(options.nprocesses)

        if len(data) > 0:
            nProcessors, listChunks = splitList(data, bins=nprocesses, preserveOrder=True)

            print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerInsert, miscParameters=[options], drainQueues=False)
            print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
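# A quick standalone demonstration of the HTM column split described in
# ingestData above, using the example values from its comment (runnable on
# its own; it does not call htmNameBulk):
def _htmSplitDemo():
    htm16Name = 'N02323033011211311'  # HTM 16 name for ra, dec = 288.70392, 9.99498
    htm10 = htm16Name[0:12]           # 'N02323033011' - full HTM 10 name
    htm13 = htm16Name[12:15]          # '211'          - HTM 13 suffix
    htm16 = htm16Name[15:18]          # '311'          - HTM 16 suffix
    # Reassembling the prefixes recovers the deeper names:
    assert htm10 + htm13 == 'N02323033011211'  # the HTM 13 name
    assert htm10 + htm13 + htm16 == htm16Name  # the HTM 16 name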
def main(argv=None):
    """main.

    Args:
        argv:
    """
    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)
    options = Struct(**opts)

    configFile = options.configfile

    import yaml
    with open(configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    MAX_NUMBER_OF_OBJECTS = int(config['postage_stamp_parameters']['max_number_of_objects'])

    db = []
    db.append(username)
    db.append(password)
    db.append(database)
    db.append(hostname)

    detectionList = 1
    customList = None

    conn = dbConnect(hostname, username, password, database)

    update = options.update
    limit = int(options.limit)
    limitafter = int(options.limitafter)

    mlscore = None
    if options.mlscore is not None:
        mlscore = float(options.mlscore)

    objectList = []

    flagDate = '2015-12-20'
    if options.flagdate is not None:
        try:
            flagDate = '%s-%s-%s' % (options.flagdate[0:4], options.flagdate[4:6], options.flagdate[6:8])
        except Exception:
            flagDate = '2015-12-20'

    if options.candidate is not None and len(options.candidate) > 0:
        for cand in options.candidate:
            obj = getATLASObject(conn, objectId=int(cand))
            if obj:
                objectList.append(obj)
    else:
        if options.customlist is not None:
            if int(options.customlist) > 0 and int(options.customlist) < 100:
                customList = int(options.customlist)
                objectList = getObjectsByCustomList(conn, customList, processingFlags=0)
            else:
                print("The list must be between 1 and 100 inclusive. Exiting.")
                sys.exit(1)
        else:
            if options.detectionlist is not None:
                if int(options.detectionlist) >= 0 and int(options.detectionlist) < 9:
                    detectionList = int(options.detectionlist)
                    objectList = getObjectsByList(conn, listId=detectionList, dateThreshold=flagDate, processingFlags=0)
                else:
                    print("The list must be between 0 and 8 inclusive. Exiting.")
                    sys.exit(1)

    print("LENGTH OF OBJECTLIST = ", len(objectList))

    # Only do this filter if the IDs are not provided explicitly.
    if mlscore is not None and not options.candidate:
        updatedList = []
        for row in objectList:
            if row['zooniverse_score'] is not None and row['zooniverse_score'] >= mlscore:
                updatedList.append(row)
        if len(updatedList) > 0:
            objectList = updatedList
            print("LENGTH OF CLIPPED OBJECTLIST = ", len(objectList))

    currentDate = datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    # Single threaded
    #perObjectExps, exposureSet = getForcedPhotometryUniqueExposures(conn, objectList, discoveryLimit = limit, ddc = options.ddc, useFlagDate = options.useflagdate)
    perObjectExps, exposureSet = getForcedPhotometryUniqueExposures(conn,
                                                                    objectList,
                                                                    discoveryLimit=limit,
                                                                    cutoffLimit=limitafter,
                                                                    ddc=options.ddc,
                                                                    useFlagDate=options.useflagdate)
    if options.test:
        for obj in objectList:
            print(obj['id'])
            for exp in perObjectExps[obj['id']]['exps']:
                print(exp)
        return 0

    # We'll hand the entire perObjectExps dictionary to each thread.

    # Download threads with multiprocessing - try 10 threads by default
    print("TOTAL OBJECTS = %d" % len(exposureSet))
    print("Downloading exposures...")

    if not options.skipdownload:
        if len(exposureSet) > 0:
            nProcessors, listChunks = splitList(exposureSet, bins=int(options.downloadthreads))

            print("%s Parallel Processing..." % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerExposureDownloader, miscParameters=[options], drainQueues=False)
            print("%s Done Parallel Processing" % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

            # Belt and braces - try again with one less thread, just in case the previous one failed.
            nProcessors, listChunks = splitList(exposureSet, bins=int(options.downloadthreads) - 1)

            print("%s Parallel Processing..." % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerExposureDownloader, miscParameters=[options], drainQueues=False)
            print("%s Done Parallel Processing" % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

    # Produce stamps with multiprocessing - try n(CPUs) threads by default
    print("Doing Forced Photometry...")

    if len(objectList) > 0:
        nProcessors, listChunks = splitList(objectList)

        print("%s Parallel Processing..." % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        objectsForUpdate = parallelProcess(db, dateAndTime, nProcessors, listChunks, workerForcedPhotometry, miscParameters=[options, perObjectExps])
        print("%s Done Parallel Processing" % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

        if len(objectsForUpdate) > 0 and update:
            insertForcedPhotometry(conn, objectsForUpdate)

    conn.close()

    return 0
def main():
    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)
    options = Struct(**opts)

    configFile = options.configfile
    regex = options.regex

    import yaml
    with open(configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    db = []
    db.append(username)
    db.append(password)
    db.append(database)
    db.append(hostname)

    conn = dbConnect(hostname, username, password, database)

    warnings.filterwarnings("ignore")

    # Parse command line
    currentDate = datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    pid = int(options.pid)
    maxjobs = int(options.maxjobs)
    days = int(options.days)
    camera = options.camera
    try:
        mjdToIngest = options.mjd
    except TypeError as e:
        mjdToIngest = None

    print("camera =", camera)
    print("regex =", regex)

    todayMJD = getCurrentMJD()

    # Use + 1 to include today!
    mjdthreshold = int(todayMJD) - days + 1

    # A specified MJD trumps the MJD threshold, so just go as far back
    # as the specified date.
    if mjdToIngest:
        mjdthreshold = int(mjdToIngest[0:5]) - 1

    ingester = options.ingester

    fileList = getFiles(regex, camera, mjdToIngest=mjdToIngest, mjdthreshold=mjdthreshold, days=days, atlasroot=options.atlasroot, options=options)
    ingestedFiles = getFilesIngestedddc2(conn, mjdthreshold=mjdthreshold, camera=camera)

    fileListDict = OrderedDict()

    print("List of files...")
    for row in fileList:
        fileListDict[os.path.basename(row)] = row
        print(row)

    print("List of ingested files...")
    for row in ingestedFiles:
        print(row)

    filesToIngest = [fileListDict[x] for x in list(set(fileListDict.keys()) - set(ingestedFiles))]
    filesToIngest.sort()

    print("List of files to ingest...")
    for row in filesToIngest:
        print(row)

    print("TOTAL OBJECTS TO CHECK = %d" % len(filesToIngest))

    if len(fileList) > 0:
        # 2018-02-06 KWS Use half the default number of processes. This may ironically speed up ingest.
        nProcessors, listChunks = splitList(filesToIngest, bins=28)

        print("%s Parallel Processing..." % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess(db, dateAndTime, nProcessors, listChunks, worker, miscParameters=[options], drainQueues=False)
        print("%s Done Parallel Processing" % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

    conn.close()

    return 0
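# The skip-already-ingested logic above is a plain set difference on file
# basenames. A tiny illustration with made-up filenames:
def _filesToIngestDemo():
    fileListDict = {'02a58000o0001c.ddc': '/atlas/red/02a/58000/02a58000o0001c.ddc',
                    '02a58000o0002c.ddc': '/atlas/red/02a/58000/02a58000o0002c.ddc'}
    ingestedFiles = ['02a58000o0001c.ddc']  # already in the database
    filesToIngest = [fileListDict[x] for x in set(fileListDict.keys()) - set(ingestedFiles)]
    filesToIngest.sort()
    assert filesToIngest == ['/atlas/red/02a/58000/02a58000o0002c.ddc']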
def main(argv=None):
    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)

    # Use utils.Struct to convert the dict into an object for compatibility with old optparse code.
    options = Struct(**opts)

    #keyspace = 'atlas'
    #host = ['db0', 'db1', 'db2', 'db3', 'db4']

    # random star
    #ra = 83.20546
    #dec = -20.70055

    # ATLAS17nij
    #ra = 82.46704
    #dec = -19.52058

    # ATLAS20biio
    #ra = 83.24691
    #dec = -19.11739

    # ATLAS20bbio - very good!!
    #ra = 81.27903
    #dec = -21.24643

    # ATLAS18vre
    #ra = 84.19551
    #dec = -22.41100

    # ATLAS19bdbm
    #ra = 85.10436
    #dec = -18.09766

    # ATLAS20bbff
    #ra = 86.52075
    #dec = -23.56601

    # ATLAS20ymv - THIS IS the CENTRE OBJECT. We did a 10 degree sweep around this.
    #ra = 74.55677
    #dec = -20.35753

    # ATLAS17lvn - bright foreground star
    #ra = 68.75953
    #dec = -14.22797

    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['cassandra']['local']['username']
    password = config['cassandra']['local']['password']
    keyspace = config['cassandra']['local']['keyspace']
    hostname = config['cassandra']['local']['hostname']

    db = {'username': username,
          'password': password,
          'keyspace': keyspace,
          'hostname': hostname}

    coordslist = []

    if options.coordsfromfile:
        coordslist = readGenericDataFile(options.coords, delimiter=',')
    else:
        coordslist.append({'ra': options.coords.split(',')[0], 'dec': options.coords.split(',')[1]})

    if options.number and int(options.number) < len(coordslist):
        coordslist = random.sample(coordslist, int(options.number))

    if int(options.nprocesses) > 1 and len(coordslist) > 1:
        # Do it in parallel!
        currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
        (year, month, day, hour, min, sec) = currentDate.split(':')
        dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

        nProcessors, listChunks = splitList(coordslist, bins=int(options.nprocesses), preserveOrder=True)

        print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess(db, dateAndTime, nProcessors, listChunks, worker, miscParameters=[options], drainQueues=False)
        print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
    else:
        cluster = Cluster(db['hostname'])
        session = cluster.connect()
        session.row_factory = dict_factory
        session.set_keyspace(db['keyspace'])

        getLCData(options, session, coordslist)

        cluster.shutdown()
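# Example coordinate input (hypothetical file contents): with --coordsfromfile,
# options.coords names a delimited file read by readGenericDataFile, assumed
# here to carry a header row with 'ra' and 'dec' columns, e.g.
#
#   ra,dec
#   83.20546,-20.70055
#   82.46704,-19.52058
#
# Otherwise options.coords is a literal "ra,dec" string such as
# "83.20546,-20.70055".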
def main():
    """main.
    """
    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)
    options = Struct(**opts)

    configFile = options.configfile

    import yaml
    with open(configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    MAX_NUMBER_OF_OBJECTS = int(config['postage_stamp_parameters']['max_number_of_objects'])

    db = []
    db.append(username)
    db.append(password)
    db.append(database)
    db.append(hostname)

    detectionList = 1
    customList = None

    conn = dbConnect(hostname, username, password, database)

    update = options.update
    limit = int(options.limit)
    mostRecent = not options.earliest
    nondetections = options.nondetections
    discoverylimit = int(options.discoverylimit)
    lastdetectionlimit = int(options.lastdetectionlimit)

    objectList = []

    try:
        requestType = REQUESTTYPES[options.requesttype]
    except KeyError as e:
        requestType = REQUESTTYPES['incremental']

    print("REQUEST TYPE = ", requestType)

    flagDate = '2015-12-20'
    if options.flagdate is not None:
        try:
            flagDate = '%s-%s-%s' % (options.flagdate[0:4], options.flagdate[4:6], options.flagdate[6:8])
        except Exception:
            flagDate = '2015-12-20'

    if options.candidate is not None and len(options.candidate) > 0:
        for cand in options.candidate:
            objectList.append({'id': int(cand)})
    else:
        if options.customlist is not None:
            if int(options.customlist) > 0 and int(options.customlist) < 100:
                customList = int(options.customlist)
                objectList = getObjectsByCustomList(conn, customList)
            else:
                print("The list must be between 1 and 100 inclusive. Exiting.")
                sys.exit(1)
        else:
            if options.detectionlist is not None:
                if int(options.detectionlist) >= 0 and int(options.detectionlist) < 9:
                    detectionList = int(options.detectionlist)
                    objectList = getObjectsByList(conn, listId=detectionList, dateThreshold=flagDate)
                else:
                    print("The list must be between 0 and 8 inclusive. Exiting.")
                    sys.exit(1)

    currentDate = datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    if len(objectList) > MAX_NUMBER_OF_OBJECTS:
        sys.stderr.write("The number of objects (%d) exceeds the maximum allowed (%d). Cannot continue.\n" % (len(objectList), MAX_NUMBER_OF_OBJECTS))
        sys.exit(1)

    # Only download exposures if requested. Otherwise assume we already HAVE the data.
    if not options.skipdownload:
        exposureSet = getUniqueExposures(conn,
                                         objectList,
                                         limit=limit,
                                         mostRecent=mostRecent,
                                         nonDets=nondetections,
                                         discoveryLimit=discoverylimit,
                                         lastDetectionLimit=lastdetectionlimit,
                                         requestType=requestType,
                                         ddc=options.ddc)

        # Download threads with multiprocessing - try 10 threads by default
        print("TOTAL OBJECTS = %d" % len(exposureSet))
        print("Downloading exposures...")

        if len(exposureSet) > 0:
            nProcessors, listChunks = splitList(exposureSet, bins=int(options.downloadthreads))

            print("%s Parallel Processing..." % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerImageDownloader, miscParameters=[options], drainQueues=False)
            print("%s Done Parallel Processing" % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

            # Belt and braces. Do again, with one less thread.
            nProcessors, listChunks = splitList(exposureSet, bins=int(options.downloadthreads) - 1)

            print("%s Parallel Processing..." % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerImageDownloader, miscParameters=[options], drainQueues=False)
            print("%s Done Parallel Processing" % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

    # Produce stamps with multiprocessing - try n(CPUs) threads by default
    print("Producing stamps...")

    if len(objectList) > 0:
        nProcessors, listChunks = splitList(objectList, bins=48)

        print("%s Parallel Processing..." % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess(db, dateAndTime, nProcessors, listChunks, workerStampCutter,
                        miscParameters=[limit, mostRecent, nondetections, discoverylimit, lastdetectionlimit, requestType, options.ddc, options.wpwarp, options],
                        drainQueues=False)
        print("%s Done Parallel Processing" % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

    conn.close()

    return 0