def main(argv = None):
    """Entry point: parse command-line options, optionally preload the
    foreign-key table once, and hand everything to the multiprocess ingester.

    Args:
        argv: unused; options come from docopt/__doc__.
    """
    parsed = cleanOptions(docopt(__doc__, version='0.1'))
    # Use utils.Struct to convert the dict into an object for compatibility
    # with old optparse code.
    options = Struct(**parsed)

    fkDict = {}
    # If we have a foreign key table, read the data once only and pass the
    # lookup to the subprocesses (keyed on the configured fkfield column).
    if options.fktable:
        fkDict = {entry[options.fkfield]: entry
                  for entry in readGenericDataFile(options.fktable, delimiter='\t')}

    ingestDataMultiprocess(options, fkDict = fkDict)
def ingestData(options, inputFiles):
    """Ingest CSV detection files (optionally gzipped) into the local MySQL
    database, attaching HTM level 10/13/16 IDs to every row and inserting in
    parallel.

    Args:
        options: parsed command-line options (needs configFile, nprocesses).
        inputFiles: list of input file names to ingest.

    Fixes over the previous revision:
      * yaml.load() without a Loader is deprecated/unsafe -> yaml.safe_load()
        (consistent with the other ingesters in this file).
      * The RA/Dec scratch file was opened 'wb' but written with str data,
        which raises TypeError on Python 3 -> open in text mode.
      * '.gz' (not 'gz') is used to detect gzipped input, so e.g.
        'gzfile.csv' is not misdetected.
    """
    generateHtmidBulk = which('generate_htmid_bulk')
    if generateHtmidBulk is None:
        sys.stderr.write("Can't find the generate_htmid_bulk executable, so cannot continue.\n")
        exit(1)

    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    db = {'username': username,
          'password': password,
          'database': database,
          'hostname': hostname}

    # Timestamp used to tag this ingest run ('minute' avoids shadowing the
    # builtin min).
    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, minute, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, minute, sec)

    for inputFile in inputFiles:
        print("Ingesting %s" % inputFile)

        if '.gz' in inputFile:
            # It's probably gzipped.
            # NOTE(review): opened in binary mode; assumes readGenericDataFile
            # copes with a bytes stream — TODO confirm ('rt' may be safer).
            f = gzip.open(inputFile, 'rb')
            print(type(f).__name__)
        else:
            f = inputFile

        data = readGenericDataFile(f, delimiter=',', useOrderedDict=True)

        # Per-process scratch file so parallel invocations don't collide.
        pid = os.getpid()
        tempRADecFile = '/tmp/' + os.path.basename(inputFile) + 'radec_' + str(pid)

        # Text mode: we write formatted strings, not bytes.
        with open(tempRADecFile, 'w') as radecFile:
            for row in data:
                radecFile.write('%s %s\n' % (row['ra'], row['dec']))

        # One external call per HTM depth; each returns one ID per input row.
        htm10IDs = calculate_htm_ids_bulk(generateHtmidBulk, 10, tempRADecFile)
        htm13IDs = calculate_htm_ids_bulk(generateHtmidBulk, 13, tempRADecFile)
        htm16IDs = calculate_htm_ids_bulk(generateHtmidBulk, 16, tempRADecFile)

        os.remove(tempRADecFile)

        # Add the HTM IDs to the data.
        for i in range(len(data)):
            data[i]['htm10ID'] = htm10IDs[i]
            data[i]['htm13ID'] = htm13IDs[i]
            data[i]['htm16ID'] = htm16IDs[i]

        nprocesses = int(options.nprocesses)

        if len(data) > 0:
            nProcessors, listChunks = splitList(data, bins=nprocesses, preserveOrder=True)

            print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerInsert, miscParameters=[options], drainQueues=False)
            print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
def _rowCoords(row):
    # Parse a row's coordinates: decimal degrees if possible, otherwise
    # fall back to sexagesimal conversion.
    try:
        return float(row['ra']), float(row['dec'])
    except ValueError:
        return coords_sex_to_dec(row['ra'], row['dec'])


def _reportMatch(options, name, expname, distance = None):
    # Print one matched exposure: the reduced-image path when --red is set
    # and the exposure name parses, otherwise the bare exposure name.
    # distance (degrees), when given, is appended formatted to 2 dp.
    matches = doRegexMatch(expname)
    if matches and options.red:
        target = '/atlas/red/' + matches['camera'] + '/' + matches['mjd'] + '/' + expname + '.fits.fz'
    else:
        target = expname
    if distance is None:
        print(name, target)
    else:
        print(name, target, "%.2f" % distance)


def main(argv = None):
    """Entry point: report which ATLAS exposures cover each input coordinate.

    Two modes: --footprints tests each coordinate against every exposure
    centre's footprint; otherwise a brute-force cone search is run against
    the exposure-centre file. --checkmjd additionally requires the exposure
    MJD to be within --mjdtolerance of the input row's MJD.

    (Refactor: the regex/red-path/print logic was previously duplicated four
    times, with a dead `red = ''` assignment; it now lives in _reportMatch.)
    """
    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)
    # Use utils.Struct to convert the dict into an object for compatibility
    # with old optparse code.
    options = Struct(**opts)

    atlasCentres = readGenericDataFile(options.atlasCentresFile, delimiter='\t')
    atlasRowLen = len(atlasCentres[0].keys())
    inputCoords = readGenericDataFile(options.inputCoordsFile, delimiter=',')

    # Default search radius (degrees) unless a valid override is supplied.
    radius = 3.86
    try:
        radius = float(options.searchradius)
    except ValueError:
        pass

    if options.footprints:
        for row in inputCoords:
            if options.debug:
                print(row)
            ra, dec = _rowCoords(row)
            for r in atlasCentres:
                if not isObjectInsideATLASFootprint(ra, dec, float(r['ra']), float(r['dec'])):
                    continue
                if options.checkmjd and not abs(float(r['mjd']) - float(row['mjd'])) < float(options.mjdtolerance):
                    continue
                _reportMatch(options, row['name'], r['expname'])
    else:
        for row in inputCoords:
            if options.debug:
                print(row)
            ra, dec = _rowCoords(row)
            header, results = bruteForceGenericConeSearch(options.atlasCentresFile, [[ra, dec]], radius*3600.0, raIndex = 'ra', decIndex = 'dec')
            for r in results:
                exps = r.split()
                if options.checkmjd and not abs(float(exps[3]) - float(row['mjd'])) < float(options.mjdtolerance):
                    continue
                # The cone search appends the separation in arcsec after the
                # original columns; convert to degrees for the report.
                _reportMatch(options, row['name'], exps[0], distance=float(exps[atlasRowLen + 1]) / 3600.0)
def setup(options):
    """Read and return the input catalogue named by options.filename."""
    return readGenericDataFile(options.filename)
def ingestData(options, inputFiles, fkDict = None):
    """Ingest delimited-text or ZTF Avro files into the local Cassandra
    keyspace, optionally joining in foreign-key columns and attaching split
    HTM 10/13/16 name columns, then inserting in parallel.

    Args:
        options: parsed command-line options (configFile, tableDelimiter,
            table, columns, fkfrominputdata, fktablecols, skiphtm,
            racol/deccol, nprocesses).
        inputFiles: list of input file names to ingest.
        fkDict: optional {foreign key: row} lookup read once by the caller.
    """
    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['cassandra']['local']['username']
    password = config['cassandra']['local']['password']
    keyspace = config['cassandra']['local']['keyspace']
    hostname = config['cassandra']['local']['hostname']

    db = {'username': username,
          'password': password,
          'keyspace': keyspace,
          'hostname': hostname}

    # Timestamp used to tag this ingest run ('minute' avoids shadowing the
    # builtin min).
    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, minute, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, minute, sec)

    # The delimiter arrives shell-escaped; map the escaped forms back to the
    # real characters.
    delimiter = options.tableDelimiter
    if delimiter == '\\s':
        delimiter = ' '
    if delimiter == '\\t':
        delimiter = '\t'

    for inputFile in inputFiles:
        print("Ingesting %s" % inputFile)

        if '.gz' in inputFile:
            # It's probably gzipped.
            # NOTE(review): binary mode; assumes the downstream readers accept
            # a bytes stream — TODO confirm.
            f = gzip.open(inputFile, 'rb')
            print(type(f).__name__)
        else:
            f = inputFile

        if 'avro' in inputFile:
            # Data is in Avro packets, with schema. Hard-wired to the ZTF
            # schema for the time being.
            avroData = readZTFAvroPacket(f, addhtm16 = True)
            if 'noncandidates' in options.table:
                data = avroData['noncandidates']
            elif 'candidates' in options.table:
                data = avroData['candidates']
            else:
                print("Error. Incorrect table definition for Avro packets. Must contain candidates or noncandidates.")
                exit(1)
        else:
            # Plain text file. No schema present, so column types must be
            # provided by the caller's configuration.
            data = readGenericDataFile(f, delimiter=delimiter, useOrderedDict=True)

        # 2021-07-29 KWS Trim the data down to the specified columns if
        # requested (raises KeyError if a requested column is absent, as
        # before).
        if options.columns:
            requestedColumns = options.columns.split(',')
            data = [{key: row[key] for key in requestedColumns} for row in data]

        foreignKey = options.fkfrominputdata
        if foreignKey == 'filename':
            foreignKey = os.path.basename(inputFile).split('.')[0]

        if fkDict:
            # Attach the foreign-key columns to every row; if the key is not
            # in fkDict the row is deliberately left untouched.
            for i in range(len(data)):
                try:
                    if options.fktablecols:
                        # Just pick out the specified keys.
                        for k in options.fktablecols.split(','):
                            data[i][k] = fkDict[foreignKey][k]
                    else:
                        # Use all the keys by default.
                        for k, v in fkDict[foreignKey].items():
                            data[i][k] = v
                except KeyError:
                    pass

        if not options.skiphtm:
            coords = [[float(row[options.racol]), float(row[options.deccol])] for row in data]
            htm16Names = htmNameBulk(16, coords)
            # For Cassandra we split the HTM name across several columns, and
            # only need to compute the deepest level: the HTM 10 name is the
            # first 12 characters of the HTM 16 name, HTM 13 the next 3, and
            # the final 3 go in the HTM 16 column.
            # e.g. ra, dec = 288.70392, 9.99498:
            #   HTM 10 = N02323033011
            #   HTM 13 = N02323033011 211
            #   HTM 16 = N02323033011 211 311
            # This hierarchy also works in binary (IDs share prefixes), so we
            # should seriously reconsider how we currently use HTMs:
            #   HTM10 ID = 13349829    = 11 00 10 11 10 11 00 11 11 00 01 01
            #   HTM13 ID = 854389093   = ... 10 01 01
            #   HTM16 ID = 54680902005 = ... 10 01 01 11 01 01
            for i in range(len(data)):
                data[i]['htm10'] = htm16Names[i][0:12]
                data[i]['htm13'] = htm16Names[i][12:15]
                data[i]['htm16'] = htm16Names[i][15:18]

        nprocesses = int(options.nprocesses)

        if len(data) > 0:
            nProcessors, listChunks = splitList(data, bins = nprocesses, preserveOrder=True)

            print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerInsert, miscParameters = [options], drainQueues = False)
            print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
def doForcedPhotometry(options, objectList, perObjectExps):
    """Run the external tphorce forced-photometry tool for every exposure of
    every candidate and collect the parsed results.

    Args:
        options: parsed options; options.tphorce is the executable to run.
        objectList: candidate dicts, each with an 'id' key.
        perObjectExps: dict keyed by candidate id with 'exps' (exposure name
            list), 'avgRa' and 'avgDec'.

    Returns:
        List of per-exposure photometry dicts. Each dict gains id, expname,
        ra, dec, snrdet, snrlimit and limiting_mag keys; 'NaN' values are
        replaced with None; a leading '>' on mag marks a limiting magnitude.
    """
    fphot = []
    for candidate in objectList:
        #print candidate['id'], perObjectExps[candidate['id']]['avgRa'], perObjectExps[candidate['id']]['avgDec']
        print("Running forced photometry for candidate", candidate['id'])
        #tphorce [options] <diffPath> <tphotOutputPath> <raDeg> <decDeg> <snrLimit>
        for exp in perObjectExps[candidate['id']]['exps']:
            # Exposure names encode the camera (first 3 chars) and MJD (next 5),
            # e.g. '02a57604o0375c'.
            camera = exp[0:3]
            mjd = exp[3:8]
            aux = ''
            imageName = ATLAS_ROOT + '/diff/' + camera + '/' + mjd + '/' + exp + '.diff.fz'
#            if camera == '02a' and int(mjd) <= 57771:
#                camera = '02a.ORIG'
            # From MJD 57350 onwards the .tph files live in an AUX/ subdirectory.
            if int(mjd) >= 57350:
                aux = 'AUX/'
            tphName = ATLAS_ROOT + '/red/' + camera + '/' + mjd + '/' + aux + exp + '.tph'
            #print TPHORCE, imageName, tphName, perObjectExps[candidate['id']]['avgRa'], perObjectExps[candidate['id']]['avgDec'], '3.0'

            # 2022-07-21 KWS Added text=True to the Popen command. Ensures that the response comes back as text.
            p = subprocess.Popen([
                options.tphorce, imageName, tphName,
                str(perObjectExps[candidate['id']]['avgRa']),
                str(perObjectExps[candidate['id']]['avgDec']),
                str(FORCED_PHOT_DET_SNR),
                str(FORCED_PHOT_LIMIT_SNR)
            ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            output, errors = p.communicate()

            if output.strip():
                # tphorce emits a single CSV row on stdout.
                csvData = readGenericDataFile(io.StringIO(output), delimiter=',')
                # There should only be one CSV row.
                data = None
                try:
                    data = csvData[0]
                except IndexError as e:
                    print("ERROR: This is not a CSV file. Output = %s" % str(output).strip())

                if data:
                    data['limiting_mag'] = False
                    # {'mjd': '57604.56804193', 'magerr': 'NaN', 'filter': 'c', 'expname': '02a57604o0375c', 'snr': 'NaN', 'mag': 'NaN', 'zp': '22.160', 'id': 1012602110150218500L}
                    # A '>' prefix on the magnitude means this is a limit, not
                    # a detection.
                    if data['mag'] and '>' in data['mag']:
                        data['mag'] = data['mag'].replace('>', '')
                        data['limiting_mag'] = True
                    data['id'] = candidate['id']
                    data['expname'] = exp
                    data['ra'] = perObjectExps[candidate['id']]['avgRa']
                    data['dec'] = perObjectExps[candidate['id']]['avgDec']
                    data['snrdet'] = FORCED_PHOT_DET_SNR
                    data['snrlimit'] = FORCED_PHOT_LIMIT_SNR

                    # Clean up the data - replace 'NaN' with None
                    for k, v in data.items():
                        if v == 'NaN':
                            data[k] = None

                    fphot.append(data)

            if errors.strip():
                print(errors)

    return fphot
def main(argv=None):
    """Entry point: fetch lightcurve data from Cassandra for one coordinate
    or a file of coordinates, optionally sub-sampled and/or processed in
    parallel.
    """
    options = Struct(**cleanOptions(docopt(__doc__, version='0.1')))

    # Historical test coordinates (random star, ATLAS17nij, ATLAS20biio,
    # ATLAS20bbio, ATLAS18vre, ATLAS19bdbm, ATLAS20bbff, ATLAS20ymv — the
    # centre object of the 10 degree sweep — and the bright foreground star
    # ATLAS17lvn) are kept in version history.

    import yaml
    with open(options.configFile) as yaml_file:
        cfg = yaml.safe_load(yaml_file)

    local = cfg['cassandra']['local']
    db = {'username': local['username'],
          'password': local['password'],
          'keyspace': local['keyspace'],
          'hostname': local['hostname']}

    # Coordinates either come from a CSV file or directly as "ra,dec".
    if options.coordsfromfile:
        coordslist = readGenericDataFile(options.coords, delimiter=',')
    else:
        parts = options.coords.split(',')
        coordslist = [{'ra': parts[0], 'dec': parts[1]}]

    # Optionally sub-sample the coordinate list.
    if options.number and int(options.number) < len(coordslist):
        coordslist = random.sample(coordslist, int(options.number))

    if int(options.nprocesses) > 1 and len(coordslist) > 1:
        # Do it in parallel!
        stamp = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
        dateAndTime = "%s%s%s_%s%s%s" % tuple(stamp.split(':'))

        nProcessors, listChunks = splitList(coordslist, bins=int(options.nprocesses), preserveOrder=True)

        print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess(db, dateAndTime, nProcessors, listChunks, worker, miscParameters=[options], drainQueues=False)
        print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
    else:
        # Single-process path: open one Cassandra session and fetch directly.
        cluster = Cluster(db['hostname'])
        session = cluster.connect()
        session.row_factory = dict_factory
        session.set_keyspace(db['keyspace'])

        getLCData(options, session, coordslist)

        cluster.shutdown()