def getPS1TrainingSetCutouts(opts):
    """Fetch the PS1 training-set images and write the good/bad stamp files.

    opts may be a plain dict (docopt style) or an options object; dicts are
    wrapped in Struct for compatibility with the old optparse code.

    Returns 1 if the database connection fails, otherwise None.
    """
    if type(opts) is dict:
        options = Struct(**opts)
    else:
        options = opts

    import yaml
    with open(options.configFile) as yaml_file:
        # safe_load: the config is plain data, and yaml.load without an
        # explicit Loader is deprecated and unsafe on untrusted input.
        config = yaml.safe_load(yaml_file)

    # Make sure the stamp destination directory exists.
    stampLocation = options.stampLocation
    if not os.path.exists(stampLocation):
        os.makedirs(stampLocation)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    conn = dbConnect(hostname, username, password, database)
    if not conn:
        print("Cannot connect to the database")
        return 1

    images = getTrainingSetImages(conn)
    writePS1GoodBadFiles(options.stampLocation, images)

    conn.close()
def runKerasTensorflowClassifierMultiprocess(opts):
    """Score transients with the Keras/Tensorflow classifier, fanning the
    object list out over multiple worker processes.

    opts may be a plain dict (docopt style) or an options object; dicts are
    wrapped in Struct for compatibility with the old optparse code.

    Returns 1 on a bad database connection or an out-of-range list id;
    exits via sys.exit if the list id is not an integer.
    """
    if type(opts) is dict:
        options = Struct(**opts)
    else:
        options = opts

    import yaml
    with open(options.configFile) as yaml_file:
        # safe_load: config is plain data; yaml.load without a Loader is
        # deprecated and unsafe.
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    db = []

    conn = dbConnect(hostname, username, password, database)
    if not conn:
        print("Cannot connect to the database")
        return 1

    # If the list isn't specified assume it's the Eyeball List.
    if options.listid is not None:
        try:
            detectionList = int(options.listid)
            if detectionList < 0 or detectionList > 8:
                print("Detection list must be between 0 and 8")
                return 1
        except ValueError as e:
            sys.exit("Detection list must be an integer")

    # 2018-07-31 KWS We have PS1 data. Don't bother with the HKO/MLO ATLAS data.
    ps1Data = False
    if options.ps1classifier:
        ps1Data = True

    objectList = []

    # If candidates are specified in the options, then override the list.
    if len(options.candidate) > 0:
        objectList = [{'id': int(candidate)} for candidate in options.candidate]
    else:
        objectList = getObjectsByList(conn, database, listId=int(options.listid), ps1Data=ps1Data)

    # 2019-06-07 KWS For reasons not entirely clear, Tensorflow seems to
    #                exhaust every last bit of CPU and memory, so split any
    #                list larger than 100 objects into 16 sublists that are
    #                processed one after the other.
    if len(objectList) > 100:
        # splitList returns (bin count, sublists); renamed from 'bin', which
        # shadowed the builtin. The count itself is unused here.
        nBins, subLists = splitList(objectList, bins=16)
    else:
        subLists = [objectList]

    for subList in subLists:
        currentDate = datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
        (year, month, day, hour, minute, second) = currentDate.split(':')
        dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, minute, second)

        objectsForUpdate = []
        if len(objectList) > 0:
            # 2019-08-24 KWS Hard-wire the number of workers.
            nProcessors, listChunks = splitList(subList, bins=28)

            print("%s Parallel Processing..." % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            objectsForUpdate = parallelProcess(db, dateAndTime, nProcessors, listChunks, worker, miscParameters=[options])
            print("%s Done Parallel Processing" % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            print("TOTAL OBJECTS TO UPDATE = %d" % len(objectsForUpdate))

        # Sort the combined list by score.
        objectsForUpdate = sorted(objectsForUpdate, key=lambda x: x[1])

        # NOTE(review): the CSV is opened in 'w' mode on every sublist, so a
        # run with more than one sublist keeps only the last one's rows -
        # confirm whether append mode was intended.
        if options.outputcsv is not None:
            with open(options.outputcsv, 'w') as f:
                for row in objectsForUpdate:
                    print(row[0], row[1])
                    f.write('%s,%f\n' % (row[0], row[1]))

        if options.update:
            for row in objectsForUpdate:
                updateTransientRBValue(conn, row[0], row[1], ps1Data=ps1Data)

    conn.close()
def runKerasTensorflowClassifier(opts, processNumber=None):
    """Score transients with the Keras/Tensorflow real-bogus classifier.

    opts may be a plain dict (docopt style) or an options object; dicts are
    wrapped in Struct for compatibility with the old optparse code.
    processNumber, when not None, indicates we are a worker in multiprocess
    mode: the object list is then expected via options.candidate, the output
    CSV gets a per-process suffix, and database updates are skipped (the
    parent process applies them to minimise table locks).

    Returns a list of (objectId, score) tuples sorted by ascending score,
    1 on a bad database connection, or [] if there are no images to score.
    """
    if type(opts) is dict:
        options = Struct(**opts)
    else:
        options = opts

    import yaml
    with open(options.configFile) as yaml_file:
        # safe_load: config is plain data; yaml.load without a Loader is
        # deprecated and unsafe.
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    conn = dbConnect(hostname, username, password, database)
    if not conn:
        print("Cannot connect to the database")
        return 1

    # 2018-07-31 KWS We have PS1 data. Don't bother with the HKO/MLO ATLAS data.
    ps1Data = False
    if options.ps1classifier:
        ps1Data = True

    if options.listid is not None:
        try:
            detectionList = int(options.listid)
            if detectionList < 0 or detectionList > 8:
                print("Detection list must be between 0 and 8")
                return 1
        except ValueError as e:
            sys.exit("Detection list must be an integer")

    objectList = []
    imageFilenames = []

    # If candidates are specified in the options, then override the list.
    if len(options.candidate) > 0:
        objectList = [{'id': int(candidate)} for candidate in options.candidate]
    else:
        # Only collect by the list ID if we are running in single threaded mode.
        if processNumber is None:
            objectList = getObjectsByList(conn, database, listId=int(options.listid), ps1Data=ps1Data)

    if len(objectList) > 0:
        imageFilenames = getImages(conn, database, objectList, imageRoot=options.imageroot)

    if len(imageFilenames) == 0:
        print("NO IMAGES")
        conn.close()
        return []

    if ps1Data:
        objectDictPS1 = getRBValues([f['filename'] for f in imageFilenames], options.ps1classifier, extension=1)

        objectScores = defaultdict(dict)
        for k, v in list(objectDictPS1.items()):
            objectScores[k]['ps1'] = np.array(v)

        # 'objectId' rather than 'object', which shadows the builtin.
        finalScores = {}
        for objectId in list(objectScores.keys()):
            finalScores[objectId] = np.median(objectScores[objectId]['ps1'])
    else:
        # Split the images into HKO ('02a') and MLO ('01a') data so we can
        # apply the HKO and MLO machines separately. (Single pass over the
        # filenames instead of the original two.)
        hkoFilenames = []
        mloFilenames = []
        for row in imageFilenames:
            if '02a' in row['filename']:
                hkoFilenames.append(row['filename'])
            if '01a' in row['filename']:
                mloFilenames.append(row['filename'])

        objectDictHKO = getRBValues(hkoFilenames, options.hkoclassifier)
        objectDictMLO = getRBValues(mloFilenames, options.mloclassifier)

        # Now we have two dictionaries. Combine them.
        objectScores = defaultdict(dict)
        for k, v in list(objectDictHKO.items()):
            objectScores[k]['hko'] = np.array(v)
        for k, v in list(objectDictMLO.items()):
            objectScores[k]['mlo'] = np.array(v)

        # Some objects will have data from two telescopes, some only one.
        # If we have data from two telescopes, choose the median value of the
        # longest length list; HKO wins ties. Otherwise use whichever we have.
        finalScores = {}
        for objectId in list(objectScores.keys()):
            scoresForObject = objectScores[objectId]
            if len(scoresForObject) > 1:
                hkoLen = len(scoresForObject['hko'])
                mloLen = len(scoresForObject['mlo'])
                if mloLen > hkoLen:
                    finalScores[objectId] = np.median(scoresForObject['mlo'])
                else:
                    # Only if MLO is larger than HKO, use MLO. Otherwise use HKO.
                    finalScores[objectId] = np.median(scoresForObject['hko'])
            else:
                try:
                    finalScores[objectId] = np.median(scoresForObject['hko'])
                except KeyError as e:
                    finalScores[objectId] = np.median(scoresForObject['mlo'])

    # Order by ascending score.
    finalScoresSorted = OrderedDict(sorted(list(finalScores.items()), key=lambda t: t[1]))

    if options.outputcsv is not None:
        prefix = options.outputcsv.split('.')[0]
        suffix = options.outputcsv.split('.')[-1]
        if suffix == prefix:
            # No '.' in the filename, hence no extension to re-attach.
            suffix = ''
        if suffix:
            suffix = '.' + suffix

        # In multiprocess mode each worker writes its own numbered file.
        processSuffix = ''
        if processNumber is not None:
            processSuffix = '_%02d' % processNumber

        # Write one 'objectId,score' row per object, worst score first.
        with open('%s%s%s' % (prefix, processSuffix, suffix), 'w') as f:
            for k, v in list(finalScoresSorted.items()):
                print(k, finalScoresSorted[k])
                f.write('%s,%f\n' % (k, finalScoresSorted[k]))

    scores = list(finalScoresSorted.items())
    if options.update and processNumber is None:
        # Only allow database updates in single threaded mode. Otherwise multithreaded code
        # does the updates at the end of processing. (Minimises table locks.)
        for row in scores:
            updateTransientRBValue(conn, row[0], row[1], ps1Data=ps1Data)

    conn.close()
    return scores
def getATLASTrainingSetCutouts(opts):
    """Generate ATLAS training-set cutouts for the given MJDs.

    Known asteroids (pkn=900) become the 'good' examples and junk detections
    the 'bad' ones. Per-exposure coordinate files are written to the stamp
    location, then stampstorm workers cut the stamps in parallel, and finally
    the good/bad index files are produced.

    opts may be a plain dict (docopt style) or an options object; dicts are
    wrapped in Struct for compatibility with the old optparse code.

    Returns 1 when no MJDs are given or the database connection fails.
    """
    if type(opts) is dict:
        options = Struct(**opts)
    else:
        options = opts

    import yaml
    with open(options.configFile) as yaml_file:
        # safe_load: config is plain data; yaml.load without a Loader is
        # deprecated and unsafe.
        config = yaml.safe_load(yaml_file)

    stampSize = int(options.stampSize)

    mjds = options.mjds
    if not mjds:
        print("No MJDs specified")
        return 1

    # downloadThreads is currently unused below; the int() conversion still
    # validates the option value.
    downloadThreads = int(options.downloadthreads)
    stampThreads = int(options.stampThreads)

    stampLocation = options.stampLocation
    if not os.path.exists(stampLocation):
        os.makedirs(stampLocation)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    conn = dbConnect(hostname, username, password, database)
    if not conn:
        print("Cannot connect to the database")
        return 1

    currentDate = datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, minute, second) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, minute, second)

    # Group the known-asteroid detections by exposure.
    asteroidExpsDict = defaultdict(list)
    for mjd in mjds:
        asteroidExps = getKnownAsteroids(conn, options.camera, int(mjd), pkn=900)
        for exp in asteroidExps:
            asteroidExpsDict[exp['obs']].append(exp)

    # Now create the files. We need to have x, y as the first two items.
    # m.obs, d.x, d.y, d.mag, d.dmag, d.ra, d.dec
    header = "x,y,mag,dmag,ra,dec,obs".split(',')

    exposureList = []
    for k, v in asteroidExpsDict.items():
        exposureList.append(k)
        with open(stampLocation + '/' + 'good' + k + '.txt', 'w') as csvfile:
            w = csv.DictWriter(csvfile, fieldnames=header, delimiter=' ')
            # w.writeheader()
            for row in v:
                w.writerow(row)

    # So now let stampstorm do its stuff.
    if len(exposureList) > 0:
        nProcessors, listChunks = splitList(exposureList, bins=stampThreads)
        print("%s Parallel Processing Good objects..." % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess([], dateAndTime, nProcessors, listChunks, workerStampStorm, miscParameters=[stampSize, stampLocation, 'good'], drainQueues=False)
        print("%s Done Parallel Processing" % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

    # Same again for the junk ('bad') detections.
    junkExpsDict = defaultdict(list)
    for mjd in mjds:
        junkExps = getJunk(conn, options.camera, int(mjd))
        for exp in junkExps:
            junkExpsDict[exp['obs']].append(exp)

    exposureList = []
    for k, v in junkExpsDict.items():
        exposureList.append(k)
        with open(stampLocation + '/' + 'bad' + k + '.txt', 'w') as csvfile:
            w = csv.DictWriter(csvfile, fieldnames=header, delimiter=' ')
            # w.writeheader()
            for row in v:
                w.writerow(row)

    if len(exposureList) > 0:
        nProcessors, listChunks = splitList(exposureList, bins=stampThreads)
        print("%s Parallel Processing Bad objects..." % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess([], dateAndTime, nProcessors, listChunks, workerStampStorm, miscParameters=[stampSize, stampLocation, 'bad'], drainQueues=False)
        print("%s Done Parallel Processing" % (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

    conn.close()

    getGoodBadFiles(stampLocation)
def main(argv=None):
    """Build AVRO alerts either from a JSON file or from ATLAS detections
    read from the database, optionally writing them to /tmp and reading
    them back for verification.

    Returns 0 on success, 1 if the database connection fails.
    """
    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)

    # Use utils.Struct to convert the dict into an object for compatibility with old optparse code.
    options = Struct(**opts)

    json_path = options.data
    schema_files = options.schema
    cutoutsci_path = options.cutoutSci
    cutouttemp_path = options.cutoutTemp
    cutoutdiff_path = options.cutoutDiff
    mjdThreshold = float(options.mjdThreshold)

    alert_schema = combine_schemas(schema_files)

    import yaml
    with open(options.configfile) as yaml_file:
        # safe_load: config is plain data; yaml.load without a Loader is
        # deprecated and unsafe.
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    # Now that we have all the data, we need to construct a properly formed alert - FOR EACH ROW.
    # NOTE - To get this to work, feed it junk test.json data.
    # The alerts can be written to a file - but don't forget, we are using the schemaless writer
    # which means that the schemaless reader MUST be used to read the data!
    alert = None

    # If we just have some json data, process it. Otherwise read from the database.
    if json_path:
        with open(json_path) as file_text:
            json_data = json.load(file_text)
        avro_bytes = write_avro_data(json_data, alert_schema)

        # NOTE(review): the cutouts below are attached to json_data AFTER the
        # message has already been serialised, so they never reach avro_bytes.
        # Confirm whether write_avro_data should run after these loads.

        # Load science stamp if included.
        if cutoutsci_path is not None:
            cutoutTemplate = load_stamp(cutoutsci_path)
            json_data['cutoutScience'] = cutoutTemplate

        # Load template stamp if included.
        if cutouttemp_path is not None:
            cutoutTemplate = load_stamp(cutouttemp_path)
            json_data['cutoutTemplate'] = cutoutTemplate

        # Load difference stamp if included.
        if cutoutdiff_path is not None:
            cutoutDifference = load_stamp(cutoutdiff_path)
            json_data['cutoutDifference'] = cutoutDifference

        if options.writeFile:
            # NOTE - This code writes a schemaless message. To read it we need to pass the schema
            # to the reader. How we pass this message to Kafka is the next problem to be resolved.
            with open('/tmp/alert.avro', 'wb') as f:
                avro_bytes.seek(0)
                f.write(avro_bytes.read())

        # m = read_avro_data_from_file('alert.avro', alert_schema)
        m = read_avro_data(avro_bytes, alert_schema)

        if options.readMessage:
            if m:
                # Print message text to screen, minus the bulky cutouts.
                message_text = {k: m[k] for k in m if k not in ['cutoutScience', 'cutoutDifference', 'cutoutTemplate']}
                print(message_text)

                # Collect stamps as files written to local directory 'output' and check hashes
                # match expected. (Bug fix: the original referenced the undefined
                # names 'message' and 'args' here; they are 'm' and 'options'.)
                if m.get('cutoutScience') is not None:
                    stamp_temp_out = write_stamp_file(m.get('cutoutScience'), 'output')
                    print('Science stamp ok:', check_md5(options.cutoutSci, stamp_temp_out))
                if m.get('cutoutTemplate') is not None:
                    stamp_temp_out = write_stamp_file(m.get('cutoutTemplate'), 'output')
                    print('Template stamp ok:', check_md5(options.cutoutTemp, stamp_temp_out))
                if m.get('cutoutDifference') is not None:
                    stamp_diff_out = write_stamp_file(m.get('cutoutDifference'), 'output')

                # (The original printed these sizes twice back to back; once is enough.)
                print("size in bytes of json text: %d" % sys.getsizeof(message_text))
                raw_bytes = avro_bytes.getvalue()
                print("size in bytes of avro message: %d" % sys.getsizeof(raw_bytes))

        return 0

    conn = dbConnect(hostname, username, password, database)
    if not conn:
        print("Cannot connect to the database")
        return 1

    # Connect to the database and read out the ATLAS detections.
    records = getATLASIngestedDetections(conn, mjdThreshold)
    conn.close()

    alerts = []
    if options.writeFile and options.bulkMessage:
        # Bulk mode: one file containing every alert, written with the schema.
        for row in records:
            alert = {'alertId': row['db_det_id'], 'atlas_object_id': row['atlas_object_id'], 'candidate': row}
            alerts.append(alert)
        write_avro_data_to_file_with_schema('/tmp/alerts_bulk.avro', alert_schema, alerts)
        return

    for row in records:
        alert = {'alertId': row['db_det_id'], 'atlas_object_id': row['atlas_object_id'], 'candidate': row}
        avro_bytes = write_avro_data(alert, alert_schema)

        if options.readMessage:
            # m = read_avro_data_from_file('alert.avro', alert_schema)
            m = read_avro_data(avro_bytes, alert_schema)
            if m:
                # Print message text to screen, minus the bulky cutouts.
                message_text = {k: m[k] for k in m if k not in ['cutoutScience', 'cutoutDifference', 'cutoutTemplate']}
                print(message_text)

                # Collect stamps as files written to local directory 'output' and check hashes
                # match expected. (Same 'message'/'args' NameError fix as above.)
                if m.get('cutoutScience') is not None:
                    stamp_temp_out = write_stamp_file(m.get('cutoutScience'), 'output')
                    print('Science stamp ok:', check_md5(options.cutoutSci, stamp_temp_out))
                if m.get('cutoutTemplate') is not None:
                    stamp_temp_out = write_stamp_file(m.get('cutoutTemplate'), 'output')
                    print('Template stamp ok:', check_md5(options.cutoutTemp, stamp_temp_out))
                if m.get('cutoutDifference') is not None:
                    stamp_diff_out = write_stamp_file(m.get('cutoutDifference'), 'output')

                print("size in bytes of json text: %d" % sys.getsizeof(message_text))
                raw_bytes = avro_bytes.getvalue()
                print("size in bytes of avro message: %d" % sys.getsizeof(raw_bytes))

        if options.writeFile and not options.bulkMessage:
            # Context manager so the per-alert file handle is not leaked
            # (the original used a bare open()/close() pair).
            with open('/tmp/alert_%s.avro' % row['db_det_id'], 'wb') as f:
                avro_bytes.seek(0)
                f.write(avro_bytes.read())

    return 0
def getTNSData(opts):
    """Poll the Transient Name Server and sync new or updated objects into
    the local 'ztf' database, crossmatching each newly added object.

    opts may be a plain dict (docopt style) or an options object; dicts are
    wrapped in Struct for compatibility with the old optparse code.

    Returns 1 if the database connection fails; exits via sys.exit if a TNS
    row arrives without a 'Name' column.
    """
    if type(opts) is dict:
        options = Struct(**opts)
    else:
        options = opts

    # Credentials come from the settings module rather than the YAML config
    # used elsewhere in this file; the database name is hard-wired to 'ztf'.
    username = settings.DB_USER_WRITE
    password = settings.DB_PASS_WRITE
    hostname = settings.DB_HOST
    database = 'ztf'

    conn = dbConnect(hostname, username, password, database)
    if not conn:
        print("ERROR in services/TNS/poll_tns: Cannot connect to the database\n")
        return 1

    radius = 3.0  # arcseconds from crossmatch
    if options.radius:
        radius = float(options.radius)

    inLastNumberOfDays = None
    if options.inLastNumberOfDays:
        inLastNumberOfDays = int(options.inLastNumberOfDays)

    status_code, content = pollTNS(page=int(options.pageNumber), resultSize=int(options.pageSize), inLastNumberDays=inLastNumberOfDays)

    data = csv.DictReader(content.splitlines(), delimiter=',')

    rowsAdded = 0
    rowsChanged = 0
    for row in data:
        if 'Name' in row:
            name = row['Name'].strip().split()
        else:
            # A row without a Name column means the feed is broken - bail out.
            print(row)
            sys.exit()

        if len(name) != 2:
            # No prefix supplied - assume a supernova.
            prefix = 'SN'
            suffix = name[0]
        else:
            prefix = name[0]
            suffix = name[1]

        if row['Discovery Date (UT)'].strip() == '0000-00-00 00:00:00':
            # Set the discovery date to January of the suffix name.
            discoveryDate = '%s-01-01 00:00:00' % suffix[0:4]
            row['Discovery Date (UT)'] = discoveryDate

        row['prefix'] = prefix
        row['suffix'] = suffix

        ra, dec = coords_sex_to_dec(row['RA'], row['DEC'])
        if ra == 0 and dec == 0:
            print("in services/TNS/poll_tns: Cannot store record for %s. No coordinates provided!\n" % row['Name'].strip())
            continue
        row['ra'] = ra
        row['dec'] = dec

        htm16 = htmCircle.htmID(16, ra, dec)
        row['htm16'] = htm16

        tnsEntry = getTNSRow(conn, suffix)
        if tnsEntry:
            if tnsEntry['tns_prefix'] != prefix:
                # The entry has been updated on TNS - classified! Otherwise do nothing!
                deleteTNSRow(conn, suffix)
                insertTNS(conn, row)
                print("Object %s has been updated\n" % row['suffix'])
                rowsChanged += 1
        else:
            insertTNS(conn, row)
            print("Object %s has been added\n" % row['suffix'])
            run_tns_crossmatch.tns_name_crossmatch(conn, row['suffix'], ra, dec, radius)
            rowsAdded += 1

    print("Total rows added = %d, modified = %d\n" % (rowsAdded, rowsChanged))
    conn.commit()
    conn.close()