def SDSS_select(sql):
    '''pass an SQL query to SDSS and return a pandas dataframe
    in case of error, wait 10 seconds and try again; give up after 5 tries'''
    br = mechanize.Browser()
    br.set_handle_robots(False)
    tryCount = 0
    while True:
        tryCount += 1
        try:
            br.open('http://skyserver.sdss.org/dr13/en/tools/search/sql.aspx', timeout=4)
            br.select_form(name='sql')
            br['cmd'] = sql
            br['format'] = ['csv']
            response = br.submit()
            file_like = StringIO.StringIO(response.get_data())
            df = pd.read_csv(file_like, skiprows=1)
            break
        except (mechanize.URLError, mechanize.HTTPError, httplib.BadStatusLine, pd.parser.CParserError) as e:
            if tryCount > 5:
                message = 'Unable to connect to SkyServer; trying again in 10 min'
                logging.exception(message)
                print message
                raise fn.DataAccessError(message)
            logging.exception(e)
            time.sleep(10)
    return df
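
#Usage sketch for SDSS_select (illustrative only): the query below is a
#hypothetical example, not one taken from the pipeline, and assumes the
#module-level imports (mechanize, httplib, StringIO, pandas as pd, logging,
#time, fn) are in place.
def _example_SDSS_select():
    sql = '''SELECT TOP 10 p.objID, p.ra, p.dec, p.r
             FROM PhotoObj AS p
             WHERE p.ra BETWEEN 180.0 AND 180.1 AND p.dec BETWEEN 0.0 AND 0.1'''
    df = SDSS_select(sql)
    print df.head()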
def getWISE(entry):
    '''
    get IR data from AllWISE Source Catalog
    attempts to query IRSA 5 times; if the queries keep failing, abort
    returns the AllWISE match as a dict, or None if no suitable match is found
    '''
    ir_pos = coord.SkyCoord(entry['consensus']['ir_ra'], entry['consensus']['ir_dec'], unit=(u.deg, u.deg), frame='icrs')
    
    tryCount = 0
    while True: #in case of error, wait 10 sec and try again; give up after 5 tries
        tryCount += 1
        try:
            table = Irsa.query_region(ir_pos, catalog='allwise_p3as_psd', radius=3.*u.arcsec)
            break
        except (astroquery.exceptions.TimeoutError, astroquery.exceptions.TableParseError) as e:
            if tryCount > 5:
                message = 'Unable to connect to IRSA; trying again in 10 min'
                logging.exception(message)
                print message
                raise fn.DataAccessError(message)
            logging.exception(e)
            time.sleep(10)
        except Exception as e:
            #astroquery can also raise generic exceptions for failed or timed-out queries
            if 'Query failed' in str(e) or 'timed out' in str(e):
                if tryCount > 5:
                    message = 'Unable to connect to IRSA; trying again in 10 min'
                    logging.exception(message)
                    print message
                    raise fn.DataAccessError(message)
                logging.exception(e)
                time.sleep(10)
            else:
                raise
    
    if len(table):
        #keep the nearest source with W1 SNR > 5, counting how many sources qualify
        number_matches = 0
        if table[0]['w1snr'] > 5:
            match = table[0]
            dist = match['dist']
            number_matches += 1
        else:
            match = None
            dist = np.inf
        if len(table) > 1:
            for row in table:
                if row['dist'] < dist and row['w1snr'] > 5:
                    match = row
                    dist = match['dist']
                    number_matches += 1
        if match:
            wise_match = {'designation':'WISEA'+match['designation'], 'ra':match['ra'], 'dec':match['dec'], \
                          'number_matches':np.int16(number_matches), \
                          'w1mpro':match['w1mpro'], 'w1sigmpro':match['w1sigmpro'], 'w1snr':match['w1snr'], \
                          'w2mpro':match['w2mpro'], 'w2sigmpro':match['w2sigmpro'], 'w2snr':match['w2snr'], \
                          'w3mpro':match['w3mpro'], 'w3sigmpro':match['w3sigmpro'], 'w3snr':match['w3snr'], \
                          'w4mpro':match['w4mpro'], 'w4sigmpro':match['w4sigmpro'], 'w4snr':match['w4snr']}
        else:
            wise_match = None
    else:
        wise_match = None
    
    if wise_match:
        logging.info('AllWISE match found')
        #drop masked (missing) fields and convert numpy types to native Python for MongoDB
        for key in wise_match.keys():
            if wise_match[key] is np.ma.masked:
                wise_match.pop(key)
            elif wise_match[key] and type(wise_match[key]) is not str:
                wise_match[key] = wise_match[key].item()
            elif wise_match[key] == 0:
                #a numpy zero is falsy and skipped above; replace it with a native 0
                wise_match[key] = 0
    else:
        logging.info('No AllWISE match found')
    
    return wise_match
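
#Usage sketch for getWISE (illustrative only): the function expects an entry
#whose 'consensus' dict carries the IR position in decimal degrees; the
#coordinates below are made up for demonstration.
def _example_getWISE():
    entry = {'consensus':{'ir_ra':206.4613, 'ir_dec':23.5713}}
    wise_match = getWISE(entry)
    if wise_match:
        print 'Matched %s with %i candidate(s)' % (wise_match['designation'], wise_match['number_matches'])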
def RGZcatalog():
    
    #start timer
    starttime = time.time()
    
    #begin logging even if not run from command line
    logging.basicConfig(filename='{}/{}'.format(rgz_path,logfile), level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
    logging.captureWarnings(True)
    
    #connect to database of subjects
    subjects = db['radio_subjects']
    consensus = db['consensus{}'.format(version)]
    catalog = db['catalog{}'.format(version)] #this collection is being populated by this program
    if catalog.count():
        logging.info('Catalog contains entries; appending')
    else:
        catalog.create_index('catalog_id', unique=True)
    
    #get dictionary for finding the path to FITS files and WCS headers
    with open('%s/first_fits.txt' % rgz_path) as f:
        lines = f.readlines()
    
    pathdict = {}
    for l in lines:
        spl = l.split(' ')
        pathdict[spl[1].strip()] = '%s/rgz/raw_images/RGZ-full.%i/FIRST-IMGS/%s.fits' % (data_path, int(spl[0]), spl[1].strip())
    
    #count the number of entries from this run and resume the ID numbering from the last entry in the catalog
    count = 0
    if catalog.count() != 0:
        for entry in catalog.find().sort('catalog_id', -1).limit(1):
            IDnumber = entry['catalog_id']
    else:
        IDnumber = 0
    
    #find completed catalog entries so they can be skipped
    consensus_set = set()
    for source in consensus.find():
        consensus_set.add(source['zooniverse_id'])
    catalog_set = set()
    for entry in catalog.find():
        catalog_set.add(entry['zooniverse_id'])
    to_be_completed = consensus_set.difference(catalog_set)
    if os.path.exists(in_progress_file):
        with open(in_progress_file, 'r') as f:
            in_progress_zid = f.read()
        #wrap the single ID in a set; unioning the bare string would add its characters one by one
        to_be_completed = to_be_completed.union({in_progress_zid})
    to_be_completed = list(to_be_completed)
    
    #iterate through all non-completed subjects
    for subject in subjects.find({'zooniverse_id': {'$in':to_be_completed} }).batch_size(10):
    #for subject in subjects.find({'zooniverse_id': {'$in': ['ARG00000sl', 'ARG0003f9l']} }):
    #for subject in subjects.find({'zooniverse_id':'ARG00000sl'}): #sample subject with distinct sources
    #for subject in subjects.find({'zooniverse_id':'ARG0003f9l'}): #sample subject with multiple-component source
        
        #mark subject as being in-progress
        with open(in_progress_file, 'w') as f:
            f.write(subject['zooniverse_id'])
        
        #iterate through all consensus groupings
        for source in consensus.find({'zooniverse_id':subject['zooniverse_id'], 'first_id':{'$exists':True}}):
            
            #do not process if this object in this source is already in the catalog
            process = True
            for i in catalog.find({'zooniverse_id':subject['zooniverse_id']}):
                if i['consensus']['label'] == source['label']:
                    process = False
            
            if process:
                logging.info('Processing consensus object %s within subject field %s', source['label'], subject['zooniverse_id'])
                count += 1
                IDnumber += 1
                
                #display which entry is being processed to show how far the program has gotten
                print 'Processing entry %i (consensus %s in subject %s)' % (IDnumber, source['label'], subject['zooniverse_id'])
                entry = {'catalog_id':IDnumber, 'zooniverse_id':str(subject['zooniverse_id'])}
                
                #find location of FITS file; once non-FIRST sources are included, modify this
                fid = source['first_id']
                #if fid[0] == 'F':
                fits_loc = pathdict[fid]
                entry.update({'first_id':str(fid)})
                #else:
                #    raise RuntimeError('Not expecting non-FIRST data')
                #    fits_loc = '%s/rgz/raw_images/ATLAS/2x2/%s_radio.fits' % (data_path, fid)
                #    entry.update({'atlas_id':str(fid)})
                
                #find IR counterpart from consensus data, if present
                w = wcs.WCS(fits.getheader(fits_loc, 0)) #gets pixel-to-WCS conversion from header
                ir_coords = source['ir_peak']
                if ir_coords[0] == -99: #-99 means no IR counterpart was identified
                    ir_pos = None
                    wise_match = None
                    sdss_match = None
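                #NOTE: ir_peak is recorded on the 500x500 pixel image used by the
                #web interface, with the y-axis flipped relative to the FITS
                #convention; the branch below rescales the click to FITS pixels
                #and converts it to a sky position via the WCS header.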
                else:
                    #this only works for FIRST images; will need changing when ATLAS is added
                    p2w = w.wcs_pix2world
                    ir_ra_pixels = ir_coords[0]*w._naxis1/500.
                    ir_dec_pixels = 1 + w._naxis2 - ir_coords[1]*w._naxis2/500.
                    ir_peak = p2w(np.array([[ir_ra_pixels, ir_dec_pixels]]), 1)
                    ir_pos = coord.SkyCoord(ir_peak[0][0], ir_peak[0][1], unit=(u.deg,u.deg), frame='icrs')
                
                entry.update({'consensus':{'n_radio':source['n_votes'], 'n_total':source['n_total'], 'n_ir':source['n_ir'], 'ir_flag':source['ir_flag'], \
                                           'ir_level':source['ir_level'], 'radio_level':source['consensus_level'], 'label':source['label']}})
                if ir_pos:
                    logging.info('IR counterpart found')
                    entry['consensus'].update({'ir_ra':ir_pos.ra.deg, 'ir_dec':ir_pos.dec.deg})
                else:
                    logging.info('No IR counterpart found')
                
                #if an IR peak exists, search AllWISE and SDSS for counterparts
                if ir_pos:
                    wise_match = p.getWISE(entry)
                    if wise_match:
                        #attach a photometric redshift from the WISE cross-match table, if one exists
                        designation = wise_match['designation'][5:]
                        pz = db['wise_pz'].find_one({'wiseX':designation})
                        if pz is not None:
                            wise_match['photo_redshift'] = pz['zPhoto_Corr']
                        entry.update({'AllWISE':wise_match})
                    '''tryCount = 0
                    while(True):
                        tryCount += 1
                        try:
                            sdss_match = p.getSDSS(entry)
                            if sdss_match:
                                entry.update({'SDSS':sdss_match})
                            break
                        except KeyError as e:
                            if tryCount > 5:
                                message = 'Bad response from SkyServer; trying again in 10 min'
                                output(message, logging.exception)
                                raise fn.DataAccessError(message)
                            elif e.message == 'ra': #unable to reproduce; no error when I try again, so let's just do that
                                logging.exception(e)
                                time.sleep(10)
                            else:
                                raise e'''
                    sdss_match = None #SDSS querying is currently disabled; see the block commented out above
                
                #try block attempts to read JSON from web; if it exists, calculate data
                try:
                    link = subject['location']['contours'] #gets url as Unicode string
                    
                    # Use local file if available
                    jsonfile = link.split("/")[-1]
                    jsonfile_path = "{0}/rgz/contours/{1}".format(data_path,jsonfile)
                    if os.path.exists(jsonfile_path):
                        with open(jsonfile_path,'r') as jf:
                            data = json.load(jf)
                    # Otherwise, read from web
                    else:
                        # Reform weblink to point to the direct S3 URL, which will work even with older SSLv3
                        link_s3 = "http://zooniverse-static.s3.amazonaws.com/"+link.split('http://')[-1]
                        
                        tryCount = 0
                        while(True): #in case of error, wait 10 sec and try again; give up after 5 tries
                            tryCount += 1
                            try:
                                compressed = urllib2.urlopen(str(link_s3)).read() #reads contents of url to str
                                break
                            except (urllib2.URLError, urllib2.HTTPError) as e:
                                if tryCount > 5:
                                    message = 'Unable to connect to Amazon Web Services; trying again in 10 min'
                                    output(message, logging.exception)
                                    raise fn.DataAccessError(message)
                                logging.exception(e)
                                time.sleep(10)
                        
                        tempfile = StringIO.StringIO(compressed) #temporarily stores contents as file (emptied after unzipping)
                        uncompressed = gzip.GzipFile(fileobj=tempfile, mode='r').read() #unzips contents to str
                        data = json.loads(uncompressed) #loads JSON object
                    
                    radio_data = p.getRadio(data, fits_loc, source)
                    entry.update(radio_data)
                    
                    #check if a component is straddling the edge of the image
                    entry.update({'overedge':0})
                    source_bbox = np.array(source['bbox'])
                    for c in data['contours']:
                        bbox = np.array(c[0]['bbox'])
                        if bbox in source_bbox: #only check contours belonging to this source
                            vertices = []
                            for pos in c[0]['arr']:
                                vertices.append([pos['x'], pos['y']])
                            vertices = np.array(vertices)
                            #an unclosed contour whose first vertex lies near the image edge is flagged
                            diff = vertices[0] - vertices[-1]
                            if np.sqrt(diff[0]**2 + diff[1]**2) > 1 and (np.any(vertices[0] <= 4) or np.any(vertices[0] >= 128)):
                                entry.update({'overedge':1})
                                break
                    
                    #use WISE catalog name if available
                    if wise_match:
                        entry.update({'rgz_name':'RGZ{}{}'.format(wise_match['designation'][5:14], wise_match['designation'][15:22])})
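                    #otherwise, construct an IAU-style J2000 name of the form
                    #RGZJhhmmss.s+ddmmss from the best available position:
                    #the consensus IR peak if one exists, else the radio centroid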
                    else:
                        #if not, try consensus IR position
                        if ir_pos:
                            ra = ir_pos.ra.deg
                            dec = ir_pos.dec.deg
                        #finally, just use radio center
                        else:
                            ra = radio_data['radio']['ra']
                            dec = radio_data['radio']['dec']
                        ra_h = int(ra/15.)
                        ra_m = int((ra - ra_h*15)*4)
                        ra_s = (ra - ra_h*15 - ra_m/4.)*240
                        dec_d = int(dec)
                        dec_m = int((dec - dec_d)*60)
                        dec_s = int((dec - dec_d - dec_m/60.)*3600)
                        entry.update({'rgz_name':'RGZJ{:0=2}{:0=2}{:0=4.1f}{:0=+3}{:0=2}{:0=2}'.format(ra_h, ra_m, ra_s, dec_d, dec_m, dec_s)})
                    
                    #calculate physical parameters using redshift from SDSS
                    if sdss_match:
                        z = 0
                        if 'spec_redshift' in sdss_match:
                            z = sdss_match['spec_redshift']
                        elif 'photo_redshift' in sdss_match:
                            z = sdss_match['photo_redshift']
                        if z > 0:
                            physical = p.getPhysical(z, radio_data)
                            entry['radio'].update(physical)
                    
                    logging.info('Radio data added')
                
                #if the link doesn't have a JSON, no data can be determined
                except urllib2.HTTPError as e:
                    if e.code == 404:
                        logging.info('No radio JSON detected')
                    else:
                        logging.exception(e)
                        raise
                
                catalog.insert(entry)
                find_duplicates(entry['zooniverse_id'])
                logging.info('Entry %i added to catalog', IDnumber)
        
        #subject is complete; clear the in-progress marker
        with open(in_progress_file, 'w') as f:
            f.write('')
    
    #end timer
    endtime = time.time()
    output('Time taken: %f' % (endtime-starttime))
    
    return count
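
#Minimal driver sketch (an assumption, not part of the original module): run the
#catalog build when the file is executed directly. This relies on the module-level
#configuration (rgz_path, data_path, db, version, logfile, in_progress_file) being
#defined elsewhere in this pipeline.
if __name__ == '__main__':
    new_entries = RGZcatalog()
    print '%i new entries added to the catalog' % new_entries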