def main(argv = None):
    """Entry point: parse command-line options, optionally preload the
    foreign-key table once, and hand everything to the multiprocess ingester.

    Args:
        argv: unused; options come from docopt/__doc__.
    """
    parsed = cleanOptions(docopt(__doc__, version='0.1'))
    # Use utils.Struct to convert the dict into an object for compatibility
    # with old optparse code.
    options = Struct(**parsed)

    fkDict = {}
    # If we have a foreign key table, read the data once only and pass the
    # lookup to the subprocesses (keyed on the configured fkfield column).
    if options.fktable:
        fkDict = {entry[options.fkfield]: entry
                  for entry in readGenericDataFile(options.fktable, delimiter='\t')}

    ingestDataMultiprocess(options, fkDict = fkDict)
def ingestData(options, inputFiles):
    """Ingest CSV detection files (optionally gzipped) into the local MySQL
    database, attaching HTM level 10/13/16 IDs to every row and inserting in
    parallel.

    Args:
        options: parsed command-line options (needs configFile, nprocesses).
        inputFiles: list of input file names to ingest.

    Fixes over the previous revision:
      * yaml.load() without a Loader is deprecated/unsafe -> yaml.safe_load()
        (consistent with the other ingesters in this file).
      * The RA/Dec scratch file was opened 'wb' but written with str data,
        which raises TypeError on Python 3 -> open in text mode.
      * '.gz' (not 'gz') is used to detect gzipped input, so e.g.
        'gzfile.csv' is not misdetected.
    """
    generateHtmidBulk = which('generate_htmid_bulk')
    if generateHtmidBulk is None:
        sys.stderr.write("Can't find the generate_htmid_bulk executable, so cannot continue.\n")
        exit(1)

    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    db = {'username': username,
          'password': password,
          'database': database,
          'hostname': hostname}

    # Timestamp used to tag this ingest run ('minute' avoids shadowing the
    # builtin min).
    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, minute, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, minute, sec)

    for inputFile in inputFiles:
        print("Ingesting %s" % inputFile)

        if '.gz' in inputFile:
            # It's probably gzipped.
            # NOTE(review): opened in binary mode; assumes readGenericDataFile
            # copes with a bytes stream — TODO confirm ('rt' may be safer).
            f = gzip.open(inputFile, 'rb')
            print(type(f).__name__)
        else:
            f = inputFile

        data = readGenericDataFile(f, delimiter=',', useOrderedDict=True)

        # Per-process scratch file so parallel invocations don't collide.
        pid = os.getpid()
        tempRADecFile = '/tmp/' + os.path.basename(inputFile) + 'radec_' + str(pid)

        # Text mode: we write formatted strings, not bytes.
        with open(tempRADecFile, 'w') as radecFile:
            for row in data:
                radecFile.write('%s %s\n' % (row['ra'], row['dec']))

        # One external call per HTM depth; each returns one ID per input row.
        htm10IDs = calculate_htm_ids_bulk(generateHtmidBulk, 10, tempRADecFile)
        htm13IDs = calculate_htm_ids_bulk(generateHtmidBulk, 13, tempRADecFile)
        htm16IDs = calculate_htm_ids_bulk(generateHtmidBulk, 16, tempRADecFile)

        os.remove(tempRADecFile)

        # Add the HTM IDs to the data.
        for i in range(len(data)):
            data[i]['htm10ID'] = htm10IDs[i]
            data[i]['htm13ID'] = htm13IDs[i]
            data[i]['htm16ID'] = htm16IDs[i]

        nprocesses = int(options.nprocesses)

        if len(data) > 0:
            nProcessors, listChunks = splitList(data, bins=nprocesses, preserveOrder=True)

            print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerInsert, miscParameters=[options], drainQueues=False)
            print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
def _rowCoords(row):
    # Parse a row's coordinates: decimal degrees if possible, otherwise
    # fall back to sexagesimal conversion.
    try:
        return float(row['ra']), float(row['dec'])
    except ValueError:
        return coords_sex_to_dec(row['ra'], row['dec'])


def _reportMatch(options, name, expname, distance = None):
    # Print one matched exposure: the reduced-image path when --red is set
    # and the exposure name parses, otherwise the bare exposure name.
    # distance (degrees), when given, is appended formatted to 2 dp.
    matches = doRegexMatch(expname)
    if matches and options.red:
        target = '/atlas/red/' + matches['camera'] + '/' + matches['mjd'] + '/' + expname + '.fits.fz'
    else:
        target = expname
    if distance is None:
        print(name, target)
    else:
        print(name, target, "%.2f" % distance)


def main(argv = None):
    """Entry point: report which ATLAS exposures cover each input coordinate.

    Two modes: --footprints tests each coordinate against every exposure
    centre's footprint; otherwise a brute-force cone search is run against
    the exposure-centre file. --checkmjd additionally requires the exposure
    MJD to be within --mjdtolerance of the input row's MJD.

    (Refactor: the regex/red-path/print logic was previously duplicated four
    times, with a dead `red = ''` assignment; it now lives in _reportMatch.)
    """
    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)
    # Use utils.Struct to convert the dict into an object for compatibility
    # with old optparse code.
    options = Struct(**opts)

    atlasCentres = readGenericDataFile(options.atlasCentresFile, delimiter='\t')
    atlasRowLen = len(atlasCentres[0].keys())
    inputCoords = readGenericDataFile(options.inputCoordsFile, delimiter=',')

    # Default search radius (degrees) unless a valid override is supplied.
    radius = 3.86
    try:
        radius = float(options.searchradius)
    except ValueError:
        pass

    if options.footprints:
        for row in inputCoords:
            if options.debug:
                print(row)
            ra, dec = _rowCoords(row)
            for r in atlasCentres:
                if not isObjectInsideATLASFootprint(ra, dec, float(r['ra']), float(r['dec'])):
                    continue
                if options.checkmjd and not abs(float(r['mjd']) - float(row['mjd'])) < float(options.mjdtolerance):
                    continue
                _reportMatch(options, row['name'], r['expname'])
    else:
        for row in inputCoords:
            if options.debug:
                print(row)
            ra, dec = _rowCoords(row)
            header, results = bruteForceGenericConeSearch(options.atlasCentresFile, [[ra, dec]], radius*3600.0, raIndex = 'ra', decIndex = 'dec')
            for r in results:
                exps = r.split()
                if options.checkmjd and not abs(float(exps[3]) - float(row['mjd'])) < float(options.mjdtolerance):
                    continue
                # The cone search appends the separation in arcsec after the
                # original columns; convert to degrees for the report.
                _reportMatch(options, row['name'], exps[0], distance=float(exps[atlasRowLen + 1]) / 3600.0)
def setup(options):
    """Read and return the input catalogue named by options.filename."""
    return readGenericDataFile(options.filename)
def ingestData(options, inputFiles, fkDict = None):
    """Ingest delimited-text or ZTF Avro files into the local Cassandra
    keyspace, optionally joining in foreign-key columns and attaching split
    HTM 10/13/16 name columns, then inserting in parallel.

    Args:
        options: parsed command-line options (configFile, tableDelimiter,
            table, columns, fkfrominputdata, fktablecols, skiphtm,
            racol/deccol, nprocesses).
        inputFiles: list of input file names to ingest.
        fkDict: optional {foreign key: row} lookup read once by the caller.
    """
    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['cassandra']['local']['username']
    password = config['cassandra']['local']['password']
    keyspace = config['cassandra']['local']['keyspace']
    hostname = config['cassandra']['local']['hostname']

    db = {'username': username,
          'password': password,
          'keyspace': keyspace,
          'hostname': hostname}

    # Timestamp used to tag this ingest run ('minute' avoids shadowing the
    # builtin min).
    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, minute, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, minute, sec)

    # The delimiter arrives shell-escaped; map the escaped forms back to the
    # real characters.
    delimiter = options.tableDelimiter
    if delimiter == '\\s':
        delimiter = ' '
    if delimiter == '\\t':
        delimiter = '\t'

    for inputFile in inputFiles:
        print("Ingesting %s" % inputFile)

        if '.gz' in inputFile:
            # It's probably gzipped.
            # NOTE(review): binary mode; assumes the downstream readers accept
            # a bytes stream — TODO confirm.
            f = gzip.open(inputFile, 'rb')
            print(type(f).__name__)
        else:
            f = inputFile

        if 'avro' in inputFile:
            # Data is in Avro packets, with schema. Hard-wired to the ZTF
            # schema for the time being.
            avroData = readZTFAvroPacket(f, addhtm16 = True)
            if 'noncandidates' in options.table:
                data = avroData['noncandidates']
            elif 'candidates' in options.table:
                data = avroData['candidates']
            else:
                print("Error. Incorrect table definition for Avro packets. Must contain candidates or noncandidates.")
                exit(1)
        else:
            # Plain text file. No schema present, so column types must be
            # provided by the caller's configuration.
            data = readGenericDataFile(f, delimiter=delimiter, useOrderedDict=True)

        # 2021-07-29 KWS Trim the data down to the specified columns if
        # requested (raises KeyError if a requested column is absent, as
        # before).
        if options.columns:
            requestedColumns = options.columns.split(',')
            data = [{key: row[key] for key in requestedColumns} for row in data]

        foreignKey = options.fkfrominputdata
        if foreignKey == 'filename':
            foreignKey = os.path.basename(inputFile).split('.')[0]

        if fkDict:
            # Attach the foreign-key columns to every row; if the key is not
            # in fkDict the row is deliberately left untouched.
            for i in range(len(data)):
                try:
                    if options.fktablecols:
                        # Just pick out the specified keys.
                        for k in options.fktablecols.split(','):
                            data[i][k] = fkDict[foreignKey][k]
                    else:
                        # Use all the keys by default.
                        for k, v in fkDict[foreignKey].items():
                            data[i][k] = v
                except KeyError:
                    pass

        if not options.skiphtm:
            coords = [[float(row[options.racol]), float(row[options.deccol])] for row in data]
            htm16Names = htmNameBulk(16, coords)
            # For Cassandra we split the HTM name across several columns, and
            # only need to compute the deepest level: the HTM 10 name is the
            # first 12 characters of the HTM 16 name, HTM 13 the next 3, and
            # the final 3 go in the HTM 16 column.
            # e.g. ra, dec = 288.70392, 9.99498:
            #   HTM 10 = N02323033011
            #   HTM 13 = N02323033011 211
            #   HTM 16 = N02323033011 211 311
            # This hierarchy also works in binary (IDs share prefixes), so we
            # should seriously reconsider how we currently use HTMs:
            #   HTM10 ID = 13349829    = 11 00 10 11 10 11 00 11 11 00 01 01
            #   HTM13 ID = 854389093   = ... 10 01 01
            #   HTM16 ID = 54680902005 = ... 10 01 01 11 01 01
            for i in range(len(data)):
                data[i]['htm10'] = htm16Names[i][0:12]
                data[i]['htm13'] = htm16Names[i][12:15]
                data[i]['htm16'] = htm16Names[i][15:18]

        nprocesses = int(options.nprocesses)

        if len(data) > 0:
            nProcessors, listChunks = splitList(data, bins = nprocesses, preserveOrder=True)

            print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerInsert, miscParameters = [options], drainQueues = False)
            print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
def doForcedPhotometry(options, objectList, perObjectExps):
    """Run the external tphorce forced-photometry tool for every exposure of
    every candidate and collect the parsed results.

    Args:
        options: parsed options; options.tphorce is the executable to run.
        objectList: candidate dicts, each with an 'id' key.
        perObjectExps: dict keyed by candidate id with 'exps' (exposure name
            list), 'avgRa' and 'avgDec'.

    Returns:
        List of per-exposure photometry dicts. Each dict gains id, expname,
        ra, dec, snrdet, snrlimit and limiting_mag keys; 'NaN' values are
        replaced with None; a leading '>' on mag marks a limiting magnitude.
    """
    fphot = []
    for candidate in objectList:
        #print candidate['id'], perObjectExps[candidate['id']]['avgRa'], perObjectExps[candidate['id']]['avgDec']
        print("Running forced photometry for candidate", candidate['id'])
        #tphorce [options] <diffPath> <tphotOutputPath> <raDeg> <decDeg> <snrLimit>
        for exp in perObjectExps[candidate['id']]['exps']:
            # Exposure names encode the camera (first 3 chars) and MJD (next 5),
            # e.g. '02a57604o0375c'.
            camera = exp[0:3]
            mjd = exp[3:8]
            aux = ''
            imageName = ATLAS_ROOT + '/diff/' + camera + '/' + mjd + '/' + exp + '.diff.fz'
#            if camera == '02a' and int(mjd) <= 57771:
#                camera = '02a.ORIG'
            # From MJD 57350 onwards the .tph files live in an AUX/ subdirectory.
            if int(mjd) >= 57350:
                aux = 'AUX/'
            tphName = ATLAS_ROOT + '/red/' + camera + '/' + mjd + '/' + aux + exp + '.tph'
            #print TPHORCE, imageName, tphName, perObjectExps[candidate['id']]['avgRa'], perObjectExps[candidate['id']]['avgDec'], '3.0'

            # 2022-07-21 KWS Added text=True to the Popen command. Ensures that the response comes back as text.
            p = subprocess.Popen([
                options.tphorce, imageName, tphName,
                str(perObjectExps[candidate['id']]['avgRa']),
                str(perObjectExps[candidate['id']]['avgDec']),
                str(FORCED_PHOT_DET_SNR),
                str(FORCED_PHOT_LIMIT_SNR)
            ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            output, errors = p.communicate()

            if output.strip():
                # tphorce emits a single CSV row on stdout.
                csvData = readGenericDataFile(io.StringIO(output), delimiter=',')
                # There should only be one CSV row.
                data = None
                try:
                    data = csvData[0]
                except IndexError as e:
                    print("ERROR: This is not a CSV file. Output = %s" % str(output).strip())

                if data:
                    data['limiting_mag'] = False
                    # {'mjd': '57604.56804193', 'magerr': 'NaN', 'filter': 'c', 'expname': '02a57604o0375c', 'snr': 'NaN', 'mag': 'NaN', 'zp': '22.160', 'id': 1012602110150218500L}
                    # A '>' prefix on the magnitude means this is a limit, not
                    # a detection.
                    if data['mag'] and '>' in data['mag']:
                        data['mag'] = data['mag'].replace('>', '')
                        data['limiting_mag'] = True
                    data['id'] = candidate['id']
                    data['expname'] = exp
                    data['ra'] = perObjectExps[candidate['id']]['avgRa']
                    data['dec'] = perObjectExps[candidate['id']]['avgDec']
                    data['snrdet'] = FORCED_PHOT_DET_SNR
                    data['snrlimit'] = FORCED_PHOT_LIMIT_SNR

                    # Clean up the data - replace 'NaN' with None
                    for k, v in data.items():
                        if v == 'NaN':
                            data[k] = None

                    fphot.append(data)

            if errors.strip():
                print(errors)

    return fphot
def main(argv=None):
    """Entry point: fetch lightcurve data from Cassandra for one coordinate
    or a file of coordinates, optionally sub-sampled and/or processed in
    parallel.
    """
    options = Struct(**cleanOptions(docopt(__doc__, version='0.1')))

    # Historical test coordinates (random star, ATLAS17nij, ATLAS20biio,
    # ATLAS20bbio, ATLAS18vre, ATLAS19bdbm, ATLAS20bbff, ATLAS20ymv — the
    # centre object of the 10 degree sweep — and the bright foreground star
    # ATLAS17lvn) are kept in version history.

    import yaml
    with open(options.configFile) as yaml_file:
        cfg = yaml.safe_load(yaml_file)

    local = cfg['cassandra']['local']
    db = {'username': local['username'],
          'password': local['password'],
          'keyspace': local['keyspace'],
          'hostname': local['hostname']}

    # Coordinates either come from a CSV file or directly as "ra,dec".
    if options.coordsfromfile:
        coordslist = readGenericDataFile(options.coords, delimiter=',')
    else:
        parts = options.coords.split(',')
        coordslist = [{'ra': parts[0], 'dec': parts[1]}]

    # Optionally sub-sample the coordinate list.
    if options.number and int(options.number) < len(coordslist):
        coordslist = random.sample(coordslist, int(options.number))

    if int(options.nprocesses) > 1 and len(coordslist) > 1:
        # Do it in parallel!
        stamp = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
        dateAndTime = "%s%s%s_%s%s%s" % tuple(stamp.split(':'))

        nProcessors, listChunks = splitList(coordslist, bins=int(options.nprocesses), preserveOrder=True)

        print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess(db, dateAndTime, nProcessors, listChunks, worker, miscParameters=[options], drainQueues=False)
        print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
    else:
        # Single-process path: open one Cassandra session and fetch directly.
        cluster = Cluster(db['hostname'])
        session = cluster.connect()
        session.row_factory = dict_factory
        session.set_keyspace(db['keyspace'])

        getLCData(options, session, coordslist)

        cluster.shutdown()