def test_tns_watcher(self):
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=True,
    )
    log("Successfully connected")

    collection = config["database"]["collections"]["tns"]

    log("Grabbing 1 page with 5 entries from the TNS and ingesting that into the database")
    get_tns(
        grab_all=False,
        num_pages=1,
        entries_per_page=5,
    )
    log("Done")

    fetched_entries = list(mongo.db[collection].find({}, {"_id": 1}))
    assert len(fetched_entries) > 0
def mongo_fixture(request):
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=True,
    )
    log("Successfully connected")

    request.cls.mongo = mongo
def run(
    path: str = "./",
    num_processes: int = 1,
    batch_size: int = 2048,
):
    """Pre-process and ingest the IGAPS catalog

    :param path: path to the gzipped FITS data files (~98 GB tarred),
                 see http://www.star.ucl.ac.uk/IGAPS/catalogue/
    :param num_processes: number of processes for parallel ingestion
    :param batch_size: number of documents to ingest per batch
    :return:
    """
    files = glob.glob(os.path.join(path, "igaps-*.fits.gz"))

    catalog_name = "IGAPS_DR2"

    log("Connecting to DB")
    m = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
    )
    log("Successfully connected")

    # Create indexes in the database:
    log("Creating indexes")
    # 2D position on the sphere, ID:
    m.db[catalog_name].create_index(
        [("coordinates.radec_geojson", "2dsphere"), ("_id", 1)], background=True
    )

    input_list = [(f, catalog_name, batch_size) for f in files]

    with mp.Pool(processes=num_processes) as p:
        for _ in tqdm(p.istarmap(process_file, input_list), total=len(input_list)):
            pass
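# --- Hypothetical usage sketch (not part of the original module) ---
# Ingest a local IGAPS DR2 download with four worker processes; the directory
# name and process count below are illustrative assumptions.
if __name__ == "__main__":
    run(path="./igaps_dr2/", num_processes=4, batch_size=2048)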
def process_file(file, collection, batch_size):
    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected")

    collection = "VLASS_DR1"

    log(f"Processing {file}")

    names = [
        "Component_name", "RA", "DEC", "E_RA", "E_DEC",
        "Total_flux", "E_Total_flux", "Peak_flux", "E_Peak_flux",
        "Maj", "E_Maj", "Min", "E_Min",
        "Duplicate_flag", "Quality_flag",
    ]

    for chunk_index, dataframe_chunk in enumerate(
        pd.read_csv(file, chunksize=batch_size)
    ):
        log(f"{file}: processing batch # {chunk_index + 1}")

        dataframe_chunk = dataframe_chunk[names]
        dataframe_chunk = dataframe_chunk[dataframe_chunk["Duplicate_flag"] < 2]
        dataframe_chunk = dataframe_chunk[dataframe_chunk["Quality_flag"] == 0]

        batch = dataframe_chunk.to_dict(orient="records")

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["RA"]),
                    deg2dms(document["DEC"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["RA"] - 180.0, document["DEC"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        log("Successfully disconnected from db")
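# --- Hypothetical driver sketch (not part of the original module) ---
# process_file above expects (file, collection, batch_size). A minimal parallel
# driver, mirroring the IGAPS run() pattern, could look like the following;
# the CSV file pattern, directory, and process count are assumptions.
import glob
import multiprocessing as mp
import os


def run_vlass(path: str = "./", num_processes: int = 4, batch_size: int = 2048):
    files = glob.glob(os.path.join(path, "*.csv"))
    arguments = [(f, "VLASS_DR1", batch_size) for f in files]
    with mp.Pool(processes=num_processes) as pool:
        # process_file takes positional arguments, hence starmap
        pool.starmap(process_file, arguments)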
def get_ops():
    """
    Fetch and ingest ZTF ops data
    """
    # connect to MongoDB:
    print(f'{time_stamp()}: Connecting to DB.')
    mongo = Mongo(host=config['database']['host'],
                  port=config['database']['port'],
                  username=config['database']['username'],
                  password=config['database']['password'],
                  db=config['database']['db'],
                  verbose=0)
    print(f'{time_stamp()}: Successfully connected.')

    collection = 'ZTF_ops'

    print(f'{time_stamp()}: Checking indexes.')
    mongo.db[collection].create_index(
        [('coordinates.radec_geojson', '2dsphere')], background=True)
    mongo.db[collection].create_index([('utc_start', pymongo.ASCENDING),
                                       ('utc_end', pymongo.ASCENDING),
                                       ('fileroot', pymongo.ASCENDING)],
                                      background=True)
    mongo.db[collection].create_index([('jd_start', pymongo.ASCENDING),
                                       ('jd_end', pymongo.ASCENDING),
                                       ('fileroot', pymongo.ASCENDING)],
                                      background=True)
    mongo.db[collection].create_index([('jd_start', pymongo.DESCENDING),
                                       ('pid', pymongo.ASCENDING),
                                       ('field', pymongo.ASCENDING)],
                                      background=True)

    # fetch full table
    print(f'{time_stamp()}: Fetching data.')
    url = config['ztf_ops']['url']
    r = requests.get(url,
                     auth=(config['ztf_ops']['username'],
                           config['ztf_ops']['password']),
                     verify=False)
    if r.status_code == requests.codes.ok:
        with open(os.path.join(config['path']['tmp'], 'allexp.tbl'), 'wb') as f:
            f.write(r.content)
    else:
        raise Exception(f'{time_stamp()}: Failed to fetch allexp.tbl')

    latest = list(mongo.db[collection].find({}, sort=[["$natural", -1]], limit=1))

    print(f'{time_stamp()}: Loading data.')
    df = pd.read_fwf(os.path.join(config['path']['tmp'], 'allexp.tbl'),
                     comment='|',
                     header=None,
                     names=[
                         'utc_start', 'sun_elevation', 'exp', 'filter', 'type',
                         'field', 'pid', 'ra', 'dec', 'slew', 'wait',
                         'fileroot', 'programpi', 'qcomment'
                     ])

    # drop comments:
    comments = df['utc_start'] == 'UT_START'
    df = df.loc[~comments]
    for col in ['sun_elevation', 'exp', 'filter', 'field', 'pid']:
        df[col] = df[col].apply(lambda x: int(x))
    for col in ['ra', 'dec', 'slew', 'wait']:
        df[col] = df[col].apply(lambda x: float(x))
    df['utc_start'] = df['utc_start'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f'))
    df['utc_end'] = df['utc_start'].add(
        df['exp'].apply(lambda x: datetime.timedelta(seconds=x)))

    df['jd_start'] = df['utc_start'].apply(lambda x: datetime_to_jd(x))
    df['jd_end'] = df['utc_end'].apply(lambda x: datetime_to_jd(x))

    # keep only rows that are newer than the latest ingested record
    if len(latest) > 0:
        new = df['jd_start'] > latest[0].get('jd_start', 0)
        if sum(new):
            print(f'{time_stamp()}: Found {sum(new)} new records.')
            df = df.loc[new]
        else:
            # no new data? take a nap...
            print(f'{time_stamp()}: No new data found.')
            # close connection to db
            mongo.client.close()
            print(f'{time_stamp()}: Disconnected from db.')
            return

    documents = df.to_dict('records')
    documents = [mongify(doc) for doc in documents]

    print(f'{time_stamp()}: Inserting {len(documents)} documents.')
    mongo.insert_many(collection=collection, documents=documents)

    # close connection to db
    mongo.client.close()
    print(f'{time_stamp()}: Disconnected from db.')
def process_file(file, collection, batch_size):
    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected")

    collection = "IGAPS_DR2"

    log(f"Processing {file}")

    names = [
        "name", "RA", "DEC", "gal_long", "gal_lat", "sourceID", "posErr",
        "mergedClass", "pStar", "pGalaxy", "pNoise",
        "i", "iErr", "iAB", "iEll", "iClass", "iDeblend", "iSaturated",
        "iVignetted", "iTrail", "iTruncated", "iBadPix", "iMJD", "iSeeing",
        "iDetectionID", "iDeltaRA", "iDeltaDEC",
        "ha", "haErr", "haAB", "haEll", "haClass", "haDeblend", "haSaturated",
        "haVignetted", "haTrail", "haTruncated", "haBadPix", "haMJD",
        "haSeeing", "haDetectionID", "haDeltaRA", "haDeltaDEC",
        "r_I", "rErr_I", "rAB_I", "rEll_I", "rClass_I", "rDeblend_I",
        "rSaturated_I", "rVignetted_I", "rTrail_I", "rTruncated_I",
        "rBadPix_I", "rMJD_I", "rSeeing_I", "rDetectionID_I",
        "r_U", "rErr_U", "rAB_U", "rEll_U", "rClass_U", "rDeblend_U",
        "rSaturated_U", "rVignetted_U", "rTrail_U", "rTruncated_U",
        "rBadPix_U", "rMJD_U", "rSeeing_U", "rDetectionID_U",
        "rDeltaRA_U", "rDeltaDEC_U",
        "g", "gErr", "gAB", "gEll", "gClass", "gDeblend", "gSaturated",
        "gVignetted", "gTrail", "gTruncated", "gBadPix", "gmask", "gMJD",
        "gSeeing", "gDetectionID", "gDeltaRA", "gDeltaDEC",
        "U_RGO", "UErr", "UEll", "UClass", "UDeblend", "USaturated",
        "UVignetted", "UTrail", "UTruncated", "UBadPix", "UMJD", "USeeing",
        "UDetectionID", "UDeltaRA", "UDeltaDEC",
        "brightNeighb", "deblend", "saturated", "nBands", "errBits",
        "nObs_I", "nObs_U", "fieldID_I", "fieldID_U",
        "fieldGrade_I", "fieldGrade_U", "emitter", "variable", "SourceID2",
        "i2", "i2Err", "i2Class", "i2Seeing", "i2MJD", "i2DeltaRA",
        "i2DeltaDEC", "i2DetectionID", "i2ErrBits",
        "ha2", "ha2Err", "ha2Class", "ha2Seeing", "ha2MJD", "ha2DeltaRA",
        "ha2DeltaDEC", "ha2DetectionID", "ha2ErrBits",
        "r2_I", "r2Err_I", "r2Class_I", "r2Seeing_I", "r2MJD_I",
        "r2DeltaRA_I", "r2DeltaDEC_I", "r2DetectionID_I", "r2ErrBits_I",
        "r2_U", "r2Err_U", "r2Class_U", "r2Seeing_U", "r2MJD_U",
        "r2DeltaRA_U", "r2DeltaDEC_U", "r2DetectionID_U", "r2ErrBits_U",
        "g2", "g2Err", "g2Class", "g2Seeing", "g2MJD", "g2DeltaRA",
        "g2DeltaDEC", "g2DetectionID", "g2ErrBits",
        "U_RGO2", "U2Err", "U2Class", "U2Seeing", "U2MJD", "U2DeltaRA",
        "U2DeltaDEC", "U2DetectionID", "U2ErrBits",
        "errBits2",
    ]

    with fits.open(file) as hdulist:
        nhdu = 1
        dataframe = pd.DataFrame(np.asarray(hdulist[nhdu].data), columns=names)

    for chunk_index, dataframe_chunk in dataframe.groupby(
        np.arange(len(dataframe)) // batch_size
    ):
        log(f"{file}: processing batch # {chunk_index + 1}")

        # decode byte-string columns:
        for col, dtype in dataframe_chunk.dtypes.items():
            if dtype == object:
                dataframe_chunk[col] = dataframe_chunk[col].apply(
                    lambda x: x.decode("utf-8")
                )

        batch = dataframe_chunk.fillna("DROPMEPLEASE").to_dict(orient="records")

        # pop nulls - save space
        batch = [
            {
                key: value
                for key, value in document.items()
                if value != "DROPMEPLEASE"
            }
            for document in batch
        ]

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["RA"]),
                    deg2dms(document["DEC"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["RA"] - 180.0, document["DEC"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        log("Successfully disconnected from db")
def get_tns(grab_all: bool = False, num_pages: int = 10, entries_per_page: int = 100):
    """
    Queries the TNS and obtains the sources reported to it.

    :param grab_all: grab the complete database from TNS? takes a while!
    :param num_pages: grab the last <num_pages> pages
    :param entries_per_page: number of entries per page to grab
    :return:
    """
    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected")

    collection = config["database"]["collections"]["tns"]

    if config["database"]["build_indexes"]:
        log("Checking indexes")
        for index in config["database"]["indexes"][collection]:
            try:
                ind = [tuple(ii) for ii in index["fields"]]
                mongo.db[collection].create_index(
                    keys=ind,
                    name=index["name"],
                    background=True,
                    unique=index["unique"],
                )
            except Exception as e:
                log(e)

    log("Fetching data...")

    if grab_all:
        # grab the latest data (5 is the minimum):
        url = os.path.join(config["tns"]["url"], "search?format=csv&num_page=5&page=0")
        data = pd.read_csv(url)
        num_pages = data["ID"].max() // entries_per_page

    for num_page in range(num_pages):
        log(f"Digesting page #{num_page + 1} of {num_pages}...")

        url = os.path.join(
            config["tns"]["url"],
            f"search?format=csv&num_page={entries_per_page}&page={num_page}",
        )

        # 20210114: wis-tns.org has issues with their certificate
        csv_data = requests.get(url, allow_redirects=False, timeout=60).content
        data = pd.read_csv(io.StringIO(csv_data.decode("utf-8")))

        for index, row in data.iterrows():
            try:
                doc = mongify(row)
                doc_id = doc.pop("_id", None)
                if doc_id:
                    mongo.update_one(
                        collection=collection,
                        filt={"_id": doc_id},
                        update={"$set": doc},
                        upsert=True,
                    )
            except Exception as e:
                log(str(e))
                log(traceback.format_exc())

    # close connection to db
    mongo.client.close()
    log("Disconnected from db")
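# --- Hypothetical usage sketch (not part of the original module) ---
# Pull the most recent ten pages of TNS entries, 100 entries per page, into the
# collection configured under config["database"]["collections"]["tns"].
if __name__ == "__main__":
    get_tns(grab_all=False, num_pages=10, entries_per_page=100)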
def test_ingester(self):
    init_db_sync(config=config, verbose=True)

    log("Setting up paths")
    # path_kafka = pathlib.Path(config["path"]["kafka"])
    path_logs = pathlib.Path(config["path"]["logs"])
    if not path_logs.exists():
        path_logs.mkdir(parents=True, exist_ok=True)

    if config["misc"]["broker"]:
        log("Setting up test groups and filters in Fritz")
        program = Program(group_name="FRITZ_TEST", group_nickname="test")
        Filter(
            collection="ZTF_alerts",
            group_id=program.group_id,
            filter_id=program.filter_id,
        )

        program2 = Program(group_name="FRITZ_TEST_AUTOSAVE", group_nickname="test2")
        Filter(
            collection="ZTF_alerts",
            group_id=program2.group_id,
            filter_id=program2.filter_id,
            autosave=True,
            pipeline=[{"$match": {"objectId": "ZTF20aaelulu"}}],
        )

        program3 = Program(
            group_name="FRITZ_TEST_UPDATE_ANNOTATIONS", group_nickname="test3"
        )
        Filter(
            collection="ZTF_alerts",
            group_id=program3.group_id,
            filter_id=program3.filter_id,
            update_annotations=True,
            pipeline=[
                {"$match": {"objectId": "ZTF20aapcmur"}}
            ],  # there are 3 alerts in the test set for this oid
        )

    # clean up old Kafka logs
    log("Cleaning up Kafka logs")
    subprocess.run(["rm", "-rf", path_logs / "kafka-logs", "/tmp/zookeeper"])

    log("Starting up ZooKeeper at localhost:2181")

    # start ZooKeeper in the background
    cmd_zookeeper = [
        os.path.join(config["path"]["kafka"], "bin", "zookeeper-server-start.sh"),
        "-daemon",
        os.path.join(config["path"]["kafka"], "config", "zookeeper.properties"),
    ]

    with open(path_logs / "zookeeper.stdout", "w") as stdout_zookeeper:
        # p_zookeeper =
        subprocess.run(
            cmd_zookeeper, stdout=stdout_zookeeper, stderr=subprocess.STDOUT
        )

    # take a nap while it fires up
    time.sleep(3)

    log("Starting up Kafka Server at localhost:9092")

    # start the Kafka server:
    cmd_kafka_server = [
        os.path.join(config["path"]["kafka"], "bin", "kafka-server-start.sh"),
        "-daemon",
        os.path.join(config["path"]["kafka"], "config", "server.properties"),
    ]

    with open(
        os.path.join(config["path"]["logs"], "kafka_server.stdout"), "w"
    ) as stdout_kafka_server:
        # p_kafka_server = subprocess.Popen(cmd_kafka_server, stdout=stdout_kafka_server, stderr=subprocess.STDOUT)
        # p_kafka_server =
        subprocess.run(cmd_kafka_server)

    # take a nap while it fires up
    time.sleep(3)

    # get kafka topic names with kafka-topics command
    cmd_topics = [
        os.path.join(config["path"]["kafka"], "bin", "kafka-topics.sh"),
        "--zookeeper",
        config["kafka"]["zookeeper.test"],
        "-list",
    ]

    topics = (
        subprocess.run(cmd_topics, stdout=subprocess.PIPE)
        .stdout.decode("utf-8")
        .split("\n")[:-1]
    )
    log(f"Found topics: {topics}")

    # create a test ZTF topic for the current UTC date
    date = datetime.datetime.utcnow().strftime("%Y%m%d")
    topic_name = f"ztf_{date}_programid1_test"

    if topic_name in topics:
        # topic previously created? remove first
        cmd_remove_topic = [
            os.path.join(config["path"]["kafka"], "bin", "kafka-topics.sh"),
            "--zookeeper",
            config["kafka"]["zookeeper.test"],
            "--delete",
            "--topic",
            topic_name,
        ]
        # print(kafka_cmd)
        remove_topic = (
            subprocess.run(cmd_remove_topic, stdout=subprocess.PIPE)
            .stdout.decode("utf-8")
            .split("\n")[:-1]
        )
        log(f"{remove_topic}")
        log(f"Removed topic: {topic_name}")
        time.sleep(1)

    if topic_name not in topics:
        log(f"Creating topic {topic_name}")

        cmd_create_topic = [
            os.path.join(config["path"]["kafka"], "bin", "kafka-topics.sh"),
            "--create",
            "--bootstrap-server",
            config["kafka"]["bootstrap.test.servers"],
            "--replication-factor",
            "1",
            "--partitions",
            "1",
            "--topic",
            topic_name,
        ]
        with open(
            os.path.join(config["path"]["logs"], "create_topic.stdout"), "w"
        ) as stdout_create_topic:
            # p_create_topic = \
            subprocess.run(
                cmd_create_topic,
                stdout=stdout_create_topic,
                stderr=subprocess.STDOUT,
            )

    log("Starting up Kafka Producer")

    # spin up Kafka producer
    producer = Producer(
        {"bootstrap.servers": config["kafka"]["bootstrap.test.servers"]}
    )

    # small number of alerts that come with kowalski
    path_alerts = pathlib.Path("/app/data/ztf_alerts/20200202/")

    # grab some more alerts from gs://ztf-fritz/sample-public-alerts
    try:
        log("Grabbing more alerts from gs://ztf-fritz/sample-public-alerts")
        r = requests.get("https://www.googleapis.com/storage/v1/b/ztf-fritz/o")
        aa = r.json()["items"]
        ids = [pathlib.Path(a["id"]).parent for a in aa if "avro" in a["id"]]
    except Exception as e:
        log(
            "Grabbing alerts from gs://ztf-fritz/sample-public-alerts failed, but it is ok"
        )
        log(f"{e}")
        ids = []

    subprocess.run(
        [
            "gsutil",
            "-m",
            "cp",
            "-n",
            "gs://ztf-fritz/sample-public-alerts/*.avro",
            "/app/data/ztf_alerts/20200202/",
        ]
    )
    log(f"Fetched {len(ids)} alerts from gs://ztf-fritz/sample-public-alerts")

    # push!
    for p in path_alerts.glob("*.avro"):
        with open(str(p), "rb") as data:
            # Trigger any available delivery report callbacks from previous produce() calls
            producer.poll(0)

            log(f"Pushing {p}")

            # Asynchronously produce a message, the delivery report callback
            # will be triggered from poll() above, or flush() below, when the message has
            # been successfully delivered or failed permanently.
            producer.produce(topic_name, data.read(), callback=delivery_report)

    # Wait for any outstanding messages to be delivered and delivery report
    # callbacks to be triggered.
    producer.flush()

    log("Starting up Ingester")
    # digest and ingest
    watchdog(obs_date=date, test=True)
    log("Digested and ingested: all done!")

    # shut down Kafka server and ZooKeeper
    time.sleep(20)

    log("Shutting down Kafka Server at localhost:9092")
    # stop the Kafka server:
    cmd_kafka_server_stop = [
        os.path.join(config["path"]["kafka"], "bin", "kafka-server-stop.sh"),
        os.path.join(config["path"]["kafka"], "config", "server.properties"),
    ]

    with open(
        os.path.join(config["path"]["logs"], "kafka_server.stdout"), "w"
    ) as stdout_kafka_server:
        # p_kafka_server_stop = \
        subprocess.run(
            cmd_kafka_server_stop,
            stdout=stdout_kafka_server,
            stderr=subprocess.STDOUT,
        )

    log("Shutting down ZooKeeper at localhost:2181")
    cmd_zookeeper_stop = [
        os.path.join(config["path"]["kafka"], "bin", "zookeeper-server-stop.sh"),
        os.path.join(config["path"]["kafka"], "config", "zookeeper.properties"),
    ]

    with open(
        os.path.join(config["path"]["logs"], "zookeeper.stdout"), "w"
    ) as stdout_zookeeper:
        # p_zookeeper_stop = \
        subprocess.run(
            cmd_zookeeper_stop, stdout=stdout_zookeeper, stderr=subprocess.STDOUT
        )

    log("Checking the ZTF alert collection states")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=True,
    )
    collection_alerts = config["database"]["collections"]["alerts_ztf"]
    collection_alerts_aux = config["database"]["collections"]["alerts_ztf_aux"]

    n_alerts = mongo.db[collection_alerts].count_documents({})
    assert n_alerts == 313

    n_alerts_aux = mongo.db[collection_alerts_aux].count_documents({})
    assert n_alerts_aux == 145

    if config["misc"]["broker"]:
        log("Checking that posting to SkyPortal succeeded")

        # check number of candidates that passed the first filter
        resp = requests.get(
            program.base_url + f"/api/candidates?groupIDs={program.group_id}",
            headers=program.headers,
            timeout=3,
        )

        assert resp.status_code == requests.codes.ok
        result = resp.json()
        assert result["status"] == "success"
        assert "data" in result
        assert "totalMatches" in result["data"]
        assert result["data"]["totalMatches"] == 88

        # check that the only candidate that passed the second filter (ZTF20aaelulu) got saved as a Source
        resp = requests.get(
            program2.base_url + f"/api/sources?group_ids={program2.group_id}",
            headers=program2.headers,
            timeout=3,
        )

        assert resp.status_code == requests.codes.ok
        result = resp.json()
        assert result["status"] == "success"
        assert "data" in result
        assert "totalMatches" in result["data"]
        assert result["data"]["totalMatches"] == 1
        assert "sources" in result["data"]
        assert result["data"]["sources"][0]["id"] == "ZTF20aaelulu"
syuzhet = pd.read_csv("lexicons/syuzhet.csv")
syuzhet.drop_duplicates('word', inplace=True)

# nrc2 differs from the original NRC lexicon: the 'trump', 'don' and 'jhon'
# keywords have been removed
nrc = pd.read_csv("lexicons/nrc2.csv",
                  header=0,
                  names=[
                      u'word', u'anger', u'anticipation', u'disgust', u'fear',
                      u'joy', u'negative', u'positive', u'sadness',
                      u'surprise', u'trust'
                  ])  # use modified lexicon
nrc.drop_duplicates('word', inplace=True)
nrc['value'] = nrc['positive'] - nrc['negative']

# Load data from Mongo
mongo = Mongo('facebook', 'comments')
docs = [doc for doc in mongo.collection.find()]
mongo.close()

mongo_ids = [doc.pop('_id', None) for doc in docs]  # exclude mongo generated ids

docs = d_to_df(docs)
docs['created_time'] = pd.to_datetime(docs['created_time'],
                                      format="%Y-%m-%dT%H:%M:%S+0000")
docs.set_index('created_time', inplace=True)
docs.drop_duplicates(['message', 'user.name', 'post_id'], inplace=True)

docs['n_sents'] = docs.message.apply(lambda x: len(sent_tokenize(x)))
docs['n_words'] = docs.message.apply(lambda x: len(tokenize.word_tokenize(x)))
docs = docs[docs['n_sents'] != 0].copy()

mongo = Mongo('facebook', 'posts')
posts = [doc for doc in mongo.collection.find()]
help="batch size for ingestion") args = parser.parse_args() path = pathlib.Path(args.path) files = list(path.glob("Gaia*.csv")) catalog_name = "Gaia_EDR3" log("Connecting to DB") m = Mongo( host=config["database"]["host"], port=config["database"]["port"], replica_set=config["database"]["replica_set"], username=config["database"]["username"], password=config["database"]["password"], db=config["database"]["db"], verbose=args.v, ) log("Successfully connected") # Create indexes in the database: log("Creating indexes") # 2D position on the sphere, ID: m.db[catalog_name].create_index([("coordinates.radec_geojson", "2dsphere"), ("_id", 1)], background=True) m.db[catalog_name].create_index([("ra", 1), ("dec", 1), ("parallax", 1)], background=True) m.db[catalog_name].create_index(
def get_ops():
    """
    Fetch and ingest ZTF ops data
    """
    # connect to MongoDB:
    print(f"{time_stamp()}: Connecting to DB.")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    print(f"{time_stamp()}: Successfully connected.")

    collection = "ZTF_ops"

    print(f"{time_stamp()}: Checking indexes.")
    mongo.db[collection].create_index(
        [("coordinates.radec_geojson", "2dsphere")], background=True
    )
    mongo.db[collection].create_index(
        [
            ("utc_start", pymongo.ASCENDING),
            ("utc_end", pymongo.ASCENDING),
            ("fileroot", pymongo.ASCENDING),
        ],
        background=True,
    )
    mongo.db[collection].create_index(
        [
            ("jd_start", pymongo.ASCENDING),
            ("jd_end", pymongo.ASCENDING),
            ("fileroot", pymongo.ASCENDING),
        ],
        background=True,
    )
    mongo.db[collection].create_index(
        [
            ("jd_start", pymongo.DESCENDING),
            ("pid", pymongo.ASCENDING),
            ("field", pymongo.ASCENDING),
        ],
        background=True,
    )

    # fetch full table
    print(f"{time_stamp()}: Fetching data.")
    url = config["ztf_ops"]["url"]
    r = requests.get(
        url,
        auth=(config["ztf_ops"]["username"], config["ztf_ops"]["password"]),
        verify=False,
    )
    if r.status_code == requests.codes.ok:
        with open(os.path.join(config["path"]["tmp"], "allexp.tbl"), "wb") as f:
            f.write(r.content)
    else:
        raise Exception(f"{time_stamp()}: Failed to fetch allexp.tbl")

    latest = list(mongo.db[collection].find({}, sort=[["$natural", -1]], limit=1))

    print(f"{time_stamp()}: Loading data.")
    df = pd.read_fwf(
        os.path.join(config["path"]["tmp"], "allexp.tbl"),
        comment="|",
        widths=[22, 4, 6, 4, 5, 8, 4, 9, 9, 7, 8, 29, 11, 25],
        header=None,
        names=[
            "utc_start",
            "sun_elevation",
            "exp",
            "filter",
            "type",
            "field",
            "pid",
            "ra",
            "dec",
            "slew",
            "wait",
            "fileroot",
            "programpi",
            "qcomment",
        ],
    )

    # drop comments:
    comments = df["utc_start"] == "UT_START"
    df = df.loc[~comments]
    for col in ["sun_elevation", "exp", "filter", "field", "pid"]:
        df[col] = df[col].apply(lambda x: int(x))
    for col in ["ra", "dec", "slew", "wait"]:
        df[col] = df[col].apply(lambda x: float(x))
    df["utc_start"] = df["utc_start"].apply(
        lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f")
    )
    df["utc_end"] = df["utc_start"].add(
        df["exp"].apply(lambda x: datetime.timedelta(seconds=x))
    )

    df["jd_start"] = df["utc_start"].apply(lambda x: datetime_to_jd(x))
    df["jd_end"] = df["utc_end"].apply(lambda x: datetime_to_jd(x))

    # keep only rows that are newer than the latest ingested record
    if len(latest) > 0:
        new = df["jd_start"] > latest[0].get("jd_start", 0)
        if sum(new):
            print(f"{time_stamp()}: Found {sum(new)} new records.")
            df = df.loc[new]
        else:
            # no new data? take a nap...
            print(f"{time_stamp()}: No new data found.")
            # close connection to db
            mongo.client.close()
            print(f"{time_stamp()}: Disconnected from db.")
            return

    documents = df.to_dict("records")
    documents = [mongify(doc) for doc in documents]

    print(f"{time_stamp()}: Inserting {len(documents)} documents.")
    mongo.insert_many(collection=collection, documents=documents)

    # close connection to db
    mongo.client.close()
    print(f"{time_stamp()}: Disconnected from db.")
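# --- Hypothetical usage sketch (not part of the original script) ---
# get_ops() takes no arguments and only ingests records newer than the latest
# one already in ZTF_ops, so it can safely be run on a schedule. The hourly
# loop below is an illustrative assumption, not part of the original code.
import time

if __name__ == "__main__":
    while True:
        get_ops()
        time.sleep(3600)  # assumed polling interval; adjust as needed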
def run(
    path: str,
    num_proc: int = multiprocessing.cpu_count(),
    batch_size: int = 2048,
    rm: bool = False,
    dry_run: bool = False,
):
    """Preprocess and ingest PTF matchfiles into Kowalski

    :param path: local path to matchfiles
    :param num_proc: number of processes for parallel ingestion
    :param batch_size: batch size for light curve data ingestion
    :param rm: remove matchfiles after ingestion?
    :param dry_run: dry run?
    :return:
    """
    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected to DB")

    collections = {
        "exposures": "PTF_exposures",
        "sources": "PTF_sources",
    }

    # create indices:
    log("Creating indices")
    if not dry_run:
        mongo.db[collections["exposures"]].create_index(
            [("expid", pymongo.ASCENDING)], background=True
        )
        mongo.db[collections["sources"]].create_index(
            [("coordinates.radec_geojson", "2dsphere"), ("_id", pymongo.ASCENDING)],
            background=True,
        )
        mongo.db[collections["sources"]].create_index(
            [
                ("field", pymongo.ASCENDING),
                ("ccd", pymongo.ASCENDING),
                ("quad", pymongo.ASCENDING),
            ],
            background=True,
        )
        mongo.db[collections["sources"]].create_index(
            [("nobs", pymongo.ASCENDING), ("_id", pymongo.ASCENDING)], background=True
        )

    files = [str(f) for f in pathlib.Path(path).glob("PTF_*.pytable")]

    log(f"# files to process: {len(files)}")

    input_list = [(f, collections, batch_size, rm, dry_run) for f in sorted(files)]
    # for a more even job distribution:
    random.shuffle(input_list)

    with multiprocessing.Pool(processes=num_proc) as pool:
        for _ in tqdm(pool.imap(process_file, input_list), total=len(files)):
            pass
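# --- Hypothetical usage sketch (not part of the original module) ---
# Ingest a directory of PTF matchfiles with all available cores; with
# dry_run=True, index creation and the actual inserts are skipped, which is
# handy for validating the matchfiles first. The path below is an assumption.
if __name__ == "__main__":
    run(
        path="./ptf_matchfiles/",
        num_proc=multiprocessing.cpu_count(),
        batch_size=2048,
        rm=False,
        dry_run=True,
    )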
def process_file(argument_list: Sequence):
    file_name, collections, batch_size, rm_file, dry_run = argument_list
    try:
        # connect to MongoDB:
        mongo = Mongo(
            host=config["database"]["host"],
            port=config["database"]["port"],
            replica_set=config["database"]["replica_set"],
            username=config["database"]["username"],
            password=config["database"]["password"],
            db=config["database"]["db"],
            verbose=0,
        )

        with tables.open_file(file_name, "r+") as f:
            # after this loop, `group` points at the last group in the file
            for group in f.walk_groups():
                pass

            ff_basename = pathlib.Path(file_name).name

            # base id:
            _, field, filt, ccd, _, _ = ff_basename.split("_")
            field = int(field[1:])
            filt = int(filt[1:])
            readout_channel = int(ccd[1:])
            baseid = int(1e13 + field * 1e9 + readout_channel * 1e7 + filt * 1e6)
            exposure_baseid = int(
                1e16 + field * 1e12 + readout_channel * 1e10 + filt * 1e9
            )

            def clean_up_document(group):
                """Format passed in dicts for Mongo insertion"""
                document = {}
                for k, v in group.items():
                    if k == "matchedSourceID":
                        document[k] = group[k]
                        continue
                    if k in sources_int_fields:
                        document[k] = [
                            int(group[k][key2]) for key2 in group[k].keys()
                        ]
                    else:
                        document[k] = [
                            float(group[k][key2]) for key2 in group[k].keys()
                        ]

                # document["ra"] = document["ra"][0]
                # document["dec"] = document["dec"][0]

                # generate unique _id:
                document["_id"] = baseid + document["matchedSourceID"]
                document["filter"] = filt
                document["field"] = field
                document["ccd"] = ccd

                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                _ra = np.median(document["ra"])
                _dec = np.median(document["dec"])
                _radec_str = [deg2hms(_ra), deg2dms(_dec)]
                document["coordinates"]["radec_str"] = _radec_str
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [_ra - 180.0, _dec]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }

                document["data"] = []
                for t, m, e, f, _ra, _dec in zip(
                    document["mjd"],
                    document["mag"],
                    document["magErr"],
                    document["ipacFlags"],
                    document["ra"],
                    document["dec"],
                ):
                    data_point = {
                        "mjd": t,
                        "mag": m,
                        "magerr": e,
                        "ipacflags": f,
                        "ra": _ra,
                        "dec": _dec,
                    }
                    # convert types for pymongo:
                    for k, v in data_point.items():
                        if k in sourcedata_int_fields:
                            data_point[k] = int(data_point[k])
                        else:
                            data_point[k] = float(data_point[k])
                            if k == "mjd":
                                data_point[k] = round(data_point[k], 5)
                            elif k not in ("ra", "dec"):
                                data_point[k] = round(data_point[k], 3)
                    document["data"].append(data_point)

                del (
                    document["mjd"],
                    document["mag"],
                    document["magErr"],
                    document["ipacFlags"],
                    document["ra"],
                    document["dec"],
                )
                document["data"].sort(key=lambda x: x["mjd"])

                return document

            exposures = pd.DataFrame.from_records(group.exposures[:])
            # prepare docs to ingest into db:
            docs_exposures = []
            for index, row in exposures.iterrows():
                try:
                    doc = row.to_dict()
                    # unique exposure id:
                    doc["_id"] = exposure_baseid + doc["expid"]
                    doc["matchfile"] = ff_basename
                    doc["filter"] = filt
                    doc["field"] = field
                    doc["ccd"] = ccd
                    docs_exposures.append(doc)
                except Exception as exception:
                    log(str(exception))

            # ingest exposures in one go:
            if not dry_run:
                mongo.insert_many(
                    collection=collections["exposures"], documents=docs_exposures
                )

            sources = pd.DataFrame.from_records(
                group["sources"].read(),
                index="matchedSourceID",
                exclude=sources_fields_to_exclude,
            )
            sourcedatas = pd.DataFrame.from_records(
                group["sourcedata"].read(),
                index="matchedSourceID",
                exclude=sourcedata_to_exclude,
            )

            merged = sources.merge(sourcedatas, left_index=True, right_index=True)
            groups = merged.groupby("matchedSourceID")

            # light curves
            docs_sources = []
            batch_num = 1
            # fixme? skip transients
            for row, group in groups:
                try:
                    groupdict = group.to_dict()
                    groupdict["matchedSourceID"] = row
                    current_doc = clean_up_document(groupdict)
                    docs_sources.append(current_doc)
                except Exception as exception:
                    log(str(exception))

                # ingest in batches
                try:
                    if len(docs_sources) % batch_size == 0 and len(docs_sources) != 0:
                        if not dry_run:
                            mongo.insert_many(
                                collection=collections["sources"],
                                documents=docs_sources,
                            )
                        # flush:
                        docs_sources = []
                        batch_num += 1
                except Exception as exception:
                    log(str(exception))

        # ingest remaining
        while len(docs_sources) > 0:
            try:
                # In case mongo crashed and disconnected, docs will accumulate in documents
                # keep on trying to insert them until successful
                if not dry_run:
                    mongo.insert_many(
                        collection=collections["sources"], documents=docs_sources
                    )
                # flush:
                docs_sources = []
            except Exception as e:
                traceback.print_exc()
                log(e)
                log("Failed, waiting 5 seconds to retry")
                time.sleep(5)

        mongo.client.close()

    except Exception as e:
        traceback.print_exc()
        log(e)
        # if there was an error, return without potentially deleting the file
        return

    try:
        if rm_file:
            os.remove(file_name)
    finally:
        pass
import json

import pandas as pd
import tushare as ts
from bson.json_util import dumps
from datetime import date as Date
from datetime import timedelta as Period

from utils import Mongo as mongo_utils
from utils import Stock as stock_utils

client = mongo_utils.get_mongo_client()
orig_db = client.tickdata
db = client.stock

start_date = Date(2018, 1, 13)
end_date = Date(2016, 1, 1)

TICK_COLLECTION = "tickdata"
PAUSE = 10
RETRY = 10000

for col in orig_db.collection_names():
    if '_' in col:
        code = col.split('_')[0]
        date = col.split('_')[1]
        print("start processing, code: " + code + ", date: " + date)
        date = Date(int(date[0:4]), int(date[4:6]), int(date[6:8]))
        if date < start_date and date > end_date:
            cursor = orig_db[col].find({})
        post_id,
        fields='reactions.type(LIKE).limit(0).summary(total_count).as(like),reactions.type(LOVE).limit(0).summary(total_count).as(love),reactions.type(WOW).limit(0).summary(total_count).as(wow),reactions.type(HAHA).limit(0).summary(total_count).as(haha),reactions.type(SAD).limit(0).summary(total_count).as(sad),reactions.type(ANGRY).limit(0).summary(total_count).as(angry),reactions.type(THANKFUL).limit(0).summary(total_count).as(thankful),reactions.type(NONE).limit(0).summary(total_count).as(total)'
    )
    post.update(reactions)
    post = process_post(post)
    posts_list.append(post)

# Adding extra bits from the Guardian
for post in posts_list:
    extra = get_extra(post['article_url'])
    if extra:
        post.update(extra)

# Inserting posts collected in Mongo
mongo = Mongo('facebook', 'posts')
for post in posts_list:
    mongo.process_item(post)
mongo.close()
del mongo

# Collect Comments data
for idx, post in enumerate(posts_list):
    post_id = post['post_id']
    print("Extracting %d comments for post %d ..." % (post['comment_count'], idx))
    comments = graph.get_all_connections(
        post_id,
        'comments',
        limit=100,
        fields='created_time,from,like_count,message,id,comment_count')
import json

import tushare as ts
from datetime import date as Date
from datetime import timedelta as Period

from utils import Mongo as mongo_utils
from utils import Stock as stock_utils

client = mongo_utils.get_mongo_client()
db = client.stock

start_date = Date(2018, 1, 12)
end_date = Date(2017, 12, 1)

PAUSE = 10
RETRY = 10000

cursor = db.tickdata.find({'code': '002117', 'date': '2017-11-02'})
data = mongo_utils.convert_cursor_to_dataframe(cursor)
print(data)
def process_file(args):
    file, collection, batch_size, rm, verbose = args

    # connect to MongoDB:
    if verbose:
        log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    if verbose:
        log("Successfully connected")

    collection = "Gaia_EDR3"

    if verbose:
        log(f"Processing {file}")

    for chunk_index, dataframe_chunk in enumerate(
        pd.read_csv(file, chunksize=batch_size)
    ):
        if verbose:
            log(f"{file}: processing batch # {chunk_index + 1}")

        dataframe_chunk["_id"] = dataframe_chunk["source_id"].apply(lambda x: str(x))

        batch = dataframe_chunk.fillna("DROPMEPLEASE").to_dict(orient="records")

        # pop nulls - save space
        batch = [
            {
                key: value
                for key, value in document.items()
                if value not in ("DROPMEPLEASE", "NOT_AVAILABLE")
            }
            for document in batch
        ]

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["ra"]),
                    deg2dms(document["dec"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["ra"] - 180.0, document["dec"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                if verbose:
                    log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            if verbose:
                log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        if verbose:
            log("Successfully disconnected from db")

    # clean up:
    if rm:
        os.remove(file)
        if verbose:
            log(f"Successfully removed {file}")
def process_file(argument_list: Sequence):
    file_name, collections, batch_size, rm_file, dry_run = argument_list
    try:
        # connect to MongoDB:
        mongo = Mongo(
            host=config["database"]["host"],
            port=config["database"]["port"],
            replica_set=config["database"]["replica_set"],
            username=config["database"]["username"],
            password=config["database"]["password"],
            db=config["database"]["db"],
            verbose=0,
        )

        with tables.open_file(file_name, "r+") as f:
            group = f.root.matches

            ff_basename = pathlib.Path(file_name).name

            # base id:
            _, field, filt, ccd, quadrant, _ = ff_basename.split("_")
            field = int(field)
            filt = filters[filt]
            ccd = int(ccd[1:])
            quadrant = int(quadrant[1:])

            readout_channel = ccd_quad_to_rc(ccd=ccd, quad=quadrant)
            baseid = int(1e13 + field * 1e9 + readout_channel * 1e7 + filt * 1e6)
            exposure_baseid = int(
                1e16 + field * 1e12 + readout_channel * 1e10 + filt * 1e9
            )

            def clean_up_document(document):
                """Format passed in dicts for Mongo insertion"""
                # convert types for pymongo:
                for k, v in document.items():
                    if k != "data":
                        if k in sources_int_fields:
                            document[k] = int(document[k])
                        else:
                            document[k] = float(document[k])
                            if k not in ("ra", "dec"):
                                # this will save a lot of space:
                                document[k] = round(document[k], 3)

                # generate unique _id:
                document["_id"] = baseid + document["matchid"]
                document["filter"] = filt
                document["field"] = field
                document["ccd"] = ccd
                document["quad"] = quadrant
                document["rc"] = readout_channel

                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                _ra = document["ra"]
                _dec = document["dec"]
                _radec_str = [deg2hms(_ra), deg2dms(_dec)]
                document["coordinates"]["radec_str"] = _radec_str
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [_ra - 180.0, _dec]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }

                document["data"].sort(key=lambda x: x["hjd"])
                for data_point in document["data"]:
                    # convert types for pymongo:
                    for k, v in data_point.items():
                        if k in sourcedata_int_fields:
                            data_point[k] = int(data_point[k])
                        else:
                            data_point[k] = float(data_point[k])
                            if k not in ("ra", "dec", "hjd"):
                                data_point[k] = round(data_point[k], 3)
                            elif k == "hjd":
                                data_point[k] = round(data_point[k], 5)
                    # generate unique exposure id's that match _id's in exposures collection
                    data_point["uexpid"] = exposure_baseid + data_point["expid"]

                return document

            exposures = pd.DataFrame.from_records(group.exposures[:])
            # prepare docs to ingest into db:
            docs_exposures = []
            for index, row in exposures.iterrows():
                try:
                    doc = row.to_dict()
                    # unique exposure id:
                    doc["_id"] = exposure_baseid + doc["expid"]
                    doc["matchfile"] = ff_basename
                    doc["filter"] = filt
                    doc["field"] = field
                    doc["ccd"] = ccd
                    doc["quad"] = quadrant
                    doc["rc"] = readout_channel
                    docs_exposures.append(doc)
                except Exception as exception:
                    log(str(exception))

            # ingest exposures in one go:
            if not dry_run:
                mongo.insert_many(
                    collection=collections["exposures"], documents=docs_exposures
                )

            # light curves
            docs_sources = []
            batch_num = 1
            # fixme? skip transients
            # for source_type in ('source', 'transient'):
            for source_type in ("source",):
                sources = pd.DataFrame.from_records(
                    group[f"{source_type}s"].read(),
                    index="matchid",
                    exclude=sources_fields_to_exclude,
                )

                # Load in percentiles separately to compute the IQR column
                # because Pandas DF from_records() only wants 2-D tables
                percentiles = group[f"{source_type}s"].col("percentiles")
                # Ignore float errors due to infinity values
                old_settings = np.seterr(all="ignore")
                iqr = np.round(percentiles[:, 8] - percentiles[:, 3], 3)
                np.seterr(**old_settings)
                sources["iqr"] = iqr

                sourcedatas = pd.DataFrame.from_records(
                    group[f"{source_type}data"][:],
                    index="matchid",
                    exclude=[
                        "ypos",
                        "xpos",
                        "mjd",
                        "psfflux",
                        "psffluxerr",
                        "magerrmodel",
                    ],
                )
                sourcedatas.rename(
                    columns={"ra": "ra_data", "dec": "dec_data"}, inplace=True
                )
                sourcedata_colnames = sourcedatas.columns.values

                # Join sources and their data
                merged = sources.merge(sourcedatas, left_index=True, right_index=True)

                prev_matchid = None
                current_doc = None
                for row in merged.itertuples():
                    matchid = row[0]
                    try:
                        # At a new source
                        if matchid != prev_matchid:
                            # Done with last source; save
                            if current_doc is not None:
                                current_doc = clean_up_document(current_doc)
                                docs_sources.append(current_doc)

                            # Set up new doc
                            doc = dict(row._asdict())
                            doc["matchid"] = doc["Index"]
                            doc.pop("Index")

                            # Coerce the source data info into its own nested array
                            first_data_row = {}
                            for col in sourcedata_colnames:
                                if col not in ["dec_data", "ra_data"]:
                                    first_data_row[col] = doc[col]
                                else:
                                    real_col = col.split("_data")[0]
                                    first_data_row[real_col] = doc[col]
                                doc.pop(col)
                            doc["data"] = [first_data_row]

                            current_doc = doc
                        # For continued source, just append new data row
                        else:
                            data_row = {}
                            data = dict(row._asdict())
                            for col in sourcedata_colnames:
                                if col not in ["dec_data", "ra_data"]:
                                    data_row[col] = data[col]
                                else:
                                    real_col = col.split("_data")[0]
                                    data_row[real_col] = data[col]
                            current_doc["data"].append(data_row)

                        prev_matchid = matchid

                    except Exception as exception:
                        log(str(exception))

                    # ingest in batches
                    try:
                        if (
                            len(docs_sources) % batch_size == 0
                            and len(docs_sources) != 0
                        ):
                            if not dry_run:
                                mongo.insert_many(
                                    collection=collections["sources"],
                                    documents=docs_sources,
                                )
                            # flush:
                            docs_sources = []
                            batch_num += 1
                    except Exception as exception:
                        log(str(exception))

                # Clean up and append the last doc
                if current_doc is not None:
                    current_doc = clean_up_document(current_doc)
                    docs_sources.append(current_doc)

        # ingest remaining
        while len(docs_sources) > 0:
            try:
                # In case mongo crashed and disconnected, docs will accumulate in documents
                # keep on trying to insert them until successful
                if not dry_run:
                    mongo.insert_many(
                        collection=collections["sources"], documents=docs_sources
                    )
                # flush:
                docs_sources = []
            except Exception as e:
                traceback.print_exc()
                log(e)
                log("Failed, waiting 5 seconds to retry")
                time.sleep(5)

        mongo.client.close()

    except Exception as e:
        traceback.print_exc()
        log(e)
        # if there was an error, return without potentially deleting the file
        return

    try:
        if rm_file:
            os.remove(file_name)
    finally:
        pass