def start_gathering(topics, filenames, seconds_between_calls=30):
    # Index for topics/filenames:
    # 0: news articles
    # 1: tweets
    articles_df = read_last_record(filenames[0], articles_export_columns)
    tweets_df = read_last_record(filenames[1], twitter_export_columns)

    # Publish date of last checked article
    if articles_df is None or len(articles_df) < 1:
        last_article_date = datetime.datetime.today() - tdelta(weeks=1)
    else:
        last_article_date = articles_df.loc[0, "date"]

    # ID of last checked tweet
    if tweets_df is None or len(tweets_df) < 1:
        last_tweet_id = 0
    else:
        last_tweet_id = tweets_df.loc[0, "id"]

    while True:
        now = datetime.datetime.today()

        new_article_date = gather_new_articles(topics[0], last_article_date, filenames[0])
        new_tweet_id = gather_new_tweets(topics[1], last_tweet_id, filenames[1])

        # Update tracking variables if newer records have been found
        if new_article_date is not None:
            last_article_date = new_article_date
        if new_tweet_id is not None:
            last_tweet_id = new_tweet_id

        sleep_until(now + tdelta(seconds=seconds_between_calls))
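# Usage sketch (an assumption, not part of the original module): topics and
# filenames are parallel lists, index 0 for news articles and index 1 for
# tweets. The query terms and file names below are placeholders.
def _example_start_gathering():
    # Blocks and polls both sources indefinitely, once per minute
    start_gathering(topics=["bitcoin", "#bitcoin"],
                    filenames=["articles.csv", "tweets.csv"],
                    seconds_between_calls=60)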
def harvest_articles(topic, articles_filename, seconds_between_calls):
    articles_per_call = 100

    articles_df = read_last_article(articles_filename)
    if articles_df is None or len(articles_df) < 1:
        last_checked = datetime.datetime.today() - tdelta(weeks=1)
    else:
        last_checked = articles_df.loc[0, "date"]

    while True:
        now = datetime.datetime.today()

        # Get new articles
        tmp_articles = search_articles(topic, articles_per_call)
        new_articles = tmp_articles[tmp_articles["date"] > last_checked]
        print("Found {} new articles.".format(len(new_articles)))

        # Skip if no new articles found
        if len(new_articles) == 0:
            sleep_until(now + tdelta(seconds=seconds_between_calls))
            continue

        # Sentiment
        new_articles["content"] = new_articles["url"].apply(get_content)
        new_articles[['sent_neg', 'sent_neu', 'sent_pos', 'sent_comp']] = \
            new_articles["content"].apply(get_sentiment)
        new_articles[["topic_freq", "topic_density"]] = \
            new_articles["content"].apply(get_topic_freq, args=(topic,))

        # Save articles
        export_articles(new_articles, articles_filename)
        print(new_articles[["date", "id", "sent_comp"]])

        last_checked = now
        sleep_until(now + tdelta(seconds=seconds_between_calls))
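# harvest_articles() assigns the result of get_topic_freq() to two columns, so
# that helper is expected to return a two-element pandas Series: a raw count of
# the topic term and its density relative to the article length. A minimal
# sketch under that assumption (illustrative only; the project's real helper
# may differ):
import pandas as pd

def get_topic_freq(content, topic):
    # Count case-insensitive occurrences of the topic term in the article body
    words = content.lower().split()
    count = sum(1 for word in words if topic.lower() in word)
    density = count / len(words) if words else 0.0
    return pd.Series([count, density], index=["topic_freq", "topic_density"])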
def harvest_tweets(topic, tweets_filename, seconds_between_calls):
    tweets_per_call = 100

    tweets_df = read_last_tweet(tweets_filename)
    if tweets_df is None or len(tweets_df) < 1:
        last_id = 0
    else:
        last_id = tweets_df.loc[0, "id"]

    while True:
        now = datetime.datetime.today()

        # Get new tweets
        tmp_tweets = search_tweets(topic, tweets_per_call)
        new_tweets = tmp_tweets[tmp_tweets["id"] > last_id]
        print("Found {} new tweets.".format(len(new_tweets)))

        # Skip if no new tweets found
        if len(new_tweets) == 0:
            sleep_until(now + tdelta(seconds=seconds_between_calls))
            continue

        # Sentiment
        new_tweets[['sent_neg', 'sent_neu', 'sent_pos', 'sent_comp']] = \
            new_tweets["text"].apply(get_sentiment)

        # Save tweets
        export_tweets(new_tweets, tweets_filename)
        print(new_tweets[["date", "id", "sent_comp"]])

        last_id = new_tweets["id"].max()
        sleep_until(now + tdelta(seconds=seconds_between_calls))
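# Both harvesters assign the result of get_sentiment() to four columns at once,
# which requires the helper to return a four-element pandas Series per text. A
# minimal sketch assuming NLTK's VADER analyzer (illustrative only; the project's
# real helper may differ, and the 'vader_lexicon' resource must be downloaded):
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

_vader = SentimentIntensityAnalyzer()

def get_sentiment(text):
    # polarity_scores() returns a dict with 'neg', 'neu', 'pos' and 'compound' keys
    scores = _vader.polarity_scores(text if isinstance(text, str) else "")
    return pd.Series(
        [scores["neg"], scores["neu"], scores["pos"], scores["compound"]],
        index=["sent_neg", "sent_neu", "sent_pos", "sent_comp"])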
def get_uframe_array(array_id,
                     out_dir=None,
                     exec_dpa=True,
                     urlonly=False,
                     alltimes=False,
                     deltatype='days',
                     deltaval=1,
                     provenance=False,
                     limit=True,
                     uframe_base=UFrame(),
                     file_format='netcdf'):
    """
    Download NetCDF / JSON files for the most recent 1-day worth of data for
    telemetered and recovered data streams for the specified array_id.

    Args:
        array_id: name of the array
        out_dir: top-level destination directory for writing NetCDF / JSON
            files. Defaults to the current working directory.
        exec_dpa: set to False to NOT execute L1/L2 data product algorithms
            prior to download. Defaults to True.

    Returns:
        fetched_urls: list of dictionaries containing the url, response code
            and reason
    """

    fetched_urls = []

    if deltatype not in _valid_relativedeltatypes:
        sys.stderr.write(
            'Invalid dateutil.relativedelta type: {:s}\n'.format(deltatype))
        sys.stderr.flush()
        return fetched_urls

    if not array_id:
        sys.stderr.write('Invalid array id specified\n')
        sys.stderr.flush()
        return fetched_urls

    if not urlonly and not out_dir:
        out_dir = os.path.realpath(os.curdir)

    if not urlonly and not os.path.exists(out_dir):
        sys.stdout.write('Creating output directory: {:s}\n'.format(out_dir))
        sys.stdout.flush()
        try:
            os.makedirs(out_dir)
        except OSError as e:
            sys.stderr.write(str(e))
            sys.stderr.flush()
            return fetched_urls

    # Make sure the array is in uFrame
    if not urlonly:
        sys.stdout.write('Fetching arrays ({:s})\n'.format(uframe_base))
        sys.stdout.flush()
    arrays = get_arrays(array_id=array_id, uframe_base=uframe_base)
    if not arrays:
        sys.stderr.write(
            'Array {:s} does not exist in uFrame\n'.format(array_id))
        sys.stderr.flush()
        return fetched_urls

    array = arrays[0]
    if not urlonly:
        sys.stdout.write('{:s}: Array exists...\n'.format(array))
        sys.stdout.flush()

    # Fetch the platforms on the array
    if not urlonly:
        sys.stdout.write(
            'Fetching array platforms ({:s})\n'.format(uframe_base))
        sys.stdout.flush()
    platforms = get_platforms(array, uframe_base=uframe_base)
    if not platforms:
        sys.stderr.write(
            '{:s}: No platforms found for specified array\n'.format(array))
        sys.stderr.flush()
        return fetched_urls

    if limit == True:
        limit = 10000  # limit to 10000 points
    else:
        limit = -1  # no limit

    for platform in platforms:
        p_name = '{:s}-{:s}'.format(array, platform)
        if not urlonly:
            sys.stdout.write(
                '{:s}: Fetching platform data sensors ({:s})\n'.format(
                    p_name, uframe_base))
            sys.stdout.flush()

        sensors = get_platform_sensors(array, platform,
                                       uframe_base=uframe_base)
        if not sensors:
            sys.stderr.write(
                '{:s}: No data sensors found for this platform\n'.format(
                    p_name))
            sys.stderr.flush()
            continue

        if not urlonly:
            sys.stdout.write('{:s}: {:d} sensors fetched\n'.format(
                p_name, len(sensors)))
            sys.stdout.flush()

        if not urlonly:
            sys.stdout.write(
                'Fetching platform sensors ({:s})\n'.format(uframe_base))
            sys.stdout.flush()

        for sensor in sensors:
            # Fetch sensor metadata
            meta = get_sensor_metadata(array, platform, sensor,
                                       uframe_base=uframe_base)
            if not meta:
                sys.stderr.write(
                    '{:s}: No metadata found for sensor: {:s}\n'.format(
                        p_name, sensor))
                sys.stderr.flush()
                continue

            for metadata in meta['times']:

                if alltimes:
                    ts0 = metadata['beginTime']
                    ts1 = metadata['endTime']
                else:
                    dt1 = parser.parse(metadata['endTime'])
                    if dt1.year < 2000:
                        sys.stderr.write(
                            '{:s}: Invalid metadata endTime: {:s}\n'.format(
                                p_name, metadata['endTime']))
                        sys.stderr.flush()
                        continue

                    dt0 = dt1 - tdelta(**dict({deltatype: deltaval}))
                    ts1 = metadata['endTime']
                    ts0 = '{:s}Z'.format(
                        dt0.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3])

                stream = metadata['stream']
                method = metadata['method']

                dest_dir = (os.path.join(out_dir, p_name, method)
                            if not urlonly else None)

                fetched_url = fetch_uframe_time_bound_stream(
                    uframe_base=uframe_base,
                    subsite=array,
                    node=platform,
                    sensor=sensor,
                    method=method,
                    stream=stream,
                    begin_datetime=ts0,
                    end_datetime=ts1,
                    file_format=file_format,
                    exec_dpa=exec_dpa,
                    urlonly=urlonly,
                    dest_dir=dest_dir,
                    provenance=provenance,
                    limit=str(limit))

                fetched_urls.append(fetched_url)

    return fetched_urls
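# Usage sketch (an assumption; 'CP01CNSM' below is a placeholder array id):
# build the request urls for the most recent two days of data without
# downloading any files.
def _example_fetch_recent_array_urls():
    fetched = get_uframe_array('CP01CNSM',
                               deltatype='days',
                               deltaval=2,
                               urlonly=True)
    for item in fetched:
        print(item)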
def instrument_to_query(self,
                        ref_des,
                        telemetry=None,
                        time_delta_type=None,
                        time_delta_value=None,
                        begin_ts=None,
                        end_ts=None,
                        time_check=True,
                        exec_dpa=True,
                        application_type='netcdf',
                        provenance=True,
                        limit=-1,
                        annotations=False,
                        user=None,
                        email=None):
    '''Return the list of request urls that conform to the UFrame API for the
    specified reference_designator.

    Parameters:
        ref_des: partial or fully-qualified reference designator
        telemetry: telemetry type (Default is all telemetry types)
        time_delta_type: type for calculating the subset start time, i.e.:
            years, months, weeks, days. Must be a type kwarg accepted by
            dateutil.relativedelta
        time_delta_value: positive integer value to subtract from the end
            time to get the start time for subsetting
        begin_ts: ISO-8601 formatted datestring specifying the dataset start
            time
        end_ts: ISO-8601 formatted datestring specifying the dataset end time
        exec_dpa: boolean value specifying whether to execute all data
            product algorithms to return L1/L2 parameters (Default is True)
        application_type: 'netcdf' or 'json' (Default is 'netcdf')
        provenance: boolean value specifying whether provenance information
            should be included in the data set (Default is True)
        limit: integer value ranging from -1 to 10000. A value of -1
            (default) results in a non-decimated dataset
        annotations: boolean value (True or False) specifying whether to
            include all dataset annotations
    '''

    urls = []

    instruments = self.search_instruments(ref_des)
    if not instruments:
        return urls

    self._port = 12576
    self._url = '{:s}:{:d}/sensor/inv'.format(self._base_url, self._port)

    if time_delta_type and time_delta_value:
        if time_delta_type not in _valid_relativedeltatypes:
            sys.stderr.write('Invalid dateutil.relativedelta type: {:s}\n'.format(time_delta_type))
            sys.stderr.flush()
            return urls

    begin_dt = None
    end_dt = None
    if begin_ts:
        try:
            begin_dt = parser.parse(begin_ts)
        except ValueError as e:
            sys.stderr.write('Invalid begin_dt: {:s} ({:s})\n'.format(begin_ts, e.message))
            sys.stderr.flush()
            return urls

    if end_ts:
        try:
            end_dt = parser.parse(end_ts)
        except ValueError as e:
            sys.stderr.write('Invalid end_dt: {:s} ({:s})\n'.format(end_ts, e.message))
            sys.stderr.flush()
            return urls

    for instrument in instruments:

        # Validate the reference designator format
        if not self.validate_reference_designator(instrument):
            sys.stderr.write('Invalid format for reference designator: {:s}\n'.format(instrument))
            sys.stderr.flush()
            continue

        # Store the metadata for this instrument
        meta = self.toc[instrument]

        # Break the reference designator up
        r_tokens = instrument.split('-')

        for stream in meta['streams']:

            if telemetry and stream['method'].find(telemetry) == -1:
                continue

            # Figure out what we're doing for time
            dt0 = None
            dt1 = None
            stream_dt0 = parser.parse(stream['beginTime'])
            stream_dt1 = parser.parse(stream['endTime'])

            if time_delta_type and time_delta_value:
                dt1 = stream_dt1
                dt0 = dt1 - tdelta(**dict({time_delta_type: time_delta_value}))
            else:
                if begin_dt:
                    dt0 = begin_dt
                else:
                    dt0 = stream_dt0

                if end_dt:
                    dt1 = end_dt
                else:
                    dt1 = stream_dt1

            # Format the endDT and beginDT values for the query
            try:
                ts1 = dt1.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
            except ValueError as e:
                sys.stderr.write('{:s}-{:s}: {:s}\n'.format(instrument, stream['stream'], e.message))
                continue

            try:
                ts0 = dt0.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
            except ValueError as e:
                sys.stderr.write('{:s}-{:s}: {:s}\n'.format(instrument, stream['stream'], e.message))
                continue

            # Make sure the specified or calculated start and end times are
            # within the stream metadata times if time_check=True
            if time_check:
                if dt1 > stream_dt1:
                    sys.stderr.write('time_check ({:s}): End time exceeds stream endTime ({:s} > {:s})\n'.format(stream['stream'], ts1, stream['endTime']))
                    sys.stderr.write('time_check ({:s}): Setting request end time to stream endTime\n'.format(stream['stream']))
                    sys.stderr.flush()
                    ts1 = stream['endTime']

                if dt0 < stream_dt0:
                    sys.stderr.write('time_check ({:s}): Start time is earlier than stream beginTime ({:s} < {:s})\n'.format(stream['stream'], ts0, stream['beginTime']))
                    sys.stderr.write('time_check ({:s}): Setting request begin time to stream beginTime\n'.format(stream['stream']))
                    sys.stderr.flush()
                    ts0 = stream['beginTime']

            # Check that ts0 < ts1
            dt0 = parser.parse(ts0)
            dt1 = parser.parse(ts1)
            if dt0 >= dt1:
                sys.stderr.write('{:s}: Invalid time range specified ({:s} >= {:s})\n'.format(stream['stream'], ts0, ts1))
                continue

            # Create the url
            stream_url = '{:s}/{:s}/{:s}/{:s}-{:s}/{:s}/{:s}?beginDT={:s}&endDT={:s}&format=application/{:s}&limit={:d}&execDPA={:s}&include_provenance={:s}'.format(
                self.url,
                r_tokens[0],
                r_tokens[1],
                r_tokens[2],
                r_tokens[3],
                stream['method'],
                stream['stream'],
                ts0,
                ts1,
                application_type,
                limit,
                str(exec_dpa).lower(),
                str(provenance).lower())

            if user:
                stream_url = '{:s}&user={:s}'.format(stream_url, user)

            if email:
                stream_url = '{:s}&email={:s}'.format(stream_url, email)

            urls.append(stream_url)

    return urls
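# Usage sketch (an assumption): 'client' is an instance of the class that
# defines instrument_to_query() above, and the reference designator is a
# placeholder.
def _example_instrument_query(client):
    # Request urls covering the most recent week of telemetered data as NetCDF
    urls = client.instrument_to_query('CP01CNSM-SBD11-06-METBKA000',
                                      telemetry='telemetered',
                                      time_delta_type='weeks',
                                      time_delta_value=1,
                                      application_type='netcdf')
    for url in urls:
        print(url)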
def get_uframe_array(array_id,
                     out_dir=None,
                     exec_dpa=True,
                     urlonly=False,
                     deltatype='days',
                     deltaval=1,
                     provenance=False,
                     limit=True,
                     uframe_base=UFrame(),
                     file_format='netcdf'):
    """
    Download NetCDF / JSON files for the most recent 1-day worth of data for
    telemetered and recovered data streams for the specified array_id.

    Args:
        array_id: name of the array
        out_dir: top-level destination directory for writing NetCDF / JSON
            files. Defaults to the current working directory.
        exec_dpa: set to False to NOT execute L1/L2 data product algorithms
            prior to download. Defaults to True.

    Returns:
        fetched_urls: list of dictionaries containing the url, response code
            and reason
    """

    fetched_urls = []

    if deltatype not in _valid_relativedeltatypes:
        sys.stderr.write('Invalid dateutil.relativedelta type: {:s}\n'.format(deltatype))
        sys.stderr.flush()
        return fetched_urls

    if not array_id:
        sys.stderr.write('Invalid array id specified\n')
        sys.stderr.flush()
        return fetched_urls

    if not urlonly and not out_dir:
        out_dir = os.path.realpath(os.curdir)

    if not urlonly and not os.path.exists(out_dir):
        sys.stdout.write('Creating output directory: {:s}\n'.format(out_dir))
        sys.stdout.flush()
        try:
            os.makedirs(out_dir)
        except OSError as e:
            sys.stderr.write(str(e))
            sys.stderr.flush()
            return fetched_urls

    # Make sure the array is in uFrame
    if not urlonly:
        sys.stdout.write('Fetching arrays ({:s})\n'.format(uframe_base))
        sys.stdout.flush()
    arrays = get_arrays(array_id=array_id, uframe_base=uframe_base)
    if not arrays:
        sys.stderr.write('Array {:s} does not exist in uFrame\n'.format(array_id))
        sys.stderr.flush()
        return fetched_urls

    array = arrays[0]
    if not urlonly:
        sys.stdout.write('{:s}: Array exists...\n'.format(array))
        sys.stdout.flush()

    # Fetch the platforms on the array
    if not urlonly:
        sys.stdout.write('Fetching array platforms ({:s})\n'.format(uframe_base))
        sys.stdout.flush()
    platforms = get_platforms(array, uframe_base=uframe_base)
    if not platforms:
        sys.stderr.write('{:s}: No platforms found for specified array\n'.format(array))
        sys.stderr.flush()
        return fetched_urls

    if limit == True:
        limit = 10000  # limit to 10000 points
    else:
        limit = -1  # no limit

    for platform in platforms:
        p_name = '{:s}-{:s}'.format(array, platform)
        if not urlonly:
            sys.stdout.write('{:s}: Fetching platform data sensors ({:s})\n'.format(p_name, uframe_base))
            sys.stdout.flush()

        sensors = get_platform_sensors(array, platform, uframe_base=uframe_base)
        if not sensors:
            sys.stderr.write('{:s}: No data sensors found for this platform\n'.format(p_name))
            sys.stderr.flush()
            continue

        if not urlonly:
            sys.stdout.write('{:s}: {:d} sensors fetched\n'.format(p_name, len(sensors)))
            sys.stdout.flush()

        if not urlonly:
            sys.stdout.write('Fetching platform sensors ({:s})\n'.format(uframe_base))
            sys.stdout.flush()

        for sensor in sensors:
            # Fetch sensor metadata
            meta = get_sensor_metadata(array, platform, sensor, uframe_base=uframe_base)
            if not meta:
                sys.stderr.write('{:s}: No metadata found for sensor: {:s}\n'.format(p_name, sensor))
                sys.stderr.flush()
                continue

            for metadata in meta['times']:

                dt1 = parser.parse(metadata['endTime'])
                if dt1.year < 2000:
                    sys.stderr.write('{:s}: Invalid metadata endTime: {:s}\n'.format(p_name, metadata['endTime']))
                    sys.stderr.flush()
                    continue

                dt0 = dt1 - tdelta(**dict({deltatype: deltaval}))
                ts1 = metadata['endTime']
                ts0 = dt0.strftime('%Y-%m-%dT%H:%M:%S.%fZ')

                stream = metadata['stream']
                method = metadata['method']

                dest_dir = (os.path.join(out_dir, p_name, method)
                            if not urlonly else None)

                fetched_url = fetch_uframe_time_bound_stream(
                    uframe_base=uframe_base,
                    subsite=array,
                    node=platform,
                    sensor=sensor,
                    method=method,
                    stream=stream,
                    begin_datetime=ts0,
                    end_datetime=ts1,
                    file_format=file_format,
                    exec_dpa=exec_dpa,
                    urlonly=urlonly,
                    dest_dir=dest_dir,
                    provenance=provenance,
                    limit=str(limit))

                fetched_urls.append(fetched_url)

    return fetched_urls
def instrument_to_query(self,
                        ref_des,
                        user,
                        stream=None,
                        telemetry=None,
                        time_delta_type=None,
                        time_delta_value=None,
                        begin_ts=None,
                        end_ts=None,
                        time_check=True,
                        exec_dpa=True,
                        application_type='netcdf',
                        provenance=True,
                        limit=-1,
                        annotations=False,
                        email=None):
    """Return the list of request urls that conform to the UFrame API for the
    specified fully or partially-qualified reference_designator. Request urls
    are formatted for either the UFrame m2m API (default) or direct UFrame
    access, depending on the is_m2m property of the UFrameClient instance.

    Arguments:
        ref_des: partial or fully-qualified reference designator
        stream: restrict urls to the specified stream
        user: user name for the query

    Optional kwargs:
        telemetry: telemetry type (Default is all telemetry types)
        time_delta_type: type for calculating the subset start time, i.e.:
            years, months, weeks, days. Must be a type kwarg accepted by
            dateutil.relativedelta
        time_delta_value: positive integer value to subtract from the end
            time to get the start time for subsetting
        begin_ts: ISO-8601 formatted datestring specifying the dataset start
            time
        end_ts: ISO-8601 formatted datestring specifying the dataset end time
        time_check: set to True (default) to ensure the request times fall
            within the stream data availability
        exec_dpa: boolean value specifying whether to execute all data
            product algorithms to return L1/L2 parameters (Default is True)
        application_type: 'netcdf' or 'json' (Default is 'netcdf')
        provenance: boolean value specifying whether provenance information
            should be included in the data set (Default is True)
        limit: integer value ranging from -1 to 10000. A value of -1
            (default) results in a non-decimated dataset
        annotations: boolean value (True or False) specifying whether to
            include all dataset annotations
    """

    urls = []

    instruments = self.search_instruments(ref_des)
    if not instruments:
        return urls

    if time_delta_type and time_delta_value:
        if time_delta_type not in _valid_relativedeltatypes:
            self._logger.error(
                'Invalid dateutil.relativedelta type: {:s}'.format(
                    time_delta_type))
            return urls

    begin_dt = None
    end_dt = None
    if begin_ts:
        try:
            begin_dt = parser.parse(begin_ts).replace(tzinfo=pytz.UTC)
        except ValueError as e:
            self._logger.error('Invalid begin_dt: {:s} ({:s})'.format(
                begin_ts, e.message))
            return urls

    if end_ts:
        try:
            end_dt = parser.parse(end_ts).replace(tzinfo=pytz.UTC)
        except ValueError as e:
            self._logger.error('Invalid end_dt: {:s} ({:s})'.format(
                end_ts, e.message))
            return urls

    for instrument in instruments:

        # Get the streams produced by this instrument
        instrument_streams = self.fetch_instrument_streams(instrument)
        if not instrument_streams:
            self._logger.info(
                'No streams found for {:s}'.format(instrument))
            continue

        if stream:
            stream_names = [s['stream'] for s in instrument_streams]
            if stream not in stream_names:
                self._logger.warning('Invalid stream: {:s}-{:s}'.format(
                    instrument, stream))
                continue

            instrument_streams = [
                s for s in instrument_streams if s['stream'] == stream
            ]

        if not instrument_streams:
            self._logger.info('{:s}: No streams found'.format(instrument))
            continue

        # Break the reference designator up
        r_tokens = instrument.split('-')

        for instrument_stream in instrument_streams:

            if telemetry and not instrument_stream['method'].startswith(
                    telemetry):
                continue

            # Figure out what we're doing for time
            try:
                stream_dt0 = parser.parse(instrument_stream['beginTime'])
            except ValueError:
                self._logger.error(
                    '{:s}-{:s}: Invalid beginTime ({:s})'.format(
                        instrument, instrument_stream['stream'],
                        instrument_stream['beginTime']))
                continue

            try:
                stream_dt1 = parser.parse(instrument_stream['endTime'])
                # Add 1 second to the stream end time to account for milliseconds
                stream_dt1 = stream_dt1 + tdelta(seconds=1)
            except ValueError:
                self._logger.error(
                    '{:s}-{:s}: Invalid endTime ({:s})'.format(
                        instrument, instrument_stream['stream'],
                        instrument_stream['endTime']))
                continue

            if time_delta_type and time_delta_value:
                dt1 = stream_dt1
                dt0 = dt1 - tdelta(
                    **dict({time_delta_type: time_delta_value}))
            else:
                if begin_dt:
                    dt0 = begin_dt
                else:
                    dt0 = stream_dt0

                if end_dt:
                    dt1 = end_dt
                else:
                    dt1 = stream_dt1

            # Format the endDT and beginDT values for the query
            try:
                ts1 = dt1.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
            except ValueError as e:
                self._logger.error('{:s}-{:s}: {:s}'.format(
                    instrument, instrument_stream['stream'], e.message))
                continue

            try:
                ts0 = dt0.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
            except ValueError as e:
                self._logger.error('{:s}-{:s}: {:s}'.format(
                    instrument, instrument_stream['stream'], e.message))
                continue

            # Make sure the specified or calculated start and end times are
            # within the stream metadata times if time_check=True
            if time_check:
                if dt1 > stream_dt1:
                    self._logger.warning(
                        '{:s}-{:s} time check - End time exceeds stream endTime'
                        .format(ref_des, instrument_stream['stream']))
                    self._logger.warning(
                        '{:s}-{:s} time check - Setting request end time to stream endTime'
                        .format(ref_des, instrument_stream['stream']))
                    ts1 = instrument_stream['endTime']

                if dt0 < stream_dt0:
                    self._logger.warning(
                        '{:s}-{:s} time check - Start time is earlier than stream beginTime'
                        .format(ref_des, instrument_stream['stream']))
                    self._logger.warning(
                        '{:s}-{:s} time check - Setting request begin time to stream beginTime'
                        .format(ref_des, instrument_stream['stream']))
                    ts0 = instrument_stream['beginTime']

            # Check that ts0 < ts1
            dt0 = parser.parse(ts0)
            dt1 = parser.parse(ts1)
            if dt0 >= dt1:
                self._logger.warning(
                    '{:s}-{:s} - Invalid time range specified'.format(
                        instrument, instrument_stream['stream']))
                continue

            # Create the url
            end_point = 'sensor/inv/{:s}/{:s}/{:s}-{:s}/{:s}/{:s}?beginDT={:s}&endDT={:s}&format=application/{:s}&limit={:d}&execDPA={:s}&include_provenance={:s}&user={:s}'.format(
                r_tokens[0], r_tokens[1], r_tokens[2], r_tokens[3],
                instrument_stream['method'], instrument_stream['stream'],
                ts0, ts1, application_type, limit,
                str(exec_dpa).lower(), str(provenance).lower(), user)

            if email:
                end_point = '{:s}&email={:s}'.format(end_point, email)

            urls.append(self.build_request(12576, end_point))

    return urls
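# Usage sketch (an assumption): 'client' is a UFrameClient-style instance
# exposing the instrument_to_query() method above; the reference designator,
# stream name, user and e-mail below are placeholders.
def _example_m2m_instrument_query(client):
    # Restrict the request to the last two days of a single stream and let
    # time_check clamp the interval to the stream's availability
    urls = client.instrument_to_query('CP01CNSM-SBD11-06-METBKA000',
                                      user='ooi-user',
                                      stream='metbk_a_dcl_instrument',
                                      time_delta_type='days',
                                      time_delta_value=2,
                                      time_check=True,
                                      email='user@example.com')
    for url in urls:
        print(url)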