def sampleWordIDs(numSamples=1, wordLength=5):
    # TODO: Implement limits:
    # 1) data have same periodicity
    # 2) data come from a certain supertype
    # 3) data come from a certain subtype
    # 4) data come from a certain type
    # 5) data come from certain data_master_ids
    # 6) data come from differing data_master_ids (i.e. no two words share a data_master_id)
    allIDs = EM_DBHelp.retrieve_AllWordSeriesIDs(db_connection, db_cursor)
    words = []
    for i in range(numSamples):
        words.append(sample(allIDs, wordLength))

    wordSeriesIDs = {ID for word in words for ID in word}

    # columnName is resolved at call time (late binding), so reassigning it below
    # changes which metadata column this fetches
    getFromDB = lambda seriesID: EM_DBHelp.retrieve_WordSeriesMetaData(
        db_connection, db_cursor, columnName, seriesID=seriesID
    )

    columnName = "dt_latest_word"
    dates = [getFromDB(ID) for ID in wordSeriesIDs]
    latestDate = min(dates)

    columnName = "dt_earliest_word"
    dates = [getFromDB(ID) for ID in wordSeriesIDs]
    earliestDate = max(dates)

    columnName = "int_data_master_id"
    dataSeriesIDs = [getFromDB(ID) for ID in wordSeriesIDs]

    return (words, wordSeriesIDs, dataSeriesIDs, earliestDate, latestDate)
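# Hypothetical usage sketch (function name, sample counts, and print output are
# illustrative, not part of the original module): sample a few words, then build
# the aligned history for each over the shared date window, assuming
# db_connection and db_cursor are initialized module globals.
def _sketch_sampleAndGenerateWords():
    (words, wordSeriesIDs, dataSeriesIDs, earliestDate, latestDate) = sampleWordIDs(
        numSamples=3, wordLength=5
    )
    for word in words:
        (dates, wordValues) = generateWordsFromIndices(word, earliestDate, latestDate)
        print('Word %s spans %d observations' % (word, len(dates)))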
def downloadDataFromFred_fillHistory_parseSeriesData(dataSeriesTicker, data):
	dataSeriesID =	EM_DBHelp.retrieve_DataSeriesID(db_connection, db_cursor, 
													dataTicker=dataSeriesTicker, 
													insertIfNot=True)
	sendToDB = lambda: EM_DBHelp.update_DataSeriesMetaData(
		db_connection, db_cursor, columnName, value, seriesID=dataSeriesID)

	successfulInserts = 0
	unsuccessfulInserts = 0
	minObsDate = maxint
	maxObsDate = -maxint-1
	# Update History
	for obs in data['observations']:
		if obs['value'] == '.':
			continue # Avoid Unfilled FRED Values
		date = EM_util.dtConvert_YYYY_MM_DDtoEpoch(obs['date'])
		value = float(obs['value'])
		maxObsDate = date if date > maxObsDate else maxObsDate
		minObsDate = date if date < minObsDate else minObsDate
		log.info('\t\tWriting Historical Data Point at %s for %s [value = %f]', obs['date'], dataSeriesTicker, value)
		(success, error) = EM_DBHelp.insertDataPoint_DataHistoryTable(	
													db_connection, db_cursor,  
													dataSeriesID, 
													date, 
													value, 
													isInterpolated=False)
		if not success:
			log.warning('\t\tFailed to Write Historical Data Point at %s for %s [value = %f]', obs['date'], dataSeriesTicker, value)
			unsuccessfulInserts += 1
		else:
			successfulInserts +=1
	
	log.info('\t%d Historical Data Points Written Successfully for %s', successfulInserts, dataSeriesTicker)
	log.info('\t%d Historical Data Point Writes Failed for %s', unsuccessfulInserts, dataSeriesTicker)
	log.info('\tDate range from %s to %s Written for %s', EM_util.dtConvert_EpochtoY_M_D(minObsDate), EM_util.dtConvert_EpochtoY_M_D(maxObsDate), dataSeriesTicker)

	# Update Successful Inserts Flag
	columnName = 'int_unsuccessful_inserts'
	value = unsuccessfulInserts
	sendToDB()

	# Update Earliest DataPoint Found
	columnName = 'dt_earliest_value'
	value = minObsDate
	sendToDB()

	# Update Latest DataPoint Found
	columnName = 'dt_latest_value'
	value = maxObsDate
	sendToDB()

	# Update Last Update Date
	columnName = 'dt_last_updated_history'
	value = EM_util.dtGetNowAsEpoch()
	sendToDB()
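# For reference, the parser above consumes a decoded FRED series/observations
# payload of roughly this shape; the values are illustrative, and unfilled
# observations arrive as the string '.':
#   data = {'observations': [
#       {'date': '2008-01-01', 'value': '1234.5'},
#       {'date': '2008-02-01', 'value': '.'},      # unfilled, skipped above
#       {'date': '2008-03-01', 'value': '1250.0'},
#   ]}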
	def insertWords(self, dates, values):
		assert len(dates) == len(values)
		assert self.wordSeriesID is not None

		minDate = maxint
		maxDate = -maxint - 1

		successfulInserts = 0
		unsuccessfulInserts = 0

		for i in range(len(values)):
			minDate = min(minDate, dates[i])
			maxDate = max(maxDate, dates[i])

			(success, error) = EM_DBHelp.insertDataPoint_WordHistoryTable(	self.db_conn, self.db_curs, 
																			self.wordSeriesID, 
																			dates[i], 
																			values[i])
			if not success:
				log.warning('Failed to Write Historical Word at %s for %s [value = %f]', dates[i], self.wordSeriesID, values[i])
				unsuccessfulInserts += 1
			else:
				successfulInserts +=1
	
		log.debug('%d Historical Words Written Successfully for %s', successfulInserts, self.wordSeriesID)
		log.debug('%d Historical Word Writes Failed for %s', unsuccessfulInserts, self.wordSeriesID)
		log.debug('Date range from %s to %s Written for %s', EM_util.dtConvert_EpochtoY_M_D(minDate), EM_util.dtConvert_EpochtoY_M_D(maxDate), self.wordSeriesID)

		self.__sendToDB('int_unsuccessful_generations', unsuccessfulInserts)
		self.__sendToDB('dt_earliest_word', minDate)
		self.__sendToDB('dt_latest_word', maxDate)
		self.__sendToDB('dt_last_generated', EM_util.dtGetNowAsEpoch())
	def findAndStoreDataSeries(self, ticker):
		assert self.dataSeriesID is None # Force reset before setting (Or new object)

		self.dataSeriesID = EM_DBHelp.retrieve_DataSeriesID(	self.db_conn, self.db_curs, 
																dataTicker=ticker, 
																insertIfNot=False)
		self.dataPeriodicity = self.__getFromDB('int_data_periodicity')
def convertDataHistoryToJSON(ticker, dataSeriesID=None):
	global file_format

	if dataSeriesID is None:
		dataSeriesID = EM_DBHelp.retrieve_DataSeriesID(	db_conn, db_curs, 
														dataTicker=ticker, 
														insertIfNot=False)

	dataSeries = EM_DBHelp.getCompleteDataHistory_DataHistoryTable(	db_conn, db_curs, dataSeriesID)

	if file_format[0] == 'rs':
		convertValue = lambda (dt, vl): {'x': dt, 'y': float(vl)}
	elif file_format[0] == 'd3':
		convertValue = lambda (dt, vl): {'date': dtConvert_EpochtoY_M_D(dt), 'value': float(vl)}
	else:
		raise Exception('File Format Type not recognized for date/value JSON')
	return map(convertValue, dataSeries)
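# For illustration, a single (epoch, value) pair such as (1199145600, 1234.5)
# converts as follows (the format codes presumably target Rickshaw-style 'rs'
# and d3-style 'd3' consumers):
#   'rs' -> {'x': 1199145600, 'y': 1234.5}
#   'd3' -> {'date': dtConvert_EpochtoY_M_D(1199145600), 'value': 1234.5}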
	def findAndStoreWordSeries(self, wordTicker, wordSubType, wordType, wordSuperType, insertIfNot=True):
		assert self.wordSeriesID is None  # Force reset before setting (Or new object)
		assert self.dataSeriesID is not None

		self.wordSeriesID = EM_DBHelp.retrieve_WordSeriesID(	self.db_conn, self.db_curs, 
																self.dataSeriesID, wordType, wordSubType, wordSuperType,
																wordTicker=wordTicker, 
																insertIfNot=insertIfNot)
		self.__sendToDB('int_word_periodicity', self.dataPeriodicity)
def generateWordsFromIndices(wordSeriesIDs, earliestDate, latestDate):
    wordSize = len(wordSeriesIDs)
    # Get Data Series
    words = None
    dates = None
    for i in range(wordSize):
        data = EM_DBHelp.getWordHistory_WordHistoryTable(
            db_connection, db_cursor, wordSeriesIDs[i], beforeDate=latestDate, afterDate=earliestDate
        )
        dt = np.dtype("int,float")
        data = np.array(data, dtype=dt)
        data.dtype.names = ["dates", "values"]
        if words is None:
            words = np.empty((len(data), wordSize))
            dates = data["dates"]
        if not (data["dates"] == dates).all():
            raise Exception("Dates for different series do not match up")
        words[:, i] = data["values"]
    # Tuplefy words: convert each row to a tuple so every word is hashable
    words = [tuple(row) for row in words]
    # Return
    return (dates, words)
def writeDataHistorytoJSON(ticker):
	dataSeriesID = EM_DBHelp.retrieve_DataSeriesID(	db_conn, db_curs, 
													dataTicker=ticker, 
													insertIfNot=False)
	
	data_JSON = convertDataHistoryToJSON(ticker, dataSeriesID=dataSeriesID)

	columnName = 'dt_earliest_value'
	earliestData = EM_DGUtil.getFromDB()
	columnName = 'dt_latest_value'
	latestData = EM_DGUtil.getFromDB()
	columnName = 'dt_last_updated_history'
	lastUpdate = EM_DGUtil.getFromDB()

	writeFile = generateDateHistoryFilename(ticker, earliestData, latestData, lastUpdate)
	updateAvailableSeriesDataFile(ticker, earliestData, latestData, lastUpdate, dataFileName=writeFile)

	with open(writeFile, 'wb') as writer:
		writer.write(json_dump(data_JSON))
	def getDataHistory(self, dataType='float'):
		dataSeries = EM_DBHelp.getCompleteDataHistory_DataHistoryTable(self.db_conn, self.db_curs, self.dataSeriesID)
		dt = np.dtype('int,'+dataType) # Format is, e.g.  'int,int' or 'int,float'
		dataSeries = np.array(dataSeries, dtype=dt)
		dataSeries.dtype.names = ['dates','values']		
		return dataSeries
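	# Hypothetical usage of the structured array returned above, assuming a
	# populated EMF_DataGenerator_Handle instance named handle:
	#   history = handle.getDataHistory(dataType='float')
	#   firstEpoch = history['dates'][0]        # epoch timestamps (ints)
	#   meanValue  = history['values'].mean()   # observation values (floats)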
firstOrderDiffPeriodOverride = []

def findFirstOrderDifferences(data, differenceLength):
	firstOrderDiffData = data[differenceLength:] - data[:-differenceLength]
	return findNormalDistRange(firstOrderDiffData)

def findNormalDistRange(data):
	# Convert each observation to a z-score, then truncate it toward zero:
	# round(z - 0.5*sign(z)) sends |z| < 1 to 0, 1 <= |z| < 2 to +/-1, and so on
	func_round = np.vectorize(round)
	mean = np.mean(data)
	stds = np.std(data)
	stdVols = (data - mean)/stds
	return func_round(stdVols - 0.5*np.sign(stdVols))
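# Worked example (illustrative values) of the bucketing performed by
# findNormalDistRange, once the data have been standardized to z-scores:
#   z = np.array([-2.4, -1.2, 0.0, 0.7, 1.2, 2.4])
#   np.vectorize(round)(z - 0.5*np.sign(z))  ->  [-2., -1., 0., 0., 1., 2.]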


# Module-level helpers: columnName, value, and the series IDs are globals that
# callers set immediately before each invocation
getFromDB = lambda: EM_DBHelp.retrieve_DataSeriesMetaData(db_conn, db_curs, columnName, seriesID=dataSeriesID)
sendToDB = lambda: EM_DBHelp.update_WordSeriesMetaData(db_conn, db_curs, columnName, value, seriesID=wordSeriesID)

class EMF_DataGenerator_Handle:
	def __init__(self, db_connection, db_cursor):
		self.db_conn = db_connection
		self.db_curs = db_cursor
		self.wordSeriesID = None
		self.dataSeriesID = None

	def __getFromDB(self, columnName):
		return EM_DBHelp.retrieve_DataSeriesMetaData(self.db_conn, self.db_curs, columnName, seriesID=self.dataSeriesID)

	def __sendToDB(self, columnName, value):
		return EM_DBHelp.update_WordSeriesMetaData(self.db_conn, self.db_curs, columnName, value, seriesID=self.wordSeriesID)
def downloadDataFromFred(	csvFileName=EM_util.FREDSeriesCSV, 
							fillHistory=True, 
							fillMetadata=True, 
							fillUserData=True, 
							minImportance=2, 
							writeEveryRow=True,
							pause=False):
	# Access Global Variables
	global db_cursor
	global db_connection

	# Establish Helpful Lambdas
	sendToDB = lambda: EM_DBHelp.update_DataSeriesMetaData(
		db_connection, db_cursor, columnName, value, seriesID=dataSeriesID)

	# Read CSV file
	log.info('Accessing CSV to get Series Tickers...')
	with open(csvFileName, 'rU') as csvfile:
		series_csv = csv.reader(csvfile)
		header = [next(series_csv)] # Capture the CSV header so it can be written back out later
		write_list = [list(row) for row in series_csv]
		log.info('Downloading FRED data to database...')
		for i in range(len(write_list)):
			# Recognize End of File without Reaching Deprecated
			if write_list[i][FREDCol['TICKER_COL']] == '':
				break
			if int(write_list[i][FREDCol['IMPORTANCE_COL']]) > minImportance:
				continue
			lastHistoryDownload = 	datetime.datetime.strptime(
									write_list[i][FREDCol['HISTORY_TIMESTAMP_COL']],
									EM_util.FREDSeriesCSVTimeFormat)
			dnldHistory = 	((fillHistory and \
							(datetime.datetime.now()-lastHistoryDownload) > EM_util.FREDDownloadDelayHistory) or \
							write_list[i][FREDCol['FORCE_HISTORY_REDOWNLOAD_COL']] == EM_util.FREDForceDownload)  and \
							write_list[i][FREDCol['FORCE_HISTORY_REDOWNLOAD_COL']] != EM_util.FREDSkipDownload

			lastMetadataDownload = 	datetime.datetime.strptime(
									write_list[i][FREDCol['METADATA_TIMESTAMP_COL']],
									EM_util.FREDSeriesCSVTimeFormat)
			dnldMetadata = 	((fillMetadata and\
							(datetime.datetime.now()-lastMetadataDownload) > EM_util.FREDDownloadDelayMetadata) or \
							write_list[i][FREDCol['FORCE_METADATA_REDOWNLOAD_COL']] == EM_util.FREDForceDownload)  and \
							write_list[i][FREDCol['FORCE_METADATA_REDOWNLOAD_COL']] != EM_util.FREDSkipDownload

			ticker = write_list[i][FREDCol['TICKER_COL']]
			log.info('Downloading %s data to database...', ticker)
			if dnldHistory:
				status = downloadDataFromFred_fillHistory(ticker)
				write_list[i][FREDCol['HISTORY_STATUS_COL']] = status
				write_list[i][FREDCol['HISTORY_TIMESTAMP_COL']] = \
					datetime.datetime.now().strftime(EM_util.FREDSeriesCSVTimeFormat)
			if dnldMetadata:
				status = downloadDataFromFred_fillMetadata(ticker)
				write_list[i][FREDCol['METADATA_STATUS_COL']] = status
				write_list[i][FREDCol['METADATA_TIMESTAMP_COL']] = \
					datetime.datetime.now().strftime(EM_util.FREDSeriesCSVTimeFormat)
			# TODO: Decide when to fillUserData
			if fillUserData:
				# We seek the Series ID again to make sure it was input correctly earlier
				# (Rather than just passing it back from earlier)	
				dataSeriesID = EM_DBHelp.retrieve_DataSeriesID(db_connection, db_cursor, 
																dataTicker=ticker, 
																insertIfNot=False)
				columnName = 'code_data_subtype'
				csvVal = write_list[i][FREDCol['SUBTYPE_COL']]
				if csvVal == '': csvVal = 'unknown'			
				value = int(EM_util.dataSubtypes[csvVal])
				sendToDB()
				
				columnName = 'code_data_type'
				csvVal = write_list[i][FREDCol['TYPE_COL']]
				if csvVal == '': csvVal = 'unknown'
				value = int(EM_util.dataTypes[csvVal])
				sendToDB()
				
				columnName = 'code_data_supertype'
				csvVal = write_list[i][FREDCol['SUPERTYPE_COL']]
				if csvVal == '': csvVal = 'unknown'
				value = int(EM_util.dataSupertypes[csvVal])
				sendToDB()

				columnName = 'code_is_level'
				csvVal = write_list[i][FREDCol['IS_LEVEL_COL']]
				if csvVal == '': csvVal = 'unknown'
				value = int(EM_util.levelMattersTypes[csvVal])
				sendToDB()

				columnName = 'code_good_direction'
				csvVal = write_list[i][FREDCol['POSITIVE_IS_GOOD_COL']]
				if csvVal == '': csvVal = 'unknown'
				value = int(EM_util.goodDirectionTypes[csvVal])
				sendToDB()
			# TODO: This is hacky. Do better.
			if writeEveryRow:
				with open(csvFileName[:-4]+'_temp.csv', 'wb') as milestone_csvfile:
					log.info('Updating Series CSV File...')
					csvwrite = csv.writer(milestone_csvfile)
					csvwrite.writerows(header)
					csvwrite.writerows(write_list)
					log.info('CSV file updated...')

			# For Testing... TODO: Remove
			if pause:
				raw_input('\nPress Enter to Continue...\n')

		log.info('Downloaded Series data successfully...')

	with open(csvFileName, 'wb') as csvfile:
		log.info('Updating Series CSV File...')
		csvwrite = csv.writer(csvfile)
		csvwrite.writerows(header)
		csvwrite.writerows(write_list)
		log.info('CSV file updated...')
def downloadDataFromFred_fillMetadata_parseSeriesData(dataSeriesTicker, data):
	#Get Data Series ID
	dataSeriesID =	EM_DBHelp.retrieve_DataSeriesID(db_connection, db_cursor, 
													dataTicker=dataSeriesTicker, 
													insertIfNot=True)
	sendToDB = lambda: EM_DBHelp.update_DataSeriesMetaData(
		db_connection, db_cursor, columnName, value, seriesID=dataSeriesID)

	# Update Data Series Name
	columnName = 'txt_data_name'
	value = data['title'].encode('ascii', 'ignore')
	sendToDB()

	columnName = 'txt_data_source'
	value = 'FRED'
	sendToDB()

	# Update Data Series Periodicity
	dataFrequency = data['frequency_short']
	if dataFrequency == 'M':
		value = 12
	elif dataFrequency == 'Q':
		value = 4
	elif dataFrequency == 'A':
		value = 1
	elif dataFrequency == 'W':
		value = 52		
	elif dataFrequency == 'D':
		value = 365
	else:
		raise NotImplementedError('Data Frequency not recognized')
	columnName = 'int_data_periodicity'
	sendToDB()
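	# NOTE: an equivalent table-driven alternative to the frequency if/elif chain
	# above (a sketch; same mapping, same FRED frequency_short codes):
	#   FREQUENCY_TO_PERIODICITY = {'M': 12, 'Q': 4, 'A': 1, 'W': 52, 'D': 365}
	#   value = FREQUENCY_TO_PERIODICITY[data['frequency_short']]  # KeyError -> NotImplementedError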

	# Update Data Series Seasonal Adjustment
	dataSeasonalAdjustment = data['seasonal_adjustment_short']
	if dataSeasonalAdjustment == 'SA' or dataSeasonalAdjustment == 'SAAR':
		value = 1
	elif dataSeasonalAdjustment == 'NSA':
		value = 0
	else:
		raise NotImplementedError('Data Seasonal Adjustment Code not recognized')
	columnName = 'bool_is_seasonally_adjusted'
	sendToDB()

	# Update Data Series Last Updated
	value = EM_util.dtConvert_YYYY_MM_DD_TimetoEpoch(data['last_updated'])
	columnName = 'dt_last_updated_SOURCE'
	sendToDB()

	# Update Data Series First Value
	value = EM_util.dtConvert_YYYY_MM_DDtoEpoch(data['observation_start'])
	columnName = 'dt_earliest_value_SOURCE'
	sendToDB()

	# Update Data Series Last Value
	value = EM_util.dtConvert_YYYY_MM_DDtoEpoch(data['observation_end'])
	columnName = 'dt_latest_value_SOURCE'
	sendToDB()
	
	# Fill Generated Flag (Always False)
	value = 0
	columnName = 'bool_generated_datapoint'
	sendToDB()

	# Update Last Update Date
	columnName = 'dt_last_updated_metadata'
	value = EM_util.dtGetNowAsEpoch()
	sendToDB()

	# Update Information From Units	
	# TEST! TEST! TEST! TEST! TEST! TEST! TEST! TEST!
	# TEST! TEST! TEST! TEST! TEST! TEST! TEST! TEST!
	# BELOW HERE WE CHANGE TO NOT ACTUALLY UPDATE DB.
	# TEST! TEST! TEST! TEST! TEST! TEST! TEST! TEST!
	# TEST! TEST! TEST! TEST! TEST! TEST! TEST! TEST!

	# Save Off Data Series Real/Nominal Flag for Later Use
	dataSeriesIsRealValue = None	
	def sendToDB(): print data['title'] + '|' + data['units'] + '|' + columnName + ' : ' + str(value) #TEST: shadows the real sendToDB so the writes below only print
	dataUnits = data['units']
	done = 0
	if dataUnits == 'Percent':
		dataUnitsValue = 'Percent'
		dataTypeValue = 'Percent'
		dataSeriesIsRealValue = False #Consider, true if something like unemployment rate, false if bond yield
		done = 1
	elif dataUnits == 'Number':
		dataUnitsValue = 'Number'
		dataTypeValue = 'Number'
		dataSeriesIsRealValue = False #Consider, true?
		done = 1
	elif dataUnits == 'Index':
		dataUnitsValue = 'Number'
		dataTypeValue = 'Index'
		dataSeriesIsRealValue = False
		done = 1
	matchObj = None if done else re.match(of_matchObj, dataUnits)
	if (matchObj is not None):
		dataUnitsValue = matchObj.group(1)
		dataTypeValue = matchObj.group(2)
		innerMatchObj = re.match(ChainedDollars_matchObj,dataTypeValue)
		if (innerMatchObj is not None):
			dataSeriesIsRealValue = True
			dataTypeValue = 'Dollars'
		else:
			dataSeriesIsRealValue = False
		done = 1
	matchObj = None if done else re.match(per_matchObj, dataUnits)
	if (matchObj is not None):
		dataUnitsValue = 'per ' + matchObj.group(2)
		dataTypeValue = matchObj.group(1)
		done = 1
	matchObj = None if done else re.match(IndexEquals_matchObj, dataUnits)
	if (matchObj is not None):
		dataUnitsValue = 'Number'
		dataTypeValue = 'Index'
		done = 1
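	# The units-parsing patterns above (of_matchObj, per_matchObj,
	# IndexEquals_matchObj, ChainedDollars_matchObj) are defined elsewhere in the
	# module; as an assumption-labeled sketch only, they presumably resemble:
	#   of_matchObj          ~ r'(.+?) of (.+)'    # e.g. 'Billions of Dollars'
	#   per_matchObj         ~ r'(.+?) per (.+)'   # e.g. 'Dollars per Hour'
	#   IndexEquals_matchObj ~ r'Index .*=.*'      # e.g. 'Index 2012=100'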

	if dataSeriesIsRealValue is None:
		pass
		# raise NotImplementedError 
		# dataSeriesIsRealValue = #TODO: regex: Read data series name to determine (if it has the word 'real')
	columnName = 'bool_is_real' 
	value = dataSeriesIsRealValue
	sendToDB()
	columnName = 'bool_is_normalized' #TODO: fill
	value = None
	sendToDB()
	columnName = 'code_private_public' #TODO: fill
	value = None
	sendToDB()	
	columnName = 'code_economic_activity' #TODO: fill
	value = None
	sendToDB()
	columnName = 'code_data_adjustment' #TODO: fill
	value = None
	sendToDB()
	columnName = 'code_sector' #TODO: fill
	value = None
	sendToDB()	
	columnName = 'code_data_units' #TODO: fill
	value = None
	sendToDB()	
	columnName = 'code_item_type' #TODO: fill
	value = None
	sendToDB()
	columnName = 'txt_data_original_source' #TODO: fill
	value = None
	sendToDB()