def _getDeltas(filepath, source):
    localDir = config.load_source_config(source)["local_dir"]
    lastFilepath = database.getLastProcessedFile(source)
    if lastFilepath is not None:
        lastFilepath = "/".join((os.environ["DATA_DIR"], localDir, lastFilepath))
    (lastData, lastArchive, lastStartDate, lastBornMillis) = _parseFile(lastFilepath)
    currentData, currentArchive, currentStartDate, currentBornMillis = _parseFile(filepath)
    assert (lastArchive is None and currentArchive is True) or currentArchive is False
    assert currentStartDate >= lastStartDate

    #get the data that need to be killed. these are data that were in the previous file but not in
    #the current one. The death time can be the timestamp of any item in currentData since, except
    #for the very first archive file, all data should have the same timestamp.
    remove = {}
    for entry, annDate in lastData.iteritems():
        if annDate < currentStartDate:
            continue
        if entry not in currentData:
            remove[entry] = currentBornMillis

    #get the data that need to be inserted. similar to above
    insert = {}
    for entry, annDate in currentData.iteritems():
        if entry not in lastData:
            insert[entry] = currentBornMillis

    return insert, remove
def _getDeltas(filepath, source):
    localDir = config.load_source_config(source)["local_dir"]
    lastFilepath = database.getLastProcessedFile(source)
    if lastFilepath is not None:
        lastFilepath = "/".join((os.environ["DATA_DIR"], localDir, lastFilepath))
    lastData, lastStartDate, lastBornMillis = _parseFile(lastFilepath)
    currentData, currentStartDate, currentBornMillis = _parseFile(filepath)
    assert currentStartDate >= lastStartDate

    #get the data that need to be killed: data that were in the previous file but not in the current one
    remove = set()
    for lastDatum in lastData:
        if lastDatum[2] < currentStartDate:  #lastDatum[2] holds the announcement date used for filtering
            continue
        if lastDatum not in currentData:
            remove.add(lastDatum)

    #get the data that need to be inserted. similar to above
    insert = set()
    for currentDatum in currentData:
        if currentDatum not in lastData:
            insert.add(currentDatum)

    return insert, remove, currentBornMillis
def __getDeltas(filepath, source, treatFutureData=False):
    ######get the previously processed file##########
    localDir = config.load_source_config(source)["local_dir"]
    previousData = set()
    previousFileName = database.getLastProcessedFile(source)
    if previousFileName is not None:
        previousFileName = os.environ["DATA_DIR"] + "/" + localDir + "/" + previousFileName
        previousFileInfo = datafiles.read_info_file(previousFileName)
        previousFileDate = previousFileInfo["date_first_present"]
        firstLoad = False

        zf = zipfile.ZipFile(previousFileName)
        names = zf.namelist()
        assert len(names) == 1
        file = zf.open(names[0])
        #skip header
        file.readline()
        for line in file:
            if treatFutureData:
                effectiveDate = line.strip().split("|")[4]
                effectiveDate = dateutil.parser.parse(effectiveDate + " 00:00:00.000000 UTC")
                if effectiveDate < previousFileDate:
                    previousData.add(line)
            else:
                previousData.add(line)
        file.close()
        zf.close()
    else:
        firstLoad = True

    ##########get deltas from previous file#############
    currentData = set()
    zf = zipfile.ZipFile(filepath)
    names = zf.namelist()
    assert len(names) == 1
    file = zf.open(names[0])
    #skip header
    file.readline()
    for line in file:
        currentData.add(line)
    file.close()
    zf.close()

    newData = currentData - previousData
    removedData = previousData - currentData

    return (newData, removedData, firstLoad)
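
# Illustrative sketch (not part of the source): once both zip files have been read into
# sets of raw lines, the delta computation above reduces to two set differences. A toy,
# self-contained example with hypothetical rows:
def _exampleDeltas():
    previousData = set(["A|1", "B|2", "C|3"])
    currentData = set(["B|2", "C|3", "D|4"])
    newData = currentData - previousData      # set(["D|4"]) -> rows to insert
    removedData = previousData - currentData  # set(["A|1"]) -> rows that disappeared
    return newData, removedData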
def getPositions(self, date):
    PositionSource.getPositions(self, date)
    #Read from the morgan data source
    source = "morgan_positions"
    sourceConfig = config.load_source_config(source)
    dataPath = "{dataPath}/{localDir}/".format(dataPath=os.environ["DATA_DIR"],
                                               localDir=sourceConfig["local_dir"])
    fileRegex = "Tower_Positions\\.{}\\.0000\\.txt".format(date.strftime("%d%m%y"))
    fileRegex += r'\.[a-f0-9]{8}$'
    fileRegex = re.compile(fileRegex)

    #get subdirs that could contain the desired file (after the date)
    def candidateSubDir(dir):
        if not os.path.isdir(dataPath + dir):
            return False
        try:
            dirDate = datetime.datetime.strptime(dir, "%Y%m%d")
        except ValueError:
            return False
        return dirDate > date

    candidateSubDirs = filter(candidateSubDir, os.listdir(dataPath))
    candidateSubDirs.sort()  #Sort for efficiency

    #Find file with positions
    positionFile = None
    for subDir in candidateSubDirs:
        if positionFile is not None:
            break
        path = dataPath + subDir
        for file in os.listdir(path):
            if fileRegex.match(file):
                positionFile = dataPath + subDir + "/" + file
                break

    if positionFile is None:
        raise PositionSourceError("MorganStanleyPosition found no matching files")
    return self._readPositionsFromFile(positionFile)
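
# Illustrative sketch (not part of the source): the compiled pattern above is meant to match
# the Tower_Positions file for the requested date plus an 8-character hex suffix. A
# hypothetical, self-contained sanity check (the date and suffix are made up):
def _examplePositionRegex():
    import re, datetime
    d = datetime.datetime(2011, 5, 24)
    rx = re.compile("Tower_Positions\\.{}\\.0000\\.txt".format(d.strftime("%d%m%y")) + r'\.[a-f0-9]{8}$')
    assert rx.match("Tower_Positions.240511.0000.txt.1a2b3c4d") is not None
    assert rx.match("Tower_Positions.240511.0000.txt") is None  # the hex suffix is required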
# Check for previously running instance
if not database.getProcessedFilesLock():
    util.warning("Not processing, previous instance running")
    sys.exit(1)

#XXX may want to precache seen files for speed in loading
try:
    for source in options.source.split("+"):
        util.info("Processing source %s" % source)

        from data_sources.file_source import FileSource
        util.info("Indexing new files for %s" % source)
        fs = FileSource()
        files = []
        sconfig = config.load_source_config(source)

        # List files
        fs.cwd("%s/%s" % (os.environ['DATA_DIR'], sconfig['local_dir']))
        if options.files is not None:
            files_regex = options.files
        elif sconfig["exec_regex"] is not None:
            files_regex = sconfig["exec_regex"]
        else:
            files_regex = sconfig["local_regex"]
        listing = fs.list_recursive(files_regex + r"\.info", sizes=False)

        # Load set of seen files
        util.info("Fetching processed files for %s" % source)
        seen = database.getProcessedFiles(source)
parser.add_option("-m", "--maxfiles", default=100, dest="maxfiles") parser.add_option("-a", "--maxage", default=5, dest="maxage") parser.add_option("-d", "--debug", default=False, action="store_true", dest="debug") (options, args) = parser.parse_args() if options.debug: util.set_debug() else: util.set_log_file(options.source, True) lock_f = util.lock(options.source) #Create a lock config = config.load_source_config( options.source) #Load config file for source time_file = "%s/%s/%s.time" % (os.environ["DATA_DIR"], config["local_dir"], options.source) util.info("Acquiring data from %s" % options.source) try: # Read last check time try: last_time = cPickle.load(open(time_file, 'rb')) except IOError: last_time = "" t = random.random() * 15 time.sleep(t) util.info("Checking (after waiting %ds)" % t)
def checkAcquireTimestampsAndNewFiles(dataCheckFrequency=1L * 24L * 60L * 60L * 1000L,
                                      defaultNewDataFrequency=1L * 24L * 60L * 60L * 1000L):
    #both default frequencies are one day, expressed in milliseconds
    sourceConfigDir = os.environ["CONFIG_DIR"] + "/sources"
    #sourceConfigDir = "/apps/logs/ase/config/sources"

    errors = []
    warnings = []
    normal = []

    for sourceConfigFile in os.listdir(sourceConfigDir):
        if sourceConfigFile[-3:] != ".py":
            continue
        sourceName = sourceConfigFile[:-3]
        sc = config.load_source_config(sourceName)

        #Check major errors first
        try:
            sourceLocalDir = os.environ["DATA_DIR"] + "/" + sc["local_dir"]
        except KeyError:
            #no local dir, old config format
            continue
        if not os.path.exists(sourceLocalDir):
            errors.append("{}: Never checked".format(sourceName))
            continue

        #get time file
        timeFile = sourceLocalDir + "/" + sourceName + ".time"
    file.close()

    #######MAPPING VERIFICATION CODE########
    if verifyOnly:
        return inconcistentMappings
    ########################################


if __name__ == "__main__":
    #amend barra data and add the INDNAME values
    newdb.init_db()
    database = newdb.get_db()

    #collect all the files processed so far
    processedFiles = database.getProcessedFilesTimeOrdered("barra")
    localDir = config.load_source_config("barra")["local_dir"]

    try:
        i = 0
        for file in processedFiles:
            database.start_transaction()
            print datetime.datetime.now(), file
            process("/".join((os.environ["DATA_DIR"], localDir, file)), "barra")
            database.commit()
    except Exception, e:
        print e
        database.rollback()
    else:
        database.commit()
    toDate = fromDate + dayDelta
elif args.fromDate is not None and args.toDate is not None:
    fromDate = datetime.datetime.strptime(args.fromDate, "%Y%m%d")
    toDate = datetime.datetime.strptime(args.toDate, "%Y%m%d")
elif args.all is True:
    fromDate = datetime.datetime.strptime("20090910", "%Y%m%d")
    toDate = datetime.datetime.utcnow()
    toDate = datetime.datetime.strptime(toDate.strftime("%Y%m%d"), "%Y%m%d")  #Get only the date part
    toDate += dayDelta
else:
    parser.print_help(util.LOGFILE)
    exit(1)

#load a relevant config file to get the password
sconfig = config.load_source_config("htb")
source = SFTPSource(sconfig["host"], sconfig["user"], sconfig["pass"])

date = fromDate
while date < toDate:
    try:
        if args.fills:
            remoteDir = "/spare/local/ase/trade/run/live-prod/" + date.strftime("%Y/%m/%d")
            localDir = args.location + "/" + date.strftime("%Y%m%d")
            source.cwd(remoteDir)
            files = source.list(r".*fills.txt")
            for file in files:
                source.copy(remoteDir + "/" + file[0], localDir + "/" + "old." + file[0])
        if args.calcres: