def setupManiFile(daycode): batchdir = getBatchDir(daycode) if not os.path.exists(batchdir): os.system("mkdir %s" % batchdir) manimap = {} for onex in SynchUtil.getExchanges(): for logtype in SynchUtil.getBigLogList(): sizemap = LogFileDataSize.getHdfsPathList(onex, logtype, daycode) for onefile in sizemap: manimap[onefile] = sizemap[onefile] sizegb = sum(sizemap.values()) / 1000000000 print "Found %d files, %d gb for %s %s %s" % (len(sizemap), sizegb, onex, logtype, daycode) fhandle = open(getManiFilePath(daycode), 'w') for onefile in manimap: fhandle.write(onefile + "\t" + str(manimap[onefile]) + "\n") fhandle.close() print "Finished building manifest, %d total files and %d gb size" % (len(manimap), sum(manimap.values())/1000000000)
def setupManiFile(daycode):
    # Build a manifest of every HDFS log file for this daycode, mapping
    # file path -> size in bytes, then write it out as tab-separated
    # lines to getManiFilePath(daycode).
    batchdir = getBatchDir(daycode)
    if not os.path.exists(batchdir):
        # NOTE(review): shells out to mkdir; os.makedirs would be safer.
        os.system("mkdir %s" % batchdir)
    manimap = {}
    for onex in SynchUtil.getExchanges():
        for logtype in SynchUtil.getBigLogList():
            sizemap = LogFileDataSize.getHdfsPathList(onex, logtype, daycode)
            # Merge this pair's files into the global manifest map.
            for onefile in sizemap:
                manimap[onefile] = sizemap[onefile]
            # Python 2 integer division: whole gigabytes only.
            sizegb = sum(sizemap.values()) / 1000000000
            print "Found %d files, %d gb for %s %s %s" % (
                len(sizemap), sizegb, onex, logtype, daycode)
    fhandle = open(getManiFilePath(daycode), 'w')
    for onefile in manimap:
        fhandle.write(onefile + "\t" + str(manimap[onefile]) + "\n")
    fhandle.close()
    print "Finished building manifest, %d total files and %d gb size" % (
        len(manimap), sum(manimap.values()) / 1000000000)
def checkExchangeDirs(logmail, logtype, daycode):
    """Verify each exchange's HDFS log directory exists and is reasonably full.

    Adds a warning line to logmail for every missing directory and for every
    directory holding fewer than FILE_COUNT_WARN_CUTOFF entries.  Returns the
    number of warnings raised.
    """
    listing = SynchUtil.sysCallResult("hadoop fs -ls /data/%s/%s/" % (logtype, daycode))
    warncount = 0
    for excname in SynchUtil.getExchanges():
        # Some exchanges legitimately have no data for some log types.
        if excname in noDataOkaySet():
            continue
        dirpath = "/data/%s/%s/%s" % (logtype, daycode, excname)
        found = False
        for hline in listing:
            if dirpath in hline:
                found = True
                break
        if not found:
            logmail.addLogLine("Warning: directory not found: %s" % (dirpath))
            warncount += 1
            continue  # no point in continuing
        subfiles = SynchUtil.sysCallResult("hadoop fs -ls %s" % (dirpath))
        if len(subfiles) < FILE_COUNT_WARN_CUTOFF:
            logmail.addLogLine("Warning: found only %d files for exchange %s" % (len(subfiles), excname))
            warncount += 1
    return warncount
def addOneMani(exchange, logtype, daycode, usepref): SynchUtil.writeManiFile(exchange, logtype, daycode, usepref) manipath = SynchUtil.getManiPath(exchange, logtype, daycode) for line in open(manipath): print line,
def runIntUpdate(exchange, logtype, daycode):
    """Run the InterestUpdate hadoop job for one exchange/logtype/day.

    Writes a sized manifest file, launches the job via callHadoop, then
    removes the manifest.
    """
    SynchUtil.writeManiFile(exchange, logtype, daycode, writesize=True)
    callHadoop(exchange, logtype, daycode)
    # delete manifest file -- os.remove avoids spawning a shell for 'rm';
    # the manifest was just written above, so it is expected to exist.
    manipath = SynchUtil.getManiPath(exchange, logtype, daycode)
    os.remove(manipath)
def getS3sizeMap(adex, logtype, daycode):
    """Return {basename: size-in-bytes} for this triple's S3 log objects."""
    bucket = SynchUtil.s3BucketGrab()
    keys = SynchUtil.s3KeyList(bucket, adex, logtype, daycode)
    # Key names are full S3 paths; index by the final path component.
    return dict((key.name.split('/')[-1], key.size) for key in keys)
def logFailureList():
    """Run the Java LZO-index finder, logging un-indexed files for today."""
    failpath = "/var/log/cronlogs/hdfs/lzoindexer/nolzolist_%s.txt" % (SynchUtil.get_today())
    hadJavaCall.runHadoopCall(FINDER_CLASS, [LZO_PATTERN, failpath])
def doSimpleSynch(adex, logtype, daycode): print "Running SimpleSynch for adex=%s, logtype=%s, daycode=%s" % (adex, logtype, daycode) # Generate a manifest file, using appropriate prefix manipath = SynchUtil.writeManiFile(adex, logtype, daycode) hdfsMani = "/tmp/mani/%s" % (manipath) # Put the manifest file on HDFS putCall = "hadoop fs -put %s %s" % (manipath, hdfsMani) print "PutCall: %s" % (hdfsMani) os.system(putCall) # run distcp hdfsDir = "/data/%s/%s/%s/" % (logtype, daycode, adex) distCpCall = "hadoop distcp -f %s %s" % (hdfsMani, hdfsDir) print "DistCpCall: %s" % (distCpCall) os.system(distCpCall) # delete mani file hdfsRmCall = "hadoop fs -rm %s" % (hdfsMani) print hdfsRmCall os.system(hdfsRmCall) # Delete local file locRmCall = "rm %s" % (manipath) print "Local rm call is %s" % (manipath) os.system(locRmCall)
def doSimpleSynch(adex, logtype, daycode):
    # Copy one exchange/logtype/day of logs into HDFS: write a manifest,
    # push it to HDFS, distcp from it, then delete both manifest copies.
    print "Running SimpleSynch for adex=%s, logtype=%s, daycode=%s" % (
        adex, logtype, daycode)
    # Generate a manifest file, using appropriate prefix
    manipath = SynchUtil.writeManiFile(adex, logtype, daycode)
    hdfsMani = "/tmp/mani/%s" % (manipath)
    # Put the manifest file on HDFS
    putCall = "hadoop fs -put %s %s" % (manipath, hdfsMani)
    # NOTE(review): logs hdfsMani rather than putCall -- probably meant
    # to echo the command being run.
    print "PutCall: %s" % (hdfsMani)
    os.system(putCall)
    # run distcp
    hdfsDir = "/data/%s/%s/%s/" % (logtype, daycode, adex)
    distCpCall = "hadoop distcp -f %s %s" % (hdfsMani, hdfsDir)
    print "DistCpCall: %s" % (distCpCall)
    os.system(distCpCall)
    # delete mani file
    hdfsRmCall = "hadoop fs -rm %s" % (hdfsMani)
    print hdfsRmCall
    os.system(hdfsRmCall)
    # Delete local file
    locRmCall = "rm %s" % (manipath)
    # NOTE(review): logs manipath rather than locRmCall.
    print "Local rm call is %s" % (manipath)
    os.system(locRmCall)
def callHadoop(exchange, logtype, daycode):
    """Launch the Java IntUpdate job against this triple's manifest file."""
    manifilepath = SynchUtil.getManiPath(exchange, logtype, daycode)
    cmd = "hadoop jar %s %s %s" % (SynchUtil.JAR_PATH, INT_UPDATE_CLASS, manifilepath)
    os.system(cmd)
def getPixelLogPath(pixid, opcode):
    """Return the local log path for one pixel/opcode under today's
    userindex directory, creating the day directory on first use.
    """
    logdir = "/mnt/data/userindex/%s/" % (SynchUtil.get_today())
    if not os.path.exists(logdir):
        # os.makedirs replaces the shelled-out 'mkdir' and creates any
        # missing parent directories as well.
        os.makedirs(logdir)
    # logdir already ends with '/', so plain concatenation is correct here.
    return "%slog_%s_%s.txt" % (logdir, opcode, pixid)
def cleanHdfsDirs(): cleancall = "hadoop fs -rmr %s/*" % (HDFS_ADA_DIR) print "Cleaning adaclass dir with command %s" % (cleancall) if SynchUtil.promptOkay("Okay to delete? "): os.system(cleancall) else: print "Okay, quitting" sys.exit(1)
def getNfsSizeMap(adex, logtype, daycode):
    """Return {filename: size-in-bytes} for the .gz logs on the NFS mount."""
    nfsdirpath = SynchUtil.getNfsDirPath(adex, logtype, daycode)
    sizeMap = {}
    for filename in os.listdir(nfsdirpath):
        # Only gzipped log files count ('.gz' anywhere in the name).
        if '.gz' not in filename:
            continue
        sizeMap[filename] = os.path.getsize(nfsdirpath + "/" + filename)
    return sizeMap
def createHdfsDirs(pixset): extantdirs = SynchUtil.sysCallResult("hadoop fs -ls %s" % HDFS_ADA_DIR) for onepix in pixset: pixpref = pixprefFromDir(onepix) if any([pixpref in exline for exline in extantdirs]): print "Found directory %s" % (pixpref) else: hadmkdir = "hadoop fs -mkdir %s/%s" % (HDFS_ADA_DIR, pixpref) print "Mkdir call is %s" % (hadmkdir) os.system(hadmkdir)
def getHDsizeMap(adex, logtype, daycode):
    """Return {filename: size-in-bytes} for the .gz logs already on HDFS.

    Parses 'hadoop fs -ls' output; by position, token 4 is the size and
    token 7 the full path -- TODO confirm this matches the cluster's
    hadoop version.
    """
    hadooplines = SynchUtil.sysCallResult(
        "hadoop fs -ls /data/%s/%s/%s/" % (logtype, daycode, adex))
    sizeMap = {}
    for line in hadooplines:
        # Only gzipped log files count ('.gz' anywhere in the line).
        if '.gz' not in line:
            continue
        toks = line.split()
        sizeMap[toks[7].split('/')[-1]] = int(toks[4])
    return sizeMap
for missfile in misslist: print "File is MISSING: %s" % (missfile) for sizefile in sizelist: print "File %s \n\t has incorrect size of %d, should be %d" % ( sizefile, bMap[sizefile], aMap[sizefile]) if __name__ == "__main__": if not len(sys.argv) == 4: print "Usage: SimpleSynch <all|adex> <big|mini|comp|logtype> <yest|daycode>" sys.exit(1) exclist = SynchUtil.getCheckExcList(sys.argv[1]) loglist = SynchUtil.getCheckLogList(sys.argv[2]) daylist = SynchUtil.getCheckDayList(sys.argv[3]) # need to make sure we're running in a folder where the .mani files # can be written os.chdir('/mnt/src/cronjobs/') for onex in exclist: # DBH has no Big Data logs if onex == 'dbh': continue for logtype in loglist: for daycode in daylist:
def runTrackUpdate(daycode):
    # Launch the Java UpdateTrackFile job for one daycode.
    hadoopsys = "hadoop jar %s %s %s" % (SynchUtil.JAR_PATH, UPDATE_TRACK_CLASS, daycode)
    print "Hadoop call is %s" % (hadoopsys)
    os.system(hadoopsys)


if __name__ == "__main__":
    """ This is a one-time operation to copy the impression logs and update the tracking file """
    exclist = SynchUtil.getCheckExcList('all')
    daylist = SynchUtil.getCheckDayList(sys.argv[1])
    logtype = 'imp'
    for daycode in daylist:
        # Sync every exchange's impression logs for this day first ...
        for onex in exclist:
            print "Uploading logs for %s %s %s" % (onex, logtype, daycode)
            ConcatLzoSynch.runLogSync(onex, logtype, daycode)
        ConcatLzoSynch.runIndexer()
        # Now we have indexed LZO files, so we can run Java UpdateTrackFile
        runTrackUpdate(daycode)
# Got rid of all the Mani-file nonsense #SynchUtil.writeManiFile(exchange, logtype, daycode, writesize=True) callHadoop(exchange, logtype, daycode) if __name__ == "__main__": if not len(sys.argv) == 4: print "Usage: ConcatLzoSynch <all|adex> <big|mini|comp|logtype> <yest|daycode>" sys.exit(1) # need to make sure we're running in a folder where we have write permissions, # otherwise we won't be able to write the manifest file os.chdir('/local/src/cronjobs/') exclist = SynchUtil.getCheckExcList(sys.argv[1]) loglist = SynchUtil.getCheckLogList(sys.argv[2]) daylist = SynchUtil.getCheckDayList(sys.argv[3]) idxlist = [] for onex in exclist: for logtype in loglist: for daycode in daylist: # This is now done by Java code ##if not SynchUtil.nfsFilesExist(onex, logtype, daycode): # print "No NFS files for %s %s %s" % (onex, logtype, daycode) # continue print "Running ConcatLogSynch for %s %s %s" % (onex, logtype, daycode)
os.system(hadcall) print "... done" if __name__ == "__main__": daylist = [] if len(sys.argv) < 2: print "Usage UploadExelate.py <daycode|daylist.txt>" sys.exit(1) singarg = sys.argv[1] if singarg.endswith(".txt"): gimp = [daylist.append(oneday.strip()) for oneday in open(singarg)] elif singarg == "yest": daylist.append(SynchUtil.get_yesterday()) else: daylist.append(singarg) for daycode in daylist: grabExelateFile(daycode) upload2hdfs(daycode)
# TODO: change this to use LocalConf hadcall = "hadoop jar /local/bin/jars/adnetik.jar %s %s %s" % (dmclass, daycode, tempfile) print "\nHadoop call is : \n\t%s" % (hadcall) os.system(hadcall) #print "\nFinished with %s" % (dmclass) if __name__ == "__main__": if not len(sys.argv) == 2: print "Usage: BmUpdate <yest|daycode>" sys.exit(1) daylist = SynchUtil.getCheckDayList(sys.argv[1]) daycode = daylist[0] # This is kind of the hacky way to do things sys.path.append("/local/src/python/util") import SimpleMail logmail = SimpleMail.SimpleMail("DbStageSlice") javalist = [] # TODO: going to roll all of this into a single Java file StagingInfoManager, obviate this Python script #javalist.append("com.adnetik.data_management.Special2Staging") javalist.append("com.adnetik.userindex.StagingInfoManager") #javalist.append("com.adnetik.data_management.Click2Staging") #javalist.append("com.adnetik.data_management.Negative2Staging")
# delete manifest file manipath = SynchUtil.getManiPath(exchange, logtype, daycode) locRmCall = "rm %s" % (manipath) #print "Local rm call is %s" % (locRmCall) os.system(locRmCall) if __name__ == "__main__": if not len(sys.argv) == 4: print "Usage: InterestUpdateWrapper <all|adex> <big|mini|comp|logtype> <yest|daycode|filename>" sys.exit(1) # need to make sure we're running in a folder where we have write permissions, # otherwise we won't be able to write the manifest file os.chdir('/mnt/src/cronjobs/') exclist = SynchUtil.getCheckExcList(sys.argv[1]) loglist = SynchUtil.getCheckLogList(sys.argv[2]) daylist = SynchUtil.getCheckDayList(sys.argv[3]) for onex in exclist: for logtype in loglist: for daycode in daylist: runIntUpdate(onex, logtype, daycode)
print "... done" if __name__ == "__main__": daylist = [] if len(sys.argv) < 2: print "Usage UploadExelate.py <daycode|daylist.txt>" sys.exit(1) singarg = sys.argv[1] if singarg.endswith(".txt"): gimp = [daylist.append(oneday.strip()) for oneday in open(singarg)] elif singarg == "yest": daylist.append(SynchUtil.get_yesterday()) else: daylist.append(singarg) for daycode in daylist: grabExelateFile(daycode) upload2hdfs(daycode) #pc_set = set() #for x in targlist: # pc_set.add(x) # pc_set.add(pix_comp_map[x]) # #for onepix in pc_set:
# Got rid of all the Mani-file nonsense #SynchUtil.writeManiFile(exchange, logtype, daycode, writesize=True) callHadoop(exchange, logtype, daycode) if __name__ == "__main__": if not len(sys.argv) == 4: print "Usage: ConcatLzoSynch <all|adex> <big|mini|comp|logtype> <yest|daycode>" sys.exit(1) # need to make sure we're running in a folder where we have write permissions, # otherwise we won't be able to write the manifest file os.chdir('/local/src/cronjobs/') exclist = SynchUtil.getCheckExcList(sys.argv[1]) loglist = SynchUtil.getCheckLogList(sys.argv[2]) daylist = SynchUtil.getCheckDayList(sys.argv[3]) idxlist = [] for onex in exclist: for logtype in loglist: for daycode in daylist: # This is now done by Java code ##if not SynchUtil.nfsFilesExist(onex, logtype, daycode): # print "No NFS files for %s %s %s" % (onex, logtype, daycode) # continue print "Running ConcatLogSynch for %s %s %s" % (onex, logtype,
# need to make sure we're running in a folder where we have write permissions, # otherwise we won't be able to write the manifest file os.chdir('/mnt/src/cronjobs/') exclist = [] daylist = [] loglist = [] for line in sys.stdin: if len(line.strip().split('\t')) < 3: continue (excCode, logType, dayCode) = line.strip().split('\t') #print "Syncing %s %s %s" % (excCode, logType, dayCode) exclist.append(SynchUtil.getCheckExcList(excCode)[0]) loglist.append(SynchUtil.getCheckLogList(logType)[0]) daylist.append(SynchUtil.getCheckDayList(dayCode)[0]) idxlist = [] for i in range(len(exclist)): print "Syncing %s %s %s" % (exclist[i], loglist[i], daylist[i]) ConcatLzoSynch.runLogSync(exclist[i], loglist[i], daylist[i]) idxlist.add(SynchUtil.getHdfsPath(exclist[i], loglist[i], daylist[i])) for toidx in idxlist: ConcatLzoSynch.runIndexer(toidx)
os.system(hdfsRmCall) # Delete local file locRmCall = "rm %s" % (manipath) print "Local rm call is %s" % (manipath) os.system(locRmCall) if __name__ == "__main__": if not len(sys.argv) == 4: print "Usage: SimpleSynch <all|adex> <big|mini|comp|logtype> <yest|daycode>" sys.exit(1) exclist = SynchUtil.getCheckExcList(sys.argv[1]) loglist = SynchUtil.getCheckLogList(sys.argv[2]) daylist = SynchUtil.getCheckDayList(sys.argv[3]) # need to make sure we're running in a folder where the .mani files # can be written os.chdir('/var/log/cronlogs/hdfs/manifiles/') for onex in exclist: for logtype in loglist: for daycode in daylist: if not SynchUtil.nfsFilesExist(onex, logtype, daycode): print "No NFS files for %s %s %s" % (onex, logtype, daycode) continue
subhadls = "hadoop fs -ls %s" % (dirpath) subfiles = SynchUtil.sysCallResult(subhadls) if len(subfiles) < FILE_COUNT_WARN_CUTOFF: logmail.addLogLine("Warning: found only %d files for exchange %s" % (len(subfiles), excname)) warncount += 1 return warncount if __name__ == "__main__": """ Check to make sure that the daily Synch Jobs ran correctly. If they didn't, send an AdminMail """ daycode = SynchUtil.get_yesterday() if "yest" in sys.argv[1] else sys.argv[1] logmail = SimpleMail.SimpleMail("File Synch Check for %s" % (daycode)) warncount = 0 warncount += checkExchangeDirs(logmail, "no_bid", daycode) warncount += checkExchangeDirs(logmail, "bid_all", daycode) if warncount > 0: logmail.send2admin()
print hdfsRmCall os.system(hdfsRmCall) # Delete local file locRmCall = "rm %s" % (manipath) print "Local rm call is %s" % (manipath) os.system(locRmCall) if __name__ == "__main__": if not len(sys.argv) == 4: print "Usage: SimpleSynch <all|adex> <big|mini|comp|logtype> <yest|daycode>" sys.exit(1) exclist = SynchUtil.getCheckExcList(sys.argv[1]) loglist = SynchUtil.getCheckLogList(sys.argv[2]) daylist = SynchUtil.getCheckDayList(sys.argv[3]) # need to make sure we're running in a folder where the .mani files # can be written os.chdir('/var/log/cronlogs/hdfs/manifiles/') for onex in exclist: for logtype in loglist: for daycode in daylist: if not SynchUtil.nfsFilesExist(onex, logtype, daycode): print "No NFS files for %s %s %s" % (onex, logtype, daycode) continue
continue # no point in continuing subhadls = "hadoop fs -ls %s" % (dirpath) subfiles = SynchUtil.sysCallResult(subhadls) if len(subfiles) < FILE_COUNT_WARN_CUTOFF: logmail.addLogLine("Warning: found only %d files for exchange %s" % (len(subfiles), excname)) warncount += 1 return warncount if __name__ == "__main__": """ Check to make sure that the daily Synch Jobs ran correctly. If they didn't, send an AdminMail """ daycode = SynchUtil.get_yesterday( ) if "yest" in sys.argv[1] else sys.argv[1] logmail = SimpleMail.SimpleMail("File Synch Check for %s" % (daycode)) warncount = 0 warncount += checkExchangeDirs(logmail, "no_bid", daycode) warncount += checkExchangeDirs(logmail, "bid_all", daycode) if warncount > 0: logmail.send2admin()
#!/usr/bin/python ################################################### # Just a wrapper for the two SliceInterest jobs # Want these to run back to back, hard to do without a wrapper ################################################### import os, sys import SynchUtil def runCall(dmclass, daycode): hadcall = "hadoop jar /mnt/jars/adnetik.jar com.adnetik.data_management.%s %s" % (dmclass, daycode) print "\nHadoop call is : \n\t%s" % (hadcall) os.system(hadcall) print "\nFinished with %s" % (dmclass) if __name__ == "__main__": # this is ALWAYS going to run using "yesterday" daycode = SynchUtil.get_yesterday() runCall("SliceInterestActivity", daycode) runCall("SliceInterestSecond", daycode)
os.chdir('/mnt/src/cronjobs/') exclist = [] daylist = [] loglist = [] for line in sys.stdin: if len(line.strip().split('\t')) < 3: continue (excCode, logType, dayCode) = line.strip().split('\t') #print "Syncing %s %s %s" % (excCode, logType, dayCode) exclist.append(SynchUtil.getCheckExcList(excCode)[0]) loglist.append(SynchUtil.getCheckLogList(logType)[0]) daylist.append(SynchUtil.getCheckDayList(dayCode)[0]) idxlist = [] for i in range(len(exclist)): print "Syncing %s %s %s" % (exclist[i], loglist[i], daylist[i]) ConcatLzoSynch.runLogSync(exclist[i], loglist[i], daylist[i]) idxlist.add(SynchUtil.getHdfsPath(exclist[i], loglist[i], daylist[i])) for toidx in idxlist: ConcatLzoSynch.runIndexer(toidx)
hadcall = "hadoop fs -cat %s | gunzip | %s > %s" % (filename, javacall, batchfile) #print hadcall os.system(hadcall) if __name__ == "__main__": if not len(sys.argv) == 3: print "Usage BatchSort.py daycode a/b/c" sys.exit(1) daycode = sys.argv[1] if "yest" == daycode: daycode = SynchUtil.get_yesterday() (proc_id, num_proc, num_batch) = [int(x) for x in sys.argv[2].split("/")] if int(proc_id) == 0: setupManiFile(daycode) # TODO: don't use temp file, instead hadoop cat into another program # that does the scrubbing, and direct output to the batch file. print "proc=%d, num=%d, nbatch=%d" % (proc_id, num_proc, num_batch) manimap = getManiData(daycode) # TODO: prefilter files so that we can print "finished with file 52/652..." targlist = [ onefile for onefile in manimap if ((hash(onefile) % num_batch) % num_proc) == proc_id
for subpixdir in listset: pixpref = pixprefFromDir(subpixdir) hdfsdir = "/userindex/adaclass/%s/" % (pixpref) #rmcall = "hadoop fs -rmr %s hadcall = "hadoop fs -put %s%s/*.ser %s" % (locdir, subpixdir, hdfsdir) print "Upload call is %s" % (hadcall) os.system(hadcall) if __name__ == "__main__": daycode = SynchUtil.get_today() if len(sys.argv) < 2 else sys.argv[1] print "\nCalling for daycode %s" % (daycode) locadadir = "/mnt/data/userindex/%s/" % (daycode) listset = set() for onedir in os.listdir(locadadir): if "adaclass_" in onedir: serfiles = [onefile for onefile in os.listdir(locadadir + "/" + onedir) if ".ser" in onefile] if len(serfiles) > 0: print "Found directory %s with %d serfiles" % (onedir, len(serfiles))
hdfsdir = sys.argv[1] ftypemap = {} for onefile in os.listdir("."): ftoks = onefile.split(".") if not len(ftoks) == 2: continue basename = ftoks[0] ftype = ftoks[1] ftypemap.setdefault(ftype, 0) ftypemap[ftype] += 1 for ftype in ftypemap: print "Found %d files of type %s" % (ftypemap[ftype], ftype) if not SynchUtil.promptOkay("Going to upload files to %s" % (hdfsdir)): sys.exit(1) for onefile in os.listdir("."): upcall = "hadoop fs -put %s %s" % (onefile, hdfsdir) print "Upload call is %s" % (upcall) os.system(upcall)
def runTrackUpdate(daycode):
    # Launch the Java UpdateTrackFile job for one daycode.
    hadoopsys = "hadoop jar %s %s %s" % (SynchUtil.JAR_PATH, UPDATE_TRACK_CLASS, daycode)
    print "Hadoop call is %s" % (hadoopsys)
    os.system(hadoopsys)


if __name__ == "__main__":
    """ This is a one-time operation to copy the impression logs and update the tracking file """
    exclist = SynchUtil.getCheckExcList('all')
    daylist = SynchUtil.getCheckDayList(sys.argv[1])
    logtype = 'imp'
    for daycode in daylist:
        # Sync every exchange's impression logs for this day first ...
        for onex in exclist:
            print "Uploading logs for %s %s %s" % (onex, logtype, daycode)
            ConcatLzoSynch.runLogSync(onex, logtype, daycode)
        ConcatLzoSynch.runIndexer()
        # Now we have indexed LZO files, so we can run Java UpdateTrackFile
        runTrackUpdate(daycode)