Example #1
import CONSTANTS
import shutil
import os
import bigdataUtilities as util

if __name__ == "__main__":

    util.obnoxiousPrint(
        "Ensure the file 'final_output.csv' is in the user's HDFS directory.")
    util.obnoxiousPrint("See the README.md file for details.")

    util.obnoxiousPrint("Checking project dependencies.")
    util.subprocessCall(["lein", "deps"])
    util.obnoxiousPrint("Creating uberjar.")
    util.subprocessCall(["lein", "uberjar"])
    util.obnoxiousPrint("Running Cascalog correlation.")
    util.subprocessCall([
        "hadoop", "jar", "target/cascalog-dwell-0.1.0-SNAPSHOT-standalone.jar",
        "clojure.main"
    ])

    util.obnoxiousPrint("Copying results from HDFS to local output directory.")
    try:
        shutil.rmtree("output")
    except OSError:
        # Ignore the error if the local output directory does not exist yet.
        pass
    os.mkdir("output")

    paths = util.lsrHdfsDir(CONSTANTS.CASCALOG_HDFS_PATH)
    fobj = open("output/sorted_out", "w")
    for path in paths:
        if "ais-dwell-out/part-" in path:
            util.subprocessCall(["hadoop", "fs", "-cat", path], stdout=fobj)
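
These drivers lean on a small bigdataUtilities module (plus a CONSTANTS module that holds paths such as CASCALOG_HDFS_PATH) that is not shown on this page. Below is a minimal sketch of what obnoxiousPrint, subprocessCall, and lsrHdfsDir might look like; only the names and call signatures come from the examples, the bodies are assumptions (for instance, that lsrHdfsDir parses hadoop fs -lsr output).

# Hypothetical reconstruction of the bigdataUtilities helpers used above.
# Names and signatures are taken from the examples; the implementations are guesses.
import subprocess

def obnoxiousPrint(msg):
    # Print a message inside a loud banner so it stands out in long Hadoop logs.
    banner = "*" * 70
    print(banner)
    print(msg)
    print(banner)

def subprocessCall(args, checkReturnCode=True, stdout=None):
    # Run a command; optionally fail hard on a non-zero exit code.
    ret = subprocess.call(args, stdout=stdout)
    if checkReturnCode and ret != 0:
        raise RuntimeError("Command failed: " + " ".join(args))
    return ret

def lsrHdfsDir(hdfsPath):
    # Recursively list an HDFS directory and return the path column
    # (last whitespace-separated field) of each listed entry.
    out = subprocess.check_output(["hadoop", "fs", "-lsr", hdfsPath]).decode()
    return [line.split()[-1] for line in out.splitlines()
            if line.strip() and not line.startswith("Found")]
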
Example #2
import CONSTANTS
import shutil
import os
import bigdataUtilities as util

if __name__ == "__main__":
    util.obnoxiousPrint("Cleaning up HDFS data.")
    util.subprocessCall(["hadoop", "fs", "-rm", CONSTANTS.CASCALOG_HDFS_PATH + '/\*'], False)

    fobj = open("__temp_file", 'w')
    util.hiveScript("select * from vast_time_series_data", stdout=fobj)
    fobj.close()
    
    util.obnoxiousPrint("Moving data to Cascalog readable location.")
    util.subprocessCall(["hadoop", "fs", "-copyFromLocal", "__temp_file",
                         "vast_time_series_data.tsv"])
   
    util.obnoxiousPrint("Checking project dependencies.") 
    util.subprocessCall(["lein", "deps"])
    util.obnoxiousPrint("Creating uberjar.") 
    util.subprocessCall(["lein", "uberjar"])
    util.obnoxiousPrint("Running Cascalog correlation.") 
    util.subprocessCall(["hadoop", "jar", "target/cascalog-corr-0.1.0-SNAPSHOT-standalone.jar",
                         "clojure.main"])
    
    util.obnoxiousPrint("Copying results from HDFS to local output directory.")
    try:
        shutil.rmtree("output")
    except OSError:
        # Ignore the error if the local output directory does not exist yet.
        pass
    os.mkdir("output")
Example #3
File: Driver.py  Project: bm3719/practice
import CONSTANTS
import shutil
import os
import bigdataUtilities as util

if __name__ == "__main__":

    util.obnoxiousPrint("Ensure the file 'final_output.csv' is in the user's HDFS directory.")
    util.obnoxiousPrint("See the README.md file for details.")
   
    util.obnoxiousPrint("Checking project dependencies.") 
    util.subprocessCall(["lein", "deps"])
    util.obnoxiousPrint("Creating uberjar.") 
    util.subprocessCall(["lein", "uberjar"])
    util.obnoxiousPrint("Running Cascalog correlation.") 
    util.subprocessCall(["hadoop", "jar", "target/cascalog-dwell-0.1.0-SNAPSHOT-standalone.jar",
                         "clojure.main"])
    
    util.obnoxiousPrint("Copying results from HDFS to local output directory.")
    try:
        shutil.rmtree("output")
    except OSError:
        # Ignore the error if the local output directory does not exist yet.
        pass
    os.mkdir("output")

    paths = util.lsrHdfsDir(CONSTANTS.CASCALOG_HDFS_PATH)
    fobj = open("output/sorted_out","w")
    for path in paths:
        if "ais-dwell-out/part-" in path:
            util.subprocessCall(["hadoop", "fs", "-cat", path], stdout=fobj)
    
Example #4
if __name__ == "__main__":

    # Base directory in HDFS; make sure you can write to and delete from this directory.
    base_dir = "/analytics/leaf-compression"
    version = "-0.1-SNAPSHOT"

    # Comma-separated list of ZooKeeper nodes in ip:port format; 2181 is the default port.
    zkList = "localhost:2181"

    slaves = 1 
    if len(sys.argv) > 1:
        slaves = int(sys.argv[1])

    util.obnoxiousPrint("Uploading data to hdfs")
    util.subprocessCall(["hadoop","fs","-rmr", base_dir + "/input"],False)
    util.subprocessCall(["hadoop","fs","-rmr", base_dir + "/output/leaf"],False)
    util.subprocessCall(["hadoop","fs","-mkdir","-p", base_dir + "/input"],False)
    util.subprocessCall(["hadoop","fs","-copyFromLocal","example_data.csv",base_dir + "/input/small_graph"])

    util.obnoxiousPrint("Running leafcompress job")
    util.subprocessCall(["hadoop","jar","target/leaf-compression"+version+"-jar-with-dependencies.jar",
                    "org.apache.giraph.GiraphRunner",
                    "-Dgiraph.zkList=" + zkList,
                    "com.soteradefense.bsp.KeyDataVertex",
                    "-w", str(slaves),
                    "-vif", "com.soteradefense.bsp.KeyDataVertexInputFormat",
                    "-of", "com.soteradefense.bsp.KeyDataVertexOutputFormat",
                    "-vip", base_dir + "/input/small_graph",
                    "-op", base_dir + "/output/leaf"])