def getMergeAndSort(savePath, localPath, label=None):
    tmp_path = "/tmp/tmp-spark"            # scratch directory for the HDFS download
    merged_path = "/tmp/tmp-spark.merged"  # merged (unsorted) local file
    try:
        os.makedirs(tmp_path)
    except:
        pass
    hdfs.get(savePath, tmp_path)

    # Concatenate every part file under the scratch directory into one file.
    cmdMerge = """find %s -type f -print0 | xargs -0 cat >> %s""" % (tmp_path, merged_path)
    print cmdMerge
    os.system(cmdMerge)

    # Sort by the first column and write the result as <localPath basename>.tsv.
    out_path = os.path.join(localPath, localPath.split("/")[-1] + ".tsv")
    cmdSort = "sort -k1,1 %s > %s" % (merged_path, out_path)
    os.system(cmdSort)

    # Optionally prepend a header line.
    if label is not None:
        cmd = "sed -i '1s/^/%s\\n/' %s" % (label, out_path)
        print cmd
        os.system(cmd)

    os.system("rm -f %s" % merged_path)
    try:
        shutil.rmtree(tmp_path)
    except:
        pass
def copy_to_local(hdfs_path, local_path, overwrite=False, project=None):
    """
    Copies a path from an HDFS project to the local filesystem.

    Args:
        :hdfs_path: a full HDFS pathname or a relative one (relative to your
            project's path in HDFS).
        :local_path: the path on the local filesystem to copy to.
        :overwrite: a boolean flag, whether to overwrite if the path already
            exists locally.
        :project: name of the project, defaults to the current HDFS user's project.
    """
    if project is None:
        project = project_name()
    if "PDIR" in os.environ:
        full_local = os.environ['PDIR'] + '/' + local_path
    else:
        full_local = os.getcwd() + '/' + local_path

    project_hdfs_path = _expand_path(hdfs_path, project=project)
    sub_path = hdfs_path.find("hdfs:///Projects/" + project)
    rel_path = hdfs_path[sub_path + 1:]

    if overwrite:
        split = rel_path.split('/')
        filename = split[len(split) - 1]
        full_local_path = full_local + '/' + filename
        if os.path.isdir(full_local_path):
            shutil.rmtree(full_local_path)
        elif os.path.isfile(full_local_path):
            os.remove(full_local_path)

    hdfs.get(project_hdfs_path, full_local)
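# Hedged usage sketch for the copy_to_local helper above (not part of the
# original source). The dataset name "Resources/my_data" and the target
# directory "downloads" are hypothetical; the call assumes it runs inside a
# project environment where project_name() and _expand_path() are defined,
# as the function above requires.
def _example_copy_to_local():
    import os
    if not os.path.isdir("downloads"):
        os.makedirs("downloads")
    # Localizes hdfs:///Projects/<project>/Resources/my_data into
    # $PDIR/downloads (or $CWD/downloads when $PDIR is unset).
    copy_to_local("Resources/my_data", "downloads", overwrite=True)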
def test_method(self):
    self.logger.info(('-' * 20 + " %s " + '-' * 20), self.test_name)
    self.setup()
    success = False
    try:
        self.logger.info("running %s program", self.test_name)
        self.run_program(self.make_hdfs_input_path(), self.make_hdfs_output_path())
        self.logger.info("now going to process output")
        self.logger.debug("hdfs.get(%s, %s)",
                          self.make_hdfs_output_path(), self.output_dir)
        hdfs.get(self.make_hdfs_output_path(), self.output_dir)
        self.process_output()
        success = True
    except Exception as e:
        self.logger.error("*" * 72)
        self.logger.error("Test %s raised an exception" % self.test_name)
        self.logger.error(e)
        self.logger.error("*" * 72)
    finally:
        self.logger.info("cleaning up")
        self.clean_up()
    # Close the test section with a horizontal line.
    self.logger.info('-' * (42 + len(self.test_name)))
    self.show_test_msg(success)
    return success
def get(self):
    src = self.hdfs_paths[0]
    dest = hdfs.path.split(self.local_paths[0])[-1]
    hdfs.dump(self.data, src)
    hdfs.get(src, dest)
    with open(dest) as fi:
        rdata = fi.read()
    self.assertEqual(rdata, self.data)
def get(self):
    src = self.hdfs_paths[0]
    dest = hdfs.path.split(self.local_paths[0])[-1]
    hdfs.dump(self.data, src, mode="wb")
    hdfs.get(src, dest, mode="wb")
    with open(dest, 'rb') as fi:
        rdata = fi.read()
    self.assertEqual(rdata, self.data)
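# A small, self-contained round-trip sketch in the spirit of the two test
# methods above (not part of the original tests). It only uses the pydoop
# calls that already appear in this file (hdfs.dump / hdfs.get with
# mode="wb"); the paths are hypothetical.
import pydoop.hdfs as hdfs

def _roundtrip_example():
    data = b"hello\x00world"                                         # arbitrary bytes
    hdfs.dump(data, "tmp_roundtrip.bin", mode="wb")                   # write to HDFS
    hdfs.get("tmp_roundtrip.bin", "/tmp/roundtrip.bin", mode="wb")    # localize
    with open("/tmp/roundtrip.bin", "rb") as fi:
        assert fi.read() == data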
def runSparkNumASesInROAs(sc, ip_type):
    roa_prefix_asn = "/hdfs-to-local-path/rpki/ripe/ripe-new-objects/roa-prefix-asn/*"
    savePath = "/hdfs-to-local-path/rpki/results/roas-covering-AScnt-%s" % ip_type
    localPath = "/home/tjchung/research/rpki/src/spark/results/roas-covering-AScnt-%s" % ip_type

    try:
        hdfs.rmr(savePath)
    except:
        pass

    tals = [
        "apnic", "apnic-iana", "apnic-afrinic", "apnic-arin", "apnic-lacnic",
        "apnic-ripe", "lacnic", "ripencc", "arin", "afrinic", "localcert"
    ]

    k = sc.textFile(roa_prefix_asn)\
        .filter(lambda line: "#" not in line)\
        .map(lambda line: line.rstrip().split("\t"))\
        .filter(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
                isIPv4v6(prefix_addr, ip_type))\
        .distinct()\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
             ((time, tal), asID))\
        .groupByKey()\
        .map(lambda ((time, tal), num_ases): (time, tal, len(set(num_ases))))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['date', 'tal', 'num_ASes'])

    grouped = df.rdd\
        .map(lambda row: (row.date, (row.tal, row.num_ASes)))\
        .groupByKey()

    def make_row(kv):
        k, vs = kv
        # time: [(-1, cnt), (0, cnt), (1, cnt)] ...
        tmp = dict(list(vs) + [("date", k)])
        return Row(**{k: tmp.get(k, 0) for k in ["date"] + tals})

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
        .map(lambda row: (row['date'], row["apnic"], row["apnic-iana"],
                          row["apnic-afrinic"], row["apnic-arin"],
                          row["apnic-lacnic"], row["apnic-ripe"], row["lacnic"],
                          row["ripencc"], row["arin"], row["afrinic"],
                          row["localcert"]))\
        .map(toTSV)

    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
def copy_from_hdfs_to_local(self, src_hdfs_location, dest_local_location="../output/"):
    if src_hdfs_location == "":
        print "No source specified"
        return False
    elif not self.handle.exists(src_hdfs_location):
        print "File does not exist"
        return False
    hdfs.get(src_hdfs_location, dest_local_location)
    return True
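# A standalone sketch of the same "check, then hdfs.get" pattern used by
# copy_from_hdfs_to_local above, written against plain pydoop instead of the
# original class's self.handle (not part of the original source; the default
# destination mirrors the method's "../output/").
import pydoop.hdfs as hdfs

def copy_if_exists(src_hdfs_location, dest_local_location="../output/"):
    if not src_hdfs_location:
        print "No source specified"
        return False
    if not hdfs.path.exists(src_hdfs_location):
        print "File does not exist"
        return False
    hdfs.get(src_hdfs_location, dest_local_location)
    return True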
def copy_to_local(hdfs_path, local_path, overwrite=False, project=None):
    """
    Copies a path from an HDFS project to the local filesystem. If there is not
    enough space on the local scratch directory, an exception is thrown.

    Raises:
        IOError if there is not enough space to localize the file/directory in
        HDFS to the scratch directory ($PDIR)

    Args:
        :hdfs_path: a full HDFS pathname or a relative one (relative to your
            project's path in HDFS).
        :local_path: the path on the local filesystem to copy to.
        :overwrite: a boolean flag, whether to overwrite if the path already
            exists locally.
        :project: name of the project, defaults to the current HDFS user's project.

    Returns:
        the full local pathname of the file/dir
    """
    import os
    import pydoop.hdfs.path as path

    if project is None:
        project = project_name()
    if "PDIR" in os.environ:
        full_local = os.environ['PDIR'] + '/' + local_path
    else:
        full_local = os.getcwd() + '/' + local_path

    project_hdfs_path = _expand_path(hdfs_path, project=project)
    sub_path = hdfs_path.find("hdfs:///Projects/" + project)
    rel_path = hdfs_path[sub_path + 1:]

    # Check that the file/directory fits in the free space on the local drive.
    stat = os.statvfs(full_local)
    free_space_bytes = stat.f_bsize * stat.f_bavail
    hdfs_size = path.getsize(project_hdfs_path)
    if hdfs_size > free_space_bytes:
        raise IOError(
            "Not enough local free space available on scratch directory: %s"
            % full_local)

    if overwrite:
        split = rel_path.split('/')
        filename = split[len(split) - 1]
        full_local_path = full_local + '/' + filename
        if os.path.isdir(full_local_path):
            shutil.rmtree(full_local_path)
        elif os.path.isfile(full_local_path):
            os.remove(full_local_path)

    hdfs.get(project_hdfs_path, full_local)
    return full_local
def runSparkNumPrefixWithMaxlen(sc, ip_type="ipv4"):
    roa_prefix_asn = "/hdfs-to-local-path/rpki/ripe/ripe-new-objects/roa-prefix-asn/*"
    localPath = "/home/tjchung/research/rpki/src/spark/results/roa-prefix-with-maxlength"
    savePath = "/hdfs-to-local-path/rpki/results/roa-prefix-with-maxlength"

    try:
        hdfs.rmr(savePath)
    except:
        pass

    k = sc.textFile(roa_prefix_asn)\
        .filter(lambda line: "#" not in line)\
        .map(lambda line: line.rstrip().split("\t"))\
        .filter(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
                isIPv4v6(prefix_addr, ip_type))\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
             (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal))\
        .distinct()\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
             ((time, str(int((prefix_len != maxlen) and maxlen != "None"))), 1))\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda ((time, hasMaxlen), cnt): (time, hasMaxlen, cnt))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['date', 'hasMaxlen', 'cnt'])

    grouped = df.rdd\
        .map(lambda row: (row.date, (row.hasMaxlen, row.cnt)))\
        .groupByKey()

    def make_row(kv):
        k, vs = kv
        tmp = dict(list(vs) + [("date", k)])
        # "1" means the ROA has a maxlen
        return Row(**{k: tmp.get(k, 0) for k in ["date", "0", "1"]})

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
        .map(lambda row: (row['date'], row['0'], row['1']))\
        .map(toTSV)

    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
def runSparkClassifyHijackingUniquePrefixDuration(sc, dataset, ip_type):
    readPath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-asn-%s/%s" % (
        ip_type, dataset)
    savePath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-classify-hijack-duration-%s/%s" % (
        ip_type, dataset)
    localPath = "/local-spark-result-path/research/rpki/src/spark/results/rpki-unique-prefix-classify-hijack-duration-%s/%s" % (
        ip_type, dataset)

    try:
        hdfs.rmr(savePath)
    except:
        pass

    k = sc.textFile(readPath)\
        .map(lambda v: parseVerifyLineUniquePrefix(v))\
        .filter(lambda v: v is not None)\
        .filter(lambda v: notDataError(dataset, v))\
        .filter(lambda v: isIPv4v6(v, ip_type))\
        .filter(lambda v: classifyBGPAdvSparse(v) == "rpki-invalid")\
        .filter(lambda v: ip_type == "ipv6" or not isLargerSlash24(v))\
        .filter(lambda v: onlyHijackAttempt(v))\
        .map(lambda v: ((classifyHijack(v), v['prefix_addr'], v['prefix_len'],
                         v['origin_as']), v['time']))\
        .groupByKey()\
        .map(lambda ((classifyHijack, prefix_addr, prefix_len, origin), list_of_time):
             (classifyHijack, prefix_addr, prefix_len, origin, len(set(list_of_time))))\
        .map(toTSV)\
        .saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
def runSparkClassifyHijackingUniquePrefixList(sc, dataset, ip_type):
    readPath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-asn-%s/%s" % (
        ip_type, dataset)
    savePath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-classify-hijack-list-%s/%s" % (
        ip_type, dataset)
    localPath = "/local-spark-result-path/research/rpki/src/spark/results/rpki-unique-prefix-classify-hijack-list-%s/%s" % (
        ip_type, dataset)

    try:
        hdfs.rmr(savePath)
    except:
        pass

    k = sc.textFile(readPath)\
        .map(lambda v: parseVerifyLineUniquePrefix(v))\
        .filter(lambda v: v is not None)\
        .filter(lambda v: notDataError(dataset, v))\
        .filter(lambda v: isIPv4v6(v, ip_type))\
        .filter(lambda v: classifyBGPAdvSparse(v) == "rpki-invalid")\
        .filter(lambda v: ip_type == "ipv6" or not isLargerSlash24(v))\
        .filter(lambda v: onlyHijackAttempt(v))\
        .map(lambda v: ((v['time'], classifyHijack(v)), json.dumps(v)))\
        .map(toTSV)\
        .saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
def test_method(self):
    """ "main" method """
    self.options = self.parser.parse_args()
    if self.options.debug:
        self.logger.setLevel(logging.DEBUG)
    self.logger.info(('-' * 20 + " %s " + '-' * 20), self.test_name)
    self.setup()
    self.logger.debug("setup complete")
    success = False
    try:
        self.logger.info("running %s program", self.test_name)
        hdfs_input = self.make_hdfs_input_path()
        hdfs_output = self.make_hdfs_output_path()
        self.logger.debug("hdfs input path: %s", hdfs_input)
        self.logger.debug("hdfs output path: %s", hdfs_output)
        self.run_program(hdfs_input, hdfs_output)
        self.logger.info("now going to process output")
        self.logger.debug("hdfs.get(%s, %s)",
                          self.make_hdfs_output_path(), self.output_dir)
        hdfs.get(self.make_hdfs_output_path(), self.output_dir)
        self.process_output()
        success = True
    except Exception as e:
        self.logger.error("*" * 72)
        self.logger.error("Test %s raised an exception" % self.test_name)
        self.logger.error(e)
        self.logger.error("*" * 72)
    finally:
        self.logger.info("cleaning up")
        self.clean_up()
    # Close the test section with a horizontal line.
    self.logger.info('-' * (42 + len(self.test_name)))
    self.show_test_msg(success)
    return success
def copy_to_local(hdfs_path, local_path="", overwrite=False, project=None): """ Copies a directory or file from a HDFS project to a local private scratch directory. If there is not enough space on the local scratch directory, an exception is thrown. If the local file exists, and the hdfs file and the local file are the same size in bytes, return 'ok' immediately. If the local directory tree exists, and the hdfs subdirectory and the local subdirectory have the same files and directories, return 'ok' immediately. For example, if you execute: >>> copy_to_local("Resources/my_data") This will copy the directory my_data from the Resources dataset in your project to the current working directory on the path ./my_data Raises: IOError if there is not enough space to localize the file/directory in HDFS to the scratch directory ($PDIR) Args: :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS). :local_path: the relative or full path to a directory on the local filesystem to copy to (relative to a scratch directory $PDIR), defaults to $CWD :overwrite: a boolean flag whether to overwrite if the path already exists in the local scratch directory. :project: name of the project, defaults to the current HDFS user's project Returns: the full local pathname of the file/dir """ if project == None: project = project_name() if local_path.startswith(os.getcwd()): local_dir = local_path else: local_dir = os.getcwd() + '/' + local_path if not os.path.isdir(local_dir): raise IOError("You need to supply the path to a local directory. This is not a local dir: %s" % local_dir) filename = path.basename(hdfs_path) full_local = local_dir + "/" + filename project_hdfs_path = _expand_path(hdfs_path, project=project) # Get the amount of free space on the local drive stat = os.statvfs(local_dir) free_space_bytes = stat.f_bsize * stat.f_bavail hdfs_size = path.getsize(project_hdfs_path) if os.path.isfile(full_local) and not overwrite: sz = os.path.getsize(full_local) if hdfs_size == sz: print("File " + project_hdfs_path + " is already localized, skipping download...") return full_local else: os.remove(full_local) if os.path.isdir(full_local) and not overwrite: try: localized = _is_same_directory(full_local, project_hdfs_path) if localized: print("Full directory subtree already on local disk and unchanged. Set overwrite=True to force download") return full_local else: shutil.rmtree(full_local) except Exception as e: print("Failed while checking directory structure to avoid re-downloading dataset, falling back to downloading") print(e) shutil.rmtree(full_local) if hdfs_size > free_space_bytes: raise IOError("Not enough local free space available on scratch directory: %s" % local_path) if overwrite: if os.path.isdir(full_local): shutil.rmtree(full_local) elif os.path.isfile(full_local): os.remove(full_local) print("Started copying " + project_hdfs_path + " to local disk on path " + local_dir + "\n") hdfs.get(project_hdfs_path, local_dir) print("Finished copying\n") return full_local
import happybase
import pydoop.hdfs as hdfs
import os
from bs4 import BeautifulSoup

conn = happybase.Connection('127.0.0.1')
crawlTbl = conn.table('crawls')

ANET = 125
for BNET in range(33, 40):
    for CNET in range(0, 255):
        for DNET in range(0, 255):
            FNAME = str(ANET) + "." + str(BNET) + "." + str(CNET) + "." + \
                str(DNET) + "_root20150114.htm"
            GETFILE = "crawls/" + str(ANET) + "/" + str(BNET) + "/" + FNAME
            print FNAME
            try:
                hdfs.get(GETFILE, FNAME)
            except IOError:
                print BNET, CNET, DNET
                continue
            soup = BeautifulSoup(open(FNAME, 'r'))
            for anchor in soup.find_all('a'):
                link = anchor.get('href')
            for key, data in crawlTbl.rows([FNAME]):
                print key, data
                break
            # FNAME is a string, not a file handle, so remove the local copy
            # instead of calling close() on it.
            os.remove(FNAME)
def index():
    n = 'NongNghiep.mp4'
    d = str(uuid4())
    hdfs.get('/video/{}'.format(n), 'static/{}/{}'.format(d, n))
    return render_template('index.html', video='static/{}/{}'.format(d, n))
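# Minimal Flask wiring sketch for the index() view above (assumed, not from
# the original source): the snippet omits its imports and route decorator, so
# this shows one plausible way it could be hooked up. The '/' route and the
# local directory handling are assumptions.
import os
from uuid import uuid4
from flask import Flask, render_template
import pydoop.hdfs as hdfs

app = Flask(__name__)

@app.route('/')
def index():
    n = 'NongNghiep.mp4'
    d = str(uuid4())
    local_dir = 'static/{}'.format(d)
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)  # ensure the per-request directory exists locally
    # Copy the video out of HDFS into the per-request static subdirectory.
    hdfs.get('/video/{}'.format(n), os.path.join(local_dir, n))
    return render_template('index.html', video=os.path.join(local_dir, n))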
def copy_to_local(hdfs_path, local_path, overwrite=False, project=None):
    """
    Copies a directory or file from an HDFS project to a local private scratch
    directory. If there is not enough space on the local scratch directory, an
    exception is thrown.

    If the local file exists, and the HDFS file and the local file are the same
    size in bytes, return 'ok' immediately.

    If the local directory tree exists, and the HDFS subdirectory and the local
    subdirectory have the same files and directories, and the files are the same
    size in bytes, return 'ok' immediately.

    Raises:
        IOError if there is not enough space to localize the file/directory in
        HDFS to the scratch directory ($PDIR)

    Args:
        :hdfs_path: a full HDFS pathname or a relative one (relative to your
            project's path in HDFS).
        :local_path: the relative or full path to a directory on the local
            filesystem to copy to (relative to a scratch directory $PDIR).
        :overwrite: a boolean flag, whether to overwrite if the path already
            exists in the local scratch directory.
        :project: name of the project, defaults to the current HDFS user's project.

    Returns:
        the full local pathname of the file/dir
    """
    if project is None:
        project = project_name()

    if "PDIR" in os.environ:
        local_dir = os.environ['PDIR'] + '/' + local_path
    else:
        local_dir = os.getcwd() + '/' + local_path
    if not os.path.isdir(local_dir):
        raise IOError(
            "You need to supply the path to a local directory. "
            "This is not a local dir: %s" % local_dir)

    filename = path.basename(hdfs_path)
    full_local = local_dir + "/" + filename
    project_hdfs_path = _expand_path(hdfs_path, project=project)
    sub_path = hdfs_path.find("hdfs:///Projects/" + project)
    rel_path = hdfs_path[sub_path + 1:]

    # Get the amount of free space on the local drive.
    stat = os.statvfs(local_dir)
    free_space_bytes = stat.f_bsize * stat.f_bavail
    hdfs_size = path.getsize(project_hdfs_path)

    if os.path.isfile(full_local) and not overwrite:
        sz = os.path.getsize(full_local)
        if hdfs_size == sz:
            return full_local

    if os.path.isdir(full_local) and not overwrite:
        if FsTree().check(full_local, project_hdfs_path):
            print("Full directory subtree already on local disk and unchanged.")
            return full_local

    if hdfs_size > free_space_bytes:
        raise IOError(
            "Not enough local free space available on scratch directory: %s"
            % local_dir)

    if overwrite:
        if os.path.isdir(full_local):
            shutil.rmtree(full_local)
        elif os.path.isfile(full_local):
            os.remove(full_local)

    hdfs.get(project_hdfs_path, local_dir)
    return full_local
import happybase
import pydoop.hdfs as hdfs
import os
from bs4 import BeautifulSoup

conn = happybase.Connection('127.0.0.1')
crawlTbl = conn.table('crawls')

ANET = 125
for BNET in range(33, 40):
    for CNET in range(0, 255):
        for DNET in range(0, 255):
            FNAME = str(ANET) + "." + str(BNET) + "." + str(CNET) + "." + str(
                DNET) + "_root20150114.htm"
            GETFILE = "crawls/" + str(ANET) + "/" + str(BNET) + "/" + FNAME
            print FNAME
            try:
                hdfs.get(GETFILE, FNAME)
            except IOError:
                print BNET, CNET, DNET
                continue
            soup = BeautifulSoup(open(FNAME, 'r'))
            for anchor in soup.find_all('a'):
                link = anchor.get('href')
            for key, data in crawlTbl.rows([FNAME]):
                print key, data
                break
            # FNAME is a string, not a file handle; just delete the local copy.
            os.remove(FNAME)
def fetch(savePath, dstPath, saveDir):
    hdfs.get(savePath, os.path.join(dstPath, saveDir))
    mergeAndSort(os.path.join(dstPath, saveDir))
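# Hedged usage sketch for the fetch() helper above (not part of the original
# source). The HDFS and local paths are hypothetical; dstPath is assumed to
# already exist locally, and mergeAndSort is the same post-download helper the
# other snippets in this file call after hdfs.get.
def _example_fetch():
    fetch("/hdfs-to-local-path/rpki/results/num-roas",  # HDFS results directory
          "/tmp/rpki-results",                          # existing local destination root
          "num-roas")                                   # subdirectory name for the copy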
def runSparkROAsIPPercentage(sc, year, ip_type):
    # only supports v4
    def getNumIPs(list_of_ips, ip_type):
        s = []
        for (prefix_addr, prefix_len) in list_of_ips:
            if ip_type == "ipv4":
                prefix = IPv4Network("%s/%s" % (prefix_addr, prefix_len))
            else:
                prefix = IPv6Network("%s/%s" % (prefix_addr, prefix_len))
            s.append(prefix)
        return str(sum(map(lambda v: v.num_addresses, collapse_addresses(s))))

    tals = ["apnic", "lacnic", "ripencc", "arin", "afrinic"]

    nro_stats = "/hdfs-to-local-path/rpki/nrostats-withdate/nrostats-%s*-v4.csv" % year
    roa_prefix_asn = "/hdfs-to-local-path/rpki/ripe/ripe-new-objects/roa-prefix-asn/%s/*" % year
    savePath = "/hdfs-to-local-path/rpki/results/roas-covering-IPcnt-%s/%s" % (
        ip_type, year)
    localPath = "/home/tjchung/research/rpki/src/spark/results/roas-covering-IPcnt-%s/%s" % (
        ip_type, year)

    try:
        hdfs.rmr(savePath)
    except:
        pass

    a = sc.textFile(nro_stats)\
        .map(lambda v: parseNRO(v, ip_type))\
        .filter(lambda v: v is not None)\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda ((time, rir), num_ips): ((time, rir), str(num_ips)))

    k = sc.textFile(roa_prefix_asn)\
        .filter(lambda line: "#" not in line)\
        .map(lambda line: line.rstrip().split("\t"))\
        .filter(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
                isIPv4v6(prefix_addr, ip_type) and tal != "localcert")\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
             (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal))\
        .distinct()\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
             ((time, tal), (prefix_addr, prefix_len)))\
        .groupByKey()\
        .map(lambda ((time, tal), list_ip_prefixes):
             ((time, tal), getNumIPs(list_ip_prefixes, ip_type)))\
        .join(a)\
        .map(lambda ((time, tal), (num_rpki_ips, all_ips)):
             (time, tal, "%s\t%s" % (num_rpki_ips, all_ips)))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['date', 'tal', 'num_ips'])

    grouped = df.rdd\
        .map(lambda row: (row.date, (row.tal, row.num_ips)))\
        .groupByKey()

    def make_row(kv):
        k, vs = kv
        # time: [(-1, cnt), (0, cnt), (1, cnt)] ...
        tmp = dict(list(vs) + [("date", k)])
        return Row(**{k: tmp.get(k, "0\t0") for k in ["date"] + tals})

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
        .map(lambda row: (row['date'], row["apnic"], row["lacnic"],
                          row["ripencc"], row["arin"], row["afrinic"]))\
        .map(toTSV)

    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
def runSparkPercentageASesInROAs(sc, ip_type):
    caida_as_org_days = [
        '20110420', '20110701', '20111003', '20120105', '20120401', '20120629',
        '20121002', '20130101', '20130401', '20130701', '20131001', '20140401',
        '20140701', '20141001', '20150101', '20150701', '20151001', '20160101',
        '20160401', '20160701', '20161001', '20170101', '20170401', '20170701',
        '20171001', '20180101', '20180401', '20180703', '20181001', '20190101'
    ]

    d = []
    for year in range(2011, 2020):
        d += createDates(year)

    caida_as_org_days = [d[0]] + caida_as_org_days + [d[-1]]
    as_org_days = dict.fromkeys(d, 0)
    min_date = datetime.strptime(d[0], "%Y%m%d")
    max_date = datetime.strptime(d[-1], "%Y%m%d")
    set_days_scope(caida_as_org_days, as_org_days, min_date, max_date)

    roa_prefix_asn = "/hdfs-to-local-path/rpki/ripe/ripe-new-objects/roa-prefix-asn/*"
    savePath = "/hdfs-to-local-path/rpki/results/roas-covering-AScnt-%s" % ip_type
    localPath = "/home/tjchung/research/rpki/src/spark/results/roas-covering-AScnt-%s" % ip_type

    try:
        hdfs.rmr(savePath)
    except:
        pass

    tals = ["apnic", "lacnic", "ripencc", "arin", "afrinic"]

    a = runSparkGetTotalASNs(sc)

    k = sc.textFile(roa_prefix_asn)\
        .filter(lambda line: "#" not in line)\
        .map(lambda line: line.rstrip().split("\t"))\
        .filter(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
                isIPv4v6(prefix_addr, ip_type) and tal != "localcert")\
        .map(lambda (time, prefix_addr, prefix_len, maxlen, asID, num_ips, cc, tal):
             ((time, tal.split("-")[0]), asID))\
        .distinct()\
        .groupByKey()\
        .map(lambda ((time, tal), num_ases):
             ((as_org_days[time], tal), (time, len(set(num_ases)))))\
        .join(a)\
        .map(lambda ((time, tal), ((real_time, num_activated_asns), all_asns)):
             (real_time, tal, "%s\t%s" % (num_activated_asns, all_asns)))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['date', 'tal', 'num_ASes'])

    grouped = df.rdd\
        .map(lambda row: (row.date, (row.tal, row.num_ASes)))\
        .groupByKey()

    def make_row(kv):
        k, vs = kv
        tmp = dict(list(vs) + [("date", k)])
        return Row(**{k: tmp.get(k, "0\t0") for k in ["date"] + tals})

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
        .map(lambda row: (row['date'], row["apnic"], row["lacnic"],
                          row["ripencc"], row["arin"], row["afrinic"]))\
        .map(toTSV)

    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath, label="\t".join(
        ["#apnic", "lacnic", "ripencc", "arin", "afrinic"]))
def runSparkNumROAs(sc):
    def parse(line):
        time_tal, filename, _, skid, akid, ee, roa = line.rstrip().split(",")
        time = time_tal[:8]
        tal = time_tal[9:-4]
        # time, tal = time_tal.replace(".txt", "").split("-")
        return (time, tal, filename)

    roas = "/hdfs-to-local-path/rpki/ripe/ripe-new-objects/roas/*"
    savePath = "/hdfs-to-local-path/rpki/results/num-roas"
    localPath = "/home/tjchung/research/rpki/src/spark/results/num-roas"

    try:
        hdfs.rmr(savePath)
    except:
        pass

    tals = [
        "apnic", "apnic-iana", "apnic-afrinic", "apnic-arin", "apnic-lacnic",
        "apnic-ripe", "lacnic", "ripencc", "arin", "afrinic", "localcert"
    ]

    k = sc.textFile(roas)\
        .map(parse)\
        .distinct()\
        .map(lambda (time, tal, filename): ((time, tal), 1))\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda ((time, tal), num_roas): (time, tal, num_roas))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['date', 'tal', 'num_roas'])

    grouped = df.rdd\
        .map(lambda row: (row.date, (row.tal, row.num_roas)))\
        .groupByKey()

    def make_row(kv):
        k, vs = kv
        # time: [(-1, cnt), (0, cnt), (1, cnt)] ...
        tmp = dict(list(vs) + [("date", k)])
        return Row(**{k: tmp.get(k, 0) for k in ["date"] + tals})

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
        .map(lambda row: (row['date'], row["apnic"], row["apnic-iana"],
                          row["apnic-afrinic"], row["apnic-arin"],
                          row["apnic-lacnic"], row["apnic-ripe"], row["lacnic"],
                          row["ripencc"], row["arin"], row["afrinic"],
                          row["localcert"]))\
        .map(toTSV)

    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
def runSparkCalcRPKIEnabledAdv(sc, dataset, ip_type, year):
    """
    Calculates the *number* of BGP announcements that
      (1) can't be verified against RPKI,
      (2) can be verified against RPKI and are invalid,
      (3) can be verified against RPKI and are valid.

    Note: BGP announcements are distinct based on the three-tuple
    (peer_ip, prefix_addr, as_path) on a given date.
    """
    print "runSparkCalcRPKIEnabledAdv", dataset, year

    readPath = "/spark-hdfs-path/rpki/results/bgp-verify-nometa/%s/%s*" % (
        dataset, year)
    savePath = "/spark-hdfs-path/rpki/results/rpki-enabled-adv-%s/%s/%s" % (
        ip_type, dataset, year)
    localPath = "/local-spark-result-path/research/rpki/src/spark/results/rpki-enabled-adv-%s/%s/%s" % (
        ip_type, dataset, year)

    try:
        hdfs.rmr(savePath)
    except:
        pass

    isJson = False
    if dataset == "akamai-public-prefix":
        isJson = True
    hasMeta = False

    k = sc.textFile(readPath)\
        .map(lambda v: parseVerifyline(v, hasMeta, isJson))\
        .filter(lambda v: v is not None)\
        .filter(lambda v: isIPv4v6(v, ip_type))\
        .map(lambda j: ((j['time'], classifyBGPAdvSparse(j)), 1))\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda ((time, rpki_type), cnt): (time, rpki_type, cnt))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['timestamp', 'rpkiType', 'cnt'])

    grouped = df.rdd\
        .map(lambda row: (row.timestamp, (row.rpkiType, row.cnt)))\
        .groupByKey()

    def make_row(kv):
        k, vs = kv
        # time: [(-1, cnt), (0, cnt), (1, cnt)] ...
        tmp = dict(list(vs) + [("timestamp", k)])
        return Row(**{
            k: tmp.get(k, 0)
            for k in ["timestamp", "non-rpki", "rpki-invalid", "rpki-valid"]
        })

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
        .map(lambda row: (row['timestamp'], row['non-rpki'],
                          row['rpki-invalid'], row['rpki-valid']))\
        .map(toTSV)

    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
def runSparkValidationUniquePrefix(sc, dataset, ip_type):
    readPath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-asn-%s/%s" % (
        ip_type, dataset)
    savePath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-asn-adv-%s/%s" % (
        ip_type, dataset)
    localPath = "/local-spark-result-path/research/rpki/src/spark/results/rpki-enabled-unique-prefix-asn-adv-%s/%s" % (
        ip_type, dataset)

    try:
        hdfs.rmr(savePath)
    except:
        pass

    k = sc.textFile(readPath)\
        .map(lambda v: parseVerifyLineUniquePrefix(v))\
        .filter(lambda v: v is not None)\
        .filter(lambda v: notDataError(dataset, v))\
        .filter(lambda v: isIPv4v6(v, ip_type))\
        .filter(lambda v: ip_type == "ipv6" or not isLargerSlash24(v))\
        .map(lambda j: ((j['time'], classifyBGPAdvSparse(j)), 1))\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda ((time, rpki_type), cnt): (time, rpki_type, cnt))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['timestamp', 'rpkiType', 'cnt'])

    grouped = df.rdd\
        .map(lambda row: (row.timestamp, (row.rpkiType, row.cnt)))\
        .groupByKey()

    def make_row(kv):
        k, vs = kv
        # time: [(-1, cnt), (0, cnt), (1, cnt)] ...
        tmp = dict(list(vs) + [("timestamp", k)])
        return Row(**{
            k: tmp.get(k, 0)
            for k in ["timestamp", "non-rpki", "rpki-invalid", "rpki-valid"]
        })

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
        .map(lambda row: (row['timestamp'], row['non-rpki'],
                          row['rpki-invalid'], row['rpki-valid']))\
        .map(toTSV)

    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)
# Assumes df (a DataFrame with a 'label' column and x1..x48 / y1..y48 landmark
# columns) was loaded earlier in the script.
df_y = df['label'] == 3
df_X = df[['x' + str(i) for i in range(1, 49)] + ['y' + str(j) for j in range(1, 49)]]
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.2, random_state=15)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Localize the dlib landmark model and a test image from HDFS.
detector = dlib.get_frontal_face_detector()
hdfs.get("/drunkdetection/shape_predictor_68_face_landmarks.dat",
         "tmp/shape_predictor_68_face_landmarks.dat")
predictor = dlib.shape_predictor("tmp/shape_predictor_68_face_landmarks.dat")
fa = FaceAligner(predictor, desiredFaceWidth=300)

hdfs.get("/drunkdetection/drunk3.jpg", "/tmp/drunk3.jpg")
img = cv2.imread("/tmp/drunk3.jpg")
print(img)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
faces = detector(gray, 1)

dic = {}
x_values = [[] for _ in range(48)]
y_values = [[] for _ in range(48)]
for face in faces:
    (x, y, w, h) = rect_to_bb(face)
    faceOrig = imutils.resize(img[y:y + h, x:x + w], width=300)
import sys
import pydoop.hdfs as hdfs

# create package
# date = 'data/' + str(sys.argv[1])[2:] + '/'
st = '['
for x in hdfs.ls("data/18-02-21/"):
    # for x in hdfs.ls("date"):
    st = st + hdfs.load(x)
st = st.replace("\n", ",")
st = st[:-1]
st = st + ']'
# string4 = '{"input":"' + str(sys.argv[1]) + '"}'
hdfs.dump(st, "test/hello.txt")
hdfs.get("test/hello.txt", "/tmp/tmp.txt")
def runSparkClassifyHijackingUniquePrefix(sc, dataset, ip_type):
    readPath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-asn-%s/%s" % (
        ip_type, dataset)
    savePath = "/spark-hdfs-path/rpki/results/rpki-enabled-unique-prefix-classify-hijack-%s/%s" % (
        ip_type, dataset)
    localPath = "/local-spark-result-path/research/rpki/src/spark/results/rpki-unique-prefix-classify-hijack-%s/%s" % (
        ip_type, dataset)

    try:
        hdfs.rmr(savePath)
    except:
        pass

    k = sc.textFile(readPath)\
        .map(lambda v: parseVerifyLineUniquePrefix(v))\
        .filter(lambda v: v is not None)\
        .filter(lambda v: notDataError(dataset, v))\
        .filter(lambda v: isIPv4v6(v, ip_type))\
        .filter(lambda v: classifyBGPAdvSparse(v) == "rpki-invalid")\
        .filter(lambda v: ip_type == "ipv6" or not isLargerSlash24(v))\
        .filter(lambda v: onlyHijackAttempt(v))\
        .map(lambda v: ((v['time'], classifyHijack(v)), 1))\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda ((time, status), cnt): (time, status, cnt))

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(k, ['timestamp', 'hijackType', 'cnt'])

    grouped = df.rdd\
        .map(lambda row: (row.timestamp, (row.hijackType, row.cnt)))\
        .groupByKey()

    def make_row(kv):
        k, vs = kv
        # time: [(-1, cnt), (0, cnt), (1, cnt)] ...
        tmp = dict(list(vs) + [("timestamp", k)])
        return Row(**{
            k: tmp.get(k, 0)
            for k in [
                "timestamp", "sameISP", "provider", "customer", "peer",
                "DDoS", "None"
            ]
        })

    reshaped = sqlContext.createDataFrame(grouped.map(make_row))

    k = reshaped.rdd\
        .map(lambda row: (row['timestamp'], row['sameISP'], row['provider'],
                          row['customer'], row['peer'], row['DDoS'],
                          row['None']))\
        .map(toTSV)

    k.saveAsTextFile(savePath)

    try:
        shutil.rmtree(localPath)
    except:
        pass
    try:
        os.makedirs(localPath)
    except:
        pass

    hdfs.get(savePath, localPath)
    mergeAndSort(localPath)