# Example #1
from dip.util import timetool
import sys
import random

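# Python 2 idiom: site.py removes sys.setdefaultencoding at interpreter
# startup, so sys is reloaded to restore it and force UTF-8 for
# implicit str/unicode conversions.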
reload(sys)
sys.setdefaultencoding("utf-8")

import re
import json
import time

from pyspark import SparkConf
from dip.spark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType

conf = SparkConf().setAppName(
    "app_picserversweibof6vwt_wapvideodownload_to_hdfs")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

try:
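    # Read the day's raw log. timetool.getHDFSDayDir presumably maps the
    # date argument to a day-partitioned HDFS directory name (inferred
    # from the helper's name; dip.util is an in-house library).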
    source = sc.textFile(
        "/user/hdfs/rawlog/app_picserversweibof6vwt_wapvideodownload/" + timetool.getHDFSDayDir(sys.argv[1]))

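    # Capture the first two backtick-delimited fields of each log line.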
    pattern = re.compile("^([^`]*)`([^`]*)")

    def lineParse(line):
        matcher = pattern.match(line)

        if not matcher:
            return None

        # Return the two captured fields for lines that match.
        return matcher.groups()
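
    # Hypothetical continuation: the source snippet is cut off above, so
    # this is only a minimal sketch that parses each line, drops records
    # the pattern rejects, and closes the try block.
    parsed = source.map(lineParse).filter(lambda fields: fields is not None)
    print parsed.count()
finally:
    sc.stop()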

# Example #2

from pyspark import SparkConf
from dip.spark import SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("spark_parse")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)


def printRows(rows):
    for row in rows:
        print row

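# extract_text_to_arr is an extension method on the in-house dip.spark
# SparkContext, not standard PySpark. Judging by its arguments, it
# splits each line on the given delimiter, casts the resulting fields
# to the listed types, and keeps only rows accepted by the trailing
# predicate; this is inferred from the call site, not from docs.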
rows = sc.extract_text_to_arr("hdfs://dip.cdh5.dev:8020/user/yurun/text",
                              "delimiter", " ",
                              [str, int, str, str, str],
                              lambda words: words[0] == "1").collect()

printRows(rows)

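# The same extraction driven by a regex instead of a delimiter, chained
# through two more dip.spark extension methods: transform_arr appears
# to rewrite and re-type each row, and load_arr_to_table to register
# the result as the Hive table queried below (again inferred from the
# call site).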
(sc.extract_text_to_arr("hdfs://dip.cdh5.dev:8020/user/yurun/text",
                        "regex", "(.*) (.*) (.*) (.*) (.*)",
                        filter=lambda words: True)
 .transform_arr(lambda words: [words[0].upper()], [int],
                lambda words: words[0] == 1)
 .load_arr_to_table(hc, "temp_table", [("first", int, False)]))

rows = hc.sql("select * from temp_table").collect()

printRows(rows)

sc.stop()
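
# Example #3
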
from pyspark import SparkConf
from dip.spark import SparkContext

conf = SparkConf().setAppName("spark_textFiles_test")

sc = SparkContext(conf=conf)

dirs = ["hdfs://dip.cdh5.dev:8020/user/yurun/dir1",
        "hdfs://dip.cdh5.dev:8020/user/yurun/dir2"]


def printLines(lines):
    if lines:
        for line in lines:
            print line

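# textFiles (plural) is another dip.spark extension: unlike the
# standard SparkContext.textFile, it takes a list of paths, presumably
# reading them all into one RDD (inferred from this usage).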
lines = sc.textFiles(dirs).collect()

printLines(lines)

sc.stop()