Code example #1
File: test_hdfs.py Project: ZEMUSHKA/pydoop
 def open(self):
   for test_path in self.hdfs_paths[0], self.local_paths[0]:
     with hdfs.open(test_path, "w") as f:
       f.write(self.data)
     f.fs.close()
     with hdfs.open(test_path) as f:
       self.assertEqual(f.read(), self.data)
     f.fs.close()
Code example #2
def xml_from_hdfs(url):
    with hdfs.open(url, "r") as f:
        lines = f.read().strip().split('\n')
        docs, doc = [], None
        for line in lines:
            if line.startswith('<doc'):
                doc = line
            elif line.startswith('</doc>'):
                docs.append(doc + line)
            else:
                #line = line.replace('&', '').replace('"', "'")
                doc += line.replace('"', "'")

        for doc in docs:
            dom = bs(doc).find('doc')
            doc = {}
            try:
                doc['id'] = dom.attrs['id']
                doc['url'] = dom.attrs['url']
                doc['title'] = dom.attrs['title']
            except AttributeError, e:
                continue
            doc['content'] = dom.text
            doc['md5'] = hashlib.md5(str(doc)).hexdigest()
            yield doc
Code example #3
File: test_hdfs.py Project: ZEMUSHKA/pydoop
 def dump(self):
   for test_path in self.hdfs_paths[0], self.local_paths[0]:
     hdfs.dump(self.data, test_path)
     with hdfs.open(test_path) as fi:
       rdata = fi.read()
     fi.fs.close()
     self.assertEqual(rdata, self.data)
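A minimal sketch of the same dump-and-read-back round trip using pydoop.hdfs's module-level helpers, outside a test class; the path is hypothetical, and hdfs.load / hdfs.rmr are used here as the read and cleanup counterparts of dump:

import pydoop.hdfs as hdfs

path = "/tmp/pydoop_dump_demo.txt"       # hypothetical path
hdfs.dump(b"hello from pydoop\n", path)  # write the whole payload in one call
print(hdfs.load(path))                   # read it back in one call
hdfs.rmr(path)                           # clean up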
Code example #4
File: avrolib.py Project: CynthiaYiqingHuang/pydoop
 def __init__(self, ctx):
     super(AvroReader, self).__init__(ctx)
     isplit = ctx.input_split
     self.region_start = isplit.offset
     self.region_end = isplit.offset + isplit.length
     self.reader = SeekableDataFileReader(hdfs.open(isplit.filename),
                                          DatumReader())
     self.reader.align_after(isplit.offset)
Code example #5
File: avrolib.py Project: CynthiaYiqingHuang/pydoop
 def __init__(self, context):
     super(AvroWriter, self).__init__(context)
     job_conf = context.job_conf
     part = int(job_conf['mapreduce.task.partition'])
     outdir = job_conf["mapreduce.task.output.dir"]
     outfn = "%s/part-r-%05d.avro" % (outdir, part)
     wh = hdfs.open(outfn, "w")
     self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
Code example #6
 def map(self, ctx):
     p = BioImgPlane(ctx.value)
     pixels = p.get_xy()
     bn = '%s-z%04d-c%04d-t%04d.npy' % (p.name, p.z, p.c, p.t)
     fn = hdfs.path.join(self.out_dir, p.name, bn)
     with hdfs.open(fn, 'w') as fo:
         np.save(fo, pixels)
     ctx.emit(fn, '%s\t%s' % (p.dimension_order, pixels.shape))
Code example #7
File: test_hdfs.py Project: ZEMUSHKA/pydoop
 def put(self):
   src = hdfs.path.split(self.local_paths[0])[-1]
   dest = self.hdfs_paths[0]
   with open(src, "w") as f:
     f.write(self.data)
   hdfs.put(src, dest)
   with hdfs.open(dest) as fi:
     rdata = fi.read()
   self.assertEqual(rdata, self.data)
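The opposite direction, copying a file from HDFS back to the local filesystem, follows the same pattern; a minimal sketch assuming hdfs.get as the counterpart of hdfs.put (both paths are placeholders):

import pydoop.hdfs as hdfs

hdfs_src = "/user/someone/results.txt"   # placeholder HDFS path
local_dest = "results_local.txt"         # placeholder local path
hdfs.get(hdfs_src, local_dest)           # copy from HDFS to the local filesystem
with open(local_dest, "rb") as f:
    print(f.read())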
Code example #8
File: wordcount-rr.py Project: ilveroluca/pydoop
 def __init__(self, context):
   super(Reader, self).__init__()
   self.isplit = pp.InputSplit(context.getInputSplit())
   self.file = hdfs.open(self.isplit.filename)
   self.file.seek(self.isplit.offset)
   self.bytes_read = 0
   if self.isplit.offset > 0:
     discarded = self.file.readline()  # read by reader of previous split
     self.bytes_read += len(discarded)
Code example #9
File: map_only_python_writer.py Project: crs4/pydoop
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     outfn = context.get_default_work_file()
     self.logger.info("writing to %s", outfn)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t")
     self.file = hdfs.open(outfn, "wt", user=hdfs_user)
Code example #10
File: hdfs2mongo.py Project: legendlee1314/ooni
def json_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    for fi in file_lists:
        with hdfs.open(fi, "r") as f:
            items = f.read().strip().split('\n')
            for it in items:
                it = loads(it)
                it['md5'] = hashlib.md5(str(it)).hexdigest()
                yield it
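Given the hdfs2mongo.py file name, a hedged sketch of how a generator like json_from_hdfs might feed documents into MongoDB with pymongo; the connection parameters, database and collection names, and the input directory are all assumptions:

from pymongo import MongoClient

client = MongoClient("localhost", 27017)         # assumed MongoDB endpoint
collection = client["ooni"]["reports"]           # hypothetical database/collection

for doc in json_from_hdfs("/datasets/reports"):  # hypothetical HDFS directory
    collection.insert_one(doc)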
Code example #11
File: wordcount-full.py Project: ilveroluca/pydoop
 def __init__(self, context):
   super(Writer, self).__init__(context)
   self.logger = logging.getLogger("Writer")
   jc = context.getJobConf()
   jc_configure_int(self, jc, "mapred.task.partition", "part")
   jc_configure(self, jc, "mapred.work.output.dir", "outdir")
   jc_configure(self, jc, "mapred.textoutputformat.separator", "sep", "\t")
   jc_configure(self, jc, "pydoop.hdfs.user", "hdfs_user", None)
   self.outfn = "%s/part-%05d" % (self.outdir, self.part)
   self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
Code example #12
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     part = jc.get_int("mapred.task.partition")
     out_dir = jc["mapred.work.output.dir"]
     outfn = "%s/part-%05d" % (out_dir, part)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.file = hdfs.open(outfn, "w", user=hdfs_user)
     self.sep = jc.get("mapred.textoutputformat.separator", "\t")
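The Writer snippets here only show __init__; below is a hedged sketch of the emit and close methods that typically complete such a record writer, written against pydoop.mapreduce.api. The TextWriter class name is hypothetical, and "wt" mode is used so plain strings can be written:

import pydoop.hdfs as hdfs
import pydoop.mapreduce.api as api

class TextWriter(api.RecordWriter):
    def __init__(self, context):
        super(TextWriter, self).__init__(context)
        jc = context.job_conf
        part = jc.get_int("mapred.task.partition")
        out_dir = jc["mapred.work.output.dir"]
        self.sep = jc.get("mapred.textoutputformat.separator", "\t")
        self.file = hdfs.open("%s/part-%05d" % (out_dir, part), "wt")

    def emit(self, key, value):
        # one record per line: key<separator>value
        self.file.write("%s%s%s\n" % (key, self.sep, value))

    def close(self):
        # close both the file and the filesystem handle, as in the examples above
        self.file.close()
        self.file.fs.close()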
Code example #13
File: pterasort.py Project: elzaggo/pydoop
 def _choose_break_points(cls, args):
     n_records, n_breakpoints, path = args
     block_size = n_records * RECORD_LENGTH
     with hdfs.open(path, 'r') as f:
         data = f.read(block_size)
     assert len(data) == block_size
     step = max(n_records // n_breakpoints, 1)
     keys = sorted([data[k:k + KEY_LENGTH]
                    for k in range(0, block_size, RECORD_LENGTH)])
     return [_ for _ in it.islice(keys, step, n_records, step)]
Code example #14
File: ioformats.py Project: elzaggo/pydoop
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     part = jc.get_int("mapred.task.partition")
     out_dir = jc["mapred.work.output.dir"]
     self.logger.debug("part: %d", part)
     self.logger.debug("outdir: %s", out_dir)
     outfn = "%s/part-%05d" % (out_dir, part)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.file = hdfs.open(outfn, "wb", user=hdfs_user)
Code example #15
File: kafka-producer.py Project: bunop/ccc-capstone
def processLine(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        for i, line in enumerate(handle):
            #strip line
            line = line.strip()
            
            #Submit data (my function)
            submitLine(topic, line, trials=3)
            
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" %(i, myfile["name"]))
Code example #16
File: avrolib.py Project: wtj/pydoop
 def __init__(self, context):
     super(AvroWriter, self).__init__(context)
     self.logger = LOGGER.getChild('AvroWriter')
     job_conf = context.job_conf
     part = int(job_conf['mapreduce.task.partition'])
     outdir = job_conf["mapreduce.task.output.dir"]
     outfn = "%s/part-r-%05d.avro" % (outdir, part)
     wh = hdfs.open(outfn, "w")
     self.logger.debug('created hdfs file %s', outfn)
     self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
     self.logger.debug('opened AvroWriter')
Code example #17
File: wordcount-full.py Project: ilveroluca/pydoop
 def __init__(self, context):
   super(Reader, self).__init__()
   self.logger = logging.getLogger("Reader")
   self.isplit = pp.InputSplit(context.getInputSplit())
   for a in "filename", "offset", "length":
     self.logger.debug("isplit.%s = %r" % (a, getattr(self.isplit, a)))
   self.file = hdfs.open(self.isplit.filename)
   self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
   self.file.seek(self.isplit.offset)
   self.bytes_read = 0
   if self.isplit.offset > 0:
     discarded = self.file.readline()  # read by reader of previous split
     self.bytes_read += len(discarded)
Code example #18
File: ioformats.py Project: elzaggo/pydoop
 def __init__(self, context):
     super(Reader, self).__init__(context)
     self.logger = LOGGER.getChild("Reader")
     self.logger.debug('started')
     self.isplit = context.input_split
     for a in "filename", "offset", "length":
         self.logger.debug(
             "isplit.{} = {}".format(a, getattr(self.isplit, a))
         )
     remainder = self.isplit.offset % RECORD_LENGTH
     self.bytes_read = 0 if remainder == 0 else RECORD_LENGTH - remainder
     self.file = hdfs.open(self.isplit.filename)
     self.file.seek(self.isplit.offset + self.bytes_read)
Code example #19
File: features.py Project: manics/pydoop-features
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
Code example #20
File: hadut.py Project: crs4/pydoop
def collect_output(mr_out_dir, out_file=None):
    """
    Return all mapreduce output in ``mr_out_dir``.

    Append the output to ``out_file`` if provided.  Otherwise, return
    the result as a single string (it is the caller's responsibility to
    ensure that the amount of data retrieved fits into memory).
    """
    if out_file is None:
        output = []
        for fn in iter_mr_out_files(mr_out_dir):
            with hdfs.open(fn, "rt") as f:
                output.append(f.read())
        return "".join(output)
    else:
        block_size = 16777216
        with open(out_file, 'a') as o:
            for fn in iter_mr_out_files(mr_out_dir):
                with hdfs.open(fn) as f:
                    data = f.read(block_size)
                    while len(data) > 0:
                        o.write(data)
                        data = f.read(block_size)
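A minimal usage sketch for the helper above (pydoop.hadut is the module the snippet comes from; the directory and file names are placeholders):

import pydoop.hadut as hadut

mr_out_dir = "/user/someone/wc_output"           # placeholder MapReduce output dir
text = hadut.collect_output(mr_out_dir)          # small job: pull everything into memory
print(text.splitlines()[:5])
hadut.collect_output(mr_out_dir, out_file="wc_output.txt")  # large job: append to a local file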
Code example #21
def read(readFlag):
    print(readFlag)
    if (readFlag == True):
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath
        
        print(targetPath)
        
        # instantiate hadoop
        hdfs.hdfs()
        
        # read from hadoop
        fileToRead = hdfs.open(targetPath)
        print(fileToRead.read())
Code example #22
 def __init__(self, context):
     super(Reader, self).__init__(context)
     self.logger = LOGGER.getChild("Reader")
     self.logger.debug('started')
     self.isplit = context.input_split
     for a in "filename", "offset", "length":
         self.logger.debug(
             "isplit.{} = {}".format(a, getattr(self.isplit, a))
         )
     self.file = hdfs.open(self.isplit.filename)
     self.file.seek(self.isplit.offset)
     self.bytes_read = 0
     if self.isplit.offset > 0:
         discarded = self.file.readline()
         self.bytes_read += len(discarded)
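The Reader snippets stop at __init__; below is a hedged sketch of the next, get_progress and close methods that usually accompany this kind of line-oriented reader, modeled on pydoop's word count example rather than on any of the projects above (the LineReader class name is hypothetical):

import pydoop.hdfs as hdfs
import pydoop.mapreduce.api as api

class LineReader(api.RecordReader):
    def __init__(self, context):
        super(LineReader, self).__init__(context)
        self.isplit = context.input_split
        self.file = hdfs.open(self.isplit.filename)
        self.file.seek(self.isplit.offset)
        self.bytes_read = 0
        if self.isplit.offset > 0:
            # the first (partial) line belongs to the previous split
            self.bytes_read += len(self.file.readline())

    def next(self):
        if self.bytes_read > self.isplit.length:
            raise StopIteration
        record = self.file.readline()
        if not record:  # end of file
            raise StopIteration
        key = self.isplit.offset + self.bytes_read
        self.bytes_read += len(record)
        return key, record

    def get_progress(self):
        return min(self.bytes_read / float(self.isplit.length), 1.0)

    def close(self):
        self.file.close()
        self.file.fs.close()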
Code example #23
File: checkrecords.py Project: elzaggo/pydoop
def main(argv=None):
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
    path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(path, 'rb') as f:
        data = f.read()
    check_rows(data.split(b'\n')[:-1])
Code example #24
File: check.py Project: crs4/pydoop
def check_transpose(mr_out_dir):
    output = []
    for fn in hadut.iter_mr_out_files(mr_out_dir):
        with hdfs.open(fn, "rt") as f:
            for line in f:
                row = line.rstrip().split("\t")
                index = int(row.pop(0))
                output.append((index, row))
    output = [_[1] for _ in sorted(output)]
    exp_output = []
    in_fn = os.path.join(THIS_DIR, "data", "transpose_input", "matrix.txt")
    with open(in_fn) as f:
        for line in f:
            for i, item in enumerate(line.split()):
                try:
                    exp_output[i].append(item)
                except IndexError:
                    exp_output.append([item])
    return output == exp_output
Code example #25
File: kafka-producer.py Project: bunop/ccc-capstone
def processChunk(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        data = []
        
        for i, line in enumerate(handle):
            #strip line
            line = line.strip()
            data += [line]
            
            if i % 5000 == 0:
                #Submit data (my function)
                submitChunk(topic, data, trials=3)
                data = []
            
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" %(i, myfile["name"]))
                
        #for every line
        #submit the rest of the data
        submitChunk(topic, data, trials=3)
        data = []
Code example #26
File: hdfs2mongo.py Project: legendlee1314/ooni
def xml_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    #for fi in file_lists:
    for i in xrange(0, 1):
        fi = '/datasets/corpus/enwiki-11g/wiki_912'
        with hdfs.open(fi, "r") as f:
            lines = f.read().strip().split('\n')
            docs, doc = [], None
            for line in lines:
                if line.startswith('<doc'):
                    doc = line
                elif line.startswith('</doc>'):
                    docs.append(doc + line)
                else:
                    #line = line.replace('&', '').replace('"', "'")
                    doc += line.replace('"', "'")

            for doc in docs:
                dom = bs(doc).find('doc')
                doc = dom.attrs
                doc['content'] = dom.text
                doc['md5'] = hashlib.md5(str(doc)).hexdigest()
                yield doc
Code example #27
        sys.exit(1)
    else:
        return ratings

def computeRmse(model, data, n):
    """
    Compute RMSE (Root Mean Squared Error).
    """
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))
    predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \
      .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \
      .values()
    return sqrt(predictionsAndRatings.map(lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))
for n in userArray:
    with open(uFile, "w") as fi:
        with hdfs.open('/user/cloudera/medium/ratings.dat') as f:
            for line in f:
                data = line
                userid = line.split("::")
                if (int(userid[0]) == int(n)):
                    fi.write(data)
                    print n
    f.close()
    if __name__ == "__main__":
        if (len(sys.argv) != 2):
            print "Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " + \
                "MovieLensALS.py movieLensDataDir"
            sys.exit(1)

        # set up environment
        conf = SparkConf() \
Code example #28
import os
import pydoop.hdfs as hd
import datetime
import forecastio as fo
import pandas as pd

with hd.open("hdfs://quickstart.cloudera:8020/user/cloudera/python/cities_location.csv") as f:
    df =  pd.read_csv(f)
    
    
    df=pd.read_csv('/user/cloudera/python/cities_location.csv') 
    df.head()
    api_key = "459009d8daa503cef1e11b190c961ce5"
    #selecting the specific date
    date = datetime.datetime(2015,11,1,2,0,0)
    for i in range(len(df)):
        col = ["cities", "time",  "temperatureMin", "temperatureMax"]
        lat=df["latitude"].iloc[i]
        lng=df["longitude"].iloc[i]
        #accessing the forecast.io API
        forecast = fo.load_forecast(api_key, lat, lng, time=date)
        day = forecast.daily()
        #retrieving information for the current day
        Day=day.data[0]
        data={"cities": df["cities"].iloc[i], "time" : Day.time, "temperatureMin" : Day.temperatureMin, "temperatureMax" : Day.temperatureMax}
        if i==0 :
            weather = pd.DataFrame(data, index=[0], columns= col)
        else:
            weather1 = pd.DataFrame(data, index=[0], columns= col)
            weather = pd.concat([weather, weather1], ignore_index=True)
        
Code example #29
        result = math.pow(math.e, -0.5 * (x_mu * inverse * x_mu.T))
        return norm_const * result
    else:
        raise NameError("The dimensions of the input don't match")


#import pydoop.hdfs as hdfs
k = 5

#using Hadoop system file
#with hdfs.open('/Users/ming/centroids.txt') as fp:

weights = []
means = []
sigmas = []
with hdfs.open('/Users/user06/parameters.txt') as file:
    for line in file:
        params = line.strip().split("\t")
        weights.append(float(params[0]))
        means.append(np.array(params[1].split(), float))
        sigmas.append(np.array(params[2].split(), float))

for line in sys.stdin:
    line = line.strip()
    point = np.array(line.split(), float)
    p = weights[0] * norm_pdf_multivariate(point, means[0], sigmas[0].reshape(
        (2, 2)))
    nearest = 0
    for i in range(1, k):
        q = weights[i] * norm_pdf_multivariate(point, means[i],
                                               sigmas[i].reshape((2, 2)))
Code example #30
import pydoop.hdfs as hdfs


b = hdfs.path.isdir("/data")

want_file = 'traffic.csv'

if b == True:
    print("---get test ---")
    lines = []
    with hdfs.open("hdfs://127.0.0.1:9000/data/"+want_file) as f:
        for line in f:
            # print(line, type(line))
            l = line.decode("utf-8")
            if l is not None and l != "":
                lines.append(l)
    print(lines)
    print("---end get----")

    with open("i8predict_flow/"+want_file, "wb") as myfile:
        myfile.write(str(lines))
Code example #31
##Importing Required Packages
import numpy as np
import pydoop.hdfs as hd
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, auc, roc_curve, precision_recall_curve, average_precision_score

##Loading Credit Card Dataset
with hd.open("/user/hduser/creditcard.csv") as f:
    CreditCardData = pd.read_csv(f, header=0)

##Reducing the number of records of Original Dataset incase we wish to work on a smaller subset of Dataset
ReducedData = CreditCardData.iloc[:, :]

##Shape of Credit Card Dataset, i.e. number of rows & columns present in Dataset
print("\nShape of Credit Card Dataset (rows, columns): " +
      str(ReducedData.shape))

##Removing Duplicate Records (if any)
FinalData = ReducedData.drop_duplicates()
print(
    "\nShape of Credit Card Dataset after removing duplicate records (rows, columns): "
    + str(FinalData.shape))

##Checking for missing values
Code example #32
import pydoop.hdfs as hdfs
import boto3
import botocore

s3 = boto3.resource('s3')

BUCKET = "bd-mindbenders12345"

file = hdfs.open("hdfs://localhost:9000/test.txt")

s3.Bucket(BUCKET).put_object(Key="test.txt", Body=file)         
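The reverse direction, pulling an object from S3 and writing it into HDFS, can be sketched with the same two libraries; the bucket name is reused from the example above, while the key and HDFS path are placeholders:

import boto3
import pydoop.hdfs as hdfs

s3 = boto3.resource('s3')
BUCKET = "bd-mindbenders12345"

# read the S3 object into memory, then write it out to HDFS
body = s3.Object(BUCKET, "test.txt").get()["Body"].read()
with hdfs.open("hdfs://localhost:9000/from_s3/test.txt", "wb") as out:
    out.write(body)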
Code example #33
File: hdfs.py Project: clrke/hdfs-test
import pydoop.hdfs as hdfs
import config.hdfs

with hdfs.open(config.hdfs['ur']) as f:
    for line in f:
        print(line)

Code example #34
i = 0
import math
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, accuracy_score


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


vocab = Counter()
labels = Counter()

with hdfs.open(
        '/user/ds222/assignment-1/DBPedia.verysmall/verysmall_train.txt') as f:
    for line in f:
        first, next = line.split(' ', 1)
        for label in first.split(','):
            labels[label] += 1
        words = next.strip().lower().split()
        for word in words:
            if (len(word) >= 4):
                if (word[0] != '<'):
                    vocab[word] += 1
        i = i + 1
#print(i)
#print(counter)


#Convert words to indexes
Code example #35
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pickle
import io
from collections import Counter

import pydoop.hdfs as hdfs
from pterasort import Partitioner

RECORD_LENGTH = 91
KEY_LENGTH = 10

fname = Partitioner.initialize_break_points(5, 1000,
                                            '/user/root/genrecords_output')
with io.open('__break_point_cache_file', 'rb') as f:
    data = f.read()
sel = pickle.loads(data)

block_size = 20000 * RECORD_LENGTH
path = '/user/root/genrecords_output/part-m-00000'
with hdfs.open(path, 'rb') as f:
    data = f.read(block_size)
keys = (data[k:k + 10] for k in range(0, block_size, RECORD_LENGTH))
partitions = Counter(map(sel.select_partition, keys))
print(partitions)
Code example #36
    HDFSfiles.append(hdFiles[41:])


fileNames = []

indexName = 'music'
typeName = 'songs'
#IdField = 'songID'


bulkData = []

i = 1
for name in HDFSfiles:
    dataDict = {}
    fopen = hdfs.open("/gaana/gaanaLyrics/" + name)
    header = fopen.read()
    header = re.sub('[^a-zA-Z]', ' ', header)
    header = header.replace("Advertisements", " ")
    header = ''.join([item.lower() for item in header])
    songAndMovie = []
    dlim = "lyrics"
    # nameNew = name.replace("-", " ")
    songAndMovie.append(name)
    dataDict[name] = header
    metaDict = {}
    dataDict = {}
    for elements in songAndMovie:
        songsName = []
        # if "lyrics" in elements:
        songName = elements.split('-')
Code example #37
from pandasql import sqldf
import os


login=''
senha=''

os.system('echo '+senha+' | kinit '+login)
dir = '/ranger/audit/hiveServer2/'
list = hdfs.ls(dir)

df = pd.DataFrame()
for pasta in list:
    for i in range(len(hdfs.ls(pasta))):
        try:
            with hdfs.open(hdfs.ls(pasta)[i], 'r') as f:
                jsn = [json.loads(line) for line in f]
                df = df.append([pd.DataFrame(jsn)], sort=True)
            
        except:
                print("Leitura do arquivo json em " + hdfs.ls(pasta)[i] + " não foi bem sucedida")

df1 = df[['evtTime','reqUser','resource','access','reqData']]
df1['reqUser'] = df1['reqUser'].str.upper()
df1 = df1[df1['access']=='SELECT']
# exclude service users
exclusao = pd.DataFrame(['HIVE','RANGERLOOKUP'])
df1 = df1[~df1.reqUser.isin(exclusao.iloc[:,0])]
df1['evtTime'] = pd.to_datetime(df1['evtTime'].str[0:16], format='%Y-%m-%d %H:%M')

spark_df = spark.createDataFrame(df1)
Code example #38
            {"$group": {"_id": {'source':"$source",'tags':"$tags",'year': "$year_posted",'month':"$month_posted",'day':"$day_posted"}, "count": {"$sum": 1},"countNegative":{"$sum":"$Negative"},"countNeutral":{"$sum":"$Neutral"},"countPositive":{"$sum":"$Positive"}}},
            {"$sort": SON([("count", -1), ("_id", -1)])}
        ])
            #use reportdate for the filename
        filename = startdate.strftime('%Y-%m-%d')
        print(filename)
        for result_obj in daily_totals['result']:
            data_dict = result_obj['_id']
            date = (str(data_dict['year']) + "-" +  str(data_dict['month']) + "-" + str(data_dict['day']))
            tag = data_dict['tags']
            source = data_dict['source']
            count = result_obj['count']
            countPositive = result_obj['countPositive']
            countNegative = result_obj['countNegative']
            countNeutral = result_obj['countNeutral']
            data = (str(date) + "|" + str(tag) + "|" + str(source) + "|" + str(count) + "|" + str(countPositive) + "|" + str(countNegative) + "|" + str(countNeutral)+'\n')
            print(data)
            hdfs_path = '/socialmedia/sentiment/' + filename
            hdfs_path = settings.HDFS_HOST_NAME + ':' + settings.HDFS_PORT + settings.HDFS_ROOT_FOLDER + \
                        '/socialmedia/sentiment' + filename + '.in'
            logger.info('HDFS file path: %s' % hdfs_path)
            logger.debug('Data: %s' % data)

            try:
                hdfs_file = hdfs.open(hdfs_path, mode='a')
                hdfs_file.write(data.encode('utf-8'))
            except IOError, e:
                logger.debug("IOError: " + e.message)
                logger.debug("Caught Exception. Will create a new file on hdfs.")
                hdfs_file = hdfs.open(hdfs_path, mode='w')
                hdfs_file.write(data.encode('utf-8'))
Code example #39
        colourImg = PIL.Image.open(imgFile)
        #imshow(np.asarray(colourImg))
        nparray = np.asarray(colourImg)
        image = cv2.cvtColor(nparray, cv2.COLOR_RGB2BGR)

    return image


brand = "logitech"
mode = "image"

os.chdir("/tmp/")
myMachine = kpath.abspath('/tmp/data/input/racetrack/image/')
print(myMachine)

with hpath.open(myMachine + "driving_log.csv") as csvFile:
    df = pd.read_csv(csvFile,
                     names=[
                         "image_center", "image_left", "image_right",
                         "steering", "speed"
                     ])

#next(df.iterrows())[1]
df.iterrows()

# read and store multiple cameras and steering angles from driving_log.csv
# all three camera images will be used to train the model
images = []
steering_measurements = []

for index, row in df.iterrows():
Code example #40
	

import pydoop.hdfs as hdfs
import logging
logging.basicConfig(level=logging.DEBUG)

# this test checks that every line of the input file is present in the output files:
# for each line, a matching line is looked up in the directory for the corresponding date

with open('file1.csv', 'r') as in_f:
    for it, in_line in enumerate(in_f):
        date = in_line.strip().split(',')[0]
        for part in [1, 2, 3]:
            with hdfs.open('/data/archive/' + date + '/part-0000' + str(part)) as out_f:
                matching = []
                for out_line in out_f:
                    a = set(out_line.strip().split(','))
                    if a == set(in_line.strip().split(',')):
                        matching.append(True)
                        break
                    else:
                        matching.append(False)
            if any(matching):
                matching = True
                break
        if not matching:
            logging.debug("Error on line %s ,%s", it, in_line)
Code example #41
 def __missing__(self, path):
     f = hdfs.open(path, "wb")
     self[path] = f
     return f
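A sketch of the kind of dict subclass this __missing__ method belongs to, plus how it might be used to keep one open HDFS writer per output path; the class name and paths are hypothetical:

import pydoop.hdfs as hdfs

class HdfsFileCache(dict):
    # lazily opens one writable HDFS file per path and caches the handle
    def __missing__(self, path):
        f = hdfs.open(path, "wb")
        self[path] = f
        return f

    def close_all(self):
        for f in self.values():
            f.close()

cache = HdfsFileCache()
cache["/tmp/out/a.bin"].write(b"first record\n")    # opened on first access
cache["/tmp/out/a.bin"].write(b"second record\n")   # reuses the cached handle
cache.close_all()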
Code example #42
import pydoop.hdfs as hdfs

for part in [1, 2, 3]:
    with hdfs.open('/data/archive/2014-04-29/part-0000' + str(part)) as out_f:
        with open('file1.csv', 'r') as in_f:
            for out_line in out_f:
                for in_line in in_f:
                    a = set(out_line.strip().split(','))
                    if a == set(in_line.strip().split(',')):
                        print True
                    else:
                        print False
                        print a
                        print set(in_line.strip().split(','))
Code example #43
 def __init__(self, context):
     super(Reader, self).__init__()
     self.logger = logging.getLogger("Reader")  #formatted logger obtained
     self.file = hdfs.open('HD-2004-2014-d.csv')
     self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
Code example #44
#!/usr/bin/python3
"""mapper.py"""
import sys
import csv
import pydoop.hdfs as hdfs
import json
import costanct as C

azienda_map = {}
with hdfs.open('input/historical_stocks.csv', 'rt') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count > 0:
            ticker, _, name, _, _ = row
            azienda_map[ticker] = {'name': name}
        line_count += 1


def toJson(azione):
    dic = {
        "ticker": azione[0],
        "name": azienda_map[azione[0]],
        "close": azione[2],
        "date": azione[7],
    }
    return json.dumps(dic)


for line in sys.stdin:
    azione = line.split(',')
Code example #45
        context.setStatus("initializing")

    def map(self, context):
        k = context.getInputKey()
        tmp_data = csv.reader(f)
        words = context.getInputValue().split()
        for w in words:
            context.emit(w, "1")
            context.incrementCounter(self.inputWords, len(words))

    def close(self):
        self.logger.info("all done")


print "Prediction on HD 30 year data:"
f = hdfs.open('/HD-1984-2014-d.csv')
tmp_data = csv.reader(f)

my_data = list()
for item in tmp_data:
    tmp_item = list()
    for i in item:
        tmp_item.append(i)
    my_data.append(tmp_item)
data = my_data[1:]
X = list()
training_indices = list()
for i in xrange(int(len(data) * 0.9)):
    training_indices.append(i)

test_indices = list()
Code example #46
import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession

import pandas as pd
import pydoop.hdfs as hd

# create a spark session
#sparkSession = SparkSession.builder.master("local").appName("draw heat map").getOrCreate()
#df_load = sparkSession.read.csv('hdfs://dumbo/user/gx271/pubgETL/mir_death.csv')

with hd.open("hdfs://dumbo/user/gx271/pubgETL/mir_death.csv/part-00006") as f:
    df = pd.read_csv(f)

# convert DataFrame to np array

dat = df.as_matrix()

# dat = np.loadtxt('mydata.csv')

x, y = dat[:,0], dat[:,1]

heatmap, xedges, yedges = np.histogram2d(x, y, bins=50)  
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]  
plt.clf()  
plt.imshow(heatmap, extent=extent)  
# plt.show()
Code example #47
def main(input_path, output_attribute_index, scikit_output_path,
         spark_output_path):

    # Instantiate a Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()
    for file_path in hdfs.ls(input_path):
        # Load the file content and build a string matrix from it
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)
        # Load the numpy matrix and parse it into a matrix of real values,
        # which is then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 +
                                   output_attribute_index].astype('float64')
        # The model is trained through iterative (partial) fitting
        regressor.partial_fit(input_matrix, output_vector)
        # Print the path of the processed file to the console
        print(file_path)

    # Save the trained model to the output path
    # passed in as a command-line argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the Spark configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")
    # Initialize the session
    # (required in order to save the model)
    session = SparkSession(context)

    # Load the RDD data from the input path
    input_data = context.textFile(input_path)
    # Parse each row into fields
    input_data = input_data.map(lambda x: x.split(","))
    # Skip the header rows
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")
    # Drop the first three fields (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(lambda x: list(map(lambda y: float(y), x[
        3:-5])) + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler is used to build the feature columns
    # required by the linear regression fit method)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate a LinearRegression object, train it,
    # and then save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
Code example #48
bucket = 'enhance-it'
import boto3
import pydoop.hdfs as hdfs 

s3 = boto3.resource('s3')


file = hdfs.open('hdfs://master:9000/data_for_db/currency=USD/part-00000-f65c545f-baa0-4bf0-8aa9-0b14957848c4.c000.json')
s3.Bucket(bucket).put_object(Key='lituation/data_from_hdfs.csv', Body=file)
Code example #49
File: test_partitioner.py Project: elzaggo/pydoop
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pickle
import io
from collections import Counter

import pydoop.hdfs as hdfs
from pterasort import Partitioner

RECORD_LENGTH = 91
KEY_LENGTH = 10

fname = Partitioner.initialize_break_points(
    5, 1000, '/user/root/genrecords_output'
)
with io.open('__break_point_cache_file', 'rb') as f:
    data = f.read()
sel = pickle.loads(data)

block_size = 20000 * RECORD_LENGTH
path = '/user/root/genrecords_output/part-m-00000'
with hdfs.open(path, 'rb') as f:
    data = f.read(block_size)
keys = (data[k:k + 10] for k in range(0, block_size, RECORD_LENGTH))
partitions = Counter(map(sel.select_partition, keys))
print(partitions)
Code example #50
#! /usr/bin/env python

import sys
from pydoop import hdfs
from DataPoint import DataPoint

#print "Start"

# read sys.argv[1] and sys.argv[2]
# put em in lists

if len(sys.argv)<3:
	print "Error: Insufficient Arguments"
	sys.exit(-1)

oldCentroidsFile = hdfs.open(sys.argv[1])
newCentroidsFile = hdfs.open(sys.argv[2])

oldCentroids = []
newCentroids = []

for line in oldCentroidsFile:
	if line.find("\t") != -1:
		(key,value) = line.strip().split("\t")
		oldCentroid = DataPoint(value)
	else:
		oldCentroid = DataPoint(line.strip()) 
	oldCentroids.append(oldCentroid)

for line in newCentroidsFile:
	(key,value) = line.strip().split("\t")
Code example #51
#! /usr/bin/env python

import sys
import DataPoint
from pydoop import hdfs

# Check for sufficient arguments
if len(sys.argv) < 2:
    print("ERROR: Insufficient arguments")
    sys.exit(-1)

# List to hold canopy centers
canopyCenters = []

# Read canopy center file
file = hdfs.open(sys.argv[1])
for line in file:
    if line.find("Warning:") == 0:
        continue
    (key, value) = line.split("\t")
    dp = DataPoint.DataPoint(value.strip())
    canopyCenters.append(dp)

# Assign points to canopies
for line in sys.stdin:
    dp = DataPoint.DataPoint(line.strip())
    insert = True
    for canopyCenter in canopyCenters:
        if dp.checkT1(canopyCenter):
            print(canopyCenter.toString() + "\t" + dp.toString())
Code example #52
### Reading the data from HDFS ###
import pandas as pd
import numpy as np
import pydoop.hdfs as hd
from lxml import objectify

with hd.open("/user/datostiempo/20160525_1341.xml") as archivo:
    parsed = objectify.parse(archivo)

root = parsed.getroot()
prob_precipitacion = []
estado_cielo = []
viento = []
temperatura = []
tempmax = []
tempmin = []
iteraccion = 0
errores = []
print "root : ", root
for row in root.prediccion.dia:
    for row_precipitacion in row.prob_precipitacion:
        aux_precipitacion = []
        if (row_precipitacion != ''):
            aux_precipitacion.append(row_precipitacion)
        else:
            errores.append(1)
    prob_precipitacion.append(
        str(sum(aux_precipitacion) / float(len(aux_precipitacion))))
    for row_cielo in row.estado_cielo:
        aux_cielo = []
        if (row_cielo != ''):
Code example #53
import os
import pandas as pd
import pydoop.hdfs as hdfs
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

conf = SparkConf().setAppName("drunk detection").setMaster("yarn")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)

csv_file_path = "hdfs:///drunkdetection/train_data48.csv"
predictor_path = "hdfs:///drunkdetection/shape_predictor_68_face_landmarks.dat"
image_path = "hdfs:///drunkdetection/drunk3.jpg"
model_path = "hdfs:///drunkdetection/rf48.pickle"

with hdfs.open("/drunkdetection/train_data48.csv") as csv:
    df = pd.read_csv(csv, index_col=0)
print(df.columns)
df_y = df['label'] == 3
df_X = df[['x' + str(i)
           for i in range(1, 49)] + ['y' + str(j) for j in range(1, 49)]]
X_train, X_test, y_train, y_test = train_test_split(df_X,
                                                    df_y,
                                                    test_size=0.2,
                                                    random_state=15)

# Feature Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
Code example #54
import os
import pydoop.hdfs as hd
import datetime
import forecastio as fo
import pandas as pd

with hd.open(
        "hdfs://quickstart.cloudera:8020/user/cloudera/python/cities_location.csv"
) as f:
    df = pd.read_csv(f)

    df = pd.read_csv('/user/cloudera/python/cities_location.csv')
    df.head()
    api_key = "459009d8daa503cef1e11b190c961ce5"
    #selecting the specific date
    date = datetime.datetime(2015, 11, 1, 2, 0, 0)
    for i in range(len(df)):
        col = ["cities", "time", "temperatureMin", "temperatureMax"]
        lat = df["latitude"].iloc[i]
        lng = df["longitude"].iloc[i]
        #accessing the forecast.io API
        forecast = fo.load_forecast(api_key, lat, lng, time=date)
        day = forecast.daily()
        #retrieving information for the current day
        Day = day.data[0]
        data = {
            "cities": df["cities"].iloc[i],
            "time": Day.time,
            "temperatureMin": Day.temperatureMin,
            "temperatureMax": Day.temperatureMax
        }