def __init__(self, dirname):
    self._data = SequenceFile.Reader(os.path.join(dirname, DATA_FILE_NAME))
    self._index = SequenceFile.Reader(os.path.join(dirname, INDEX_FILE_NAME))
    self._first_position = self._data.getPosition()
    self._positions = []
    self._keys = []
def convert_from_sequencefiles(encrypted_data):
    partition_pattern = os.path.join(encrypted_data, "data/part-*")
    partition_files = glob.glob(partition_pattern)
    output_partition_files = []
    # Convert each partition from SequenceFile format to bytes
    for partition_file in partition_files:
        # Example taken from
        # https://github.com/matteobertozzi/Hadoop/blob/master/python-hadoop/examples/SequenceFileReader.py
        sequence_file_reader = SequenceFile.Reader(partition_file)
        key_class = sequence_file_reader.getKeyClass()
        value_class = sequence_file_reader.getValueClass()
        key = key_class()
        value = value_class()
        # FIXME: better way of generating intermediate file name
        output_partition_file = partition_file.replace("part-", "cpp-part-")
        # FIXME: Unclear if we need the below line
        # position = sequence_file_reader.getPosition()
        has_next = sequence_file_reader.next(key, value)
        if has_next:
            with open(output_partition_file, "wb") as partition:
                while has_next:
                    partition.write(value.toBytes())
                    has_next = sequence_file_reader.next(key, value)
                    # position = sequence_file_reader.getPosition()
            output_partition_files.append(output_partition_file)
        sequence_file_reader.close()
    return output_partition_files
def desequence(seq_file, output_path, get_fname=lambda k, i: "file" + str(i) + ".nc"):
    """
    Takes a sequence file and writes out a separate NetCDF file for each value.

    seq_file: path to a seq file where the values are valid NetCDF binary blobs
    output_path: a string path to dump files to
    get_fname: a function which takes the key and an incremental integer, and
        returns a string to be used as the file name.
    """
    reader = SequenceFile.Reader(seq_file)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    i = 0
    while reader.next(key, value):
        with open(output_path + get_fname(key, i), "wb") as f:
            f.write(value.getBytes())
        i += 1
    reader.close()
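# A minimal usage sketch for desequence() above. The input path, output
# directory, and the key-based naming callback are illustrative assumptions,
# not taken from the original code.
if __name__ == '__main__':
    desequence(
        "blobs.seq",       # assumed sequence file whose values are NetCDF blobs
        "./netcdf_out/",   # assumed output directory (note the trailing slash)
        get_fname=lambda key, i: "%s_%03d.nc" % (key.toString(), i),
    )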
def init_pailfile_source(self, **kwargs):
    return PailfileSource(
        self.logger,
        self.loop,
        kwargs['gate'],
        SequenceFile.Reader(kwargs['input'][0].path),
    )
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='seq2tsv')
    # parser.add_argument()
    parser.add_argument("pathname")
    args = parser.parse_args(args)
    outputPathname = args.pathname + ".tsv"
    count = 0
    start = datetime.datetime.now()
    with open(outputPathname, 'w') as f:
        reader = SequenceFile.Reader(args.pathname)
        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()
        key = key_class()
        value = value_class()
        # reader.sync(4042)
        position = reader.getPosition()
        while reader.next(key, value):
            # print '*' if reader.syncSeen() else ' ',
            print >> f, '%s\t%s' % (key.toString(), value.toString())
            count += 1  # count the records actually converted, so the return value is meaningful
            position = reader.getPosition()
        reader.close()
    end = datetime.datetime.now()
    delta = end - start
    print >> sys.stderr, "ELAPSED seq2tsv is %s" % elapsed(delta)
    return count
def count_file(filename):
    reader = SequenceFile.Reader(filename)
    key = Text()
    value = NullWritable()
    count = 0
    while reader.next(key, value):
        count += 1
    return count
def __init__(self):
    self._word_num = int(self.params['word_num'])
    self._document_num = int(self.params['document_num'])
    self._minibatch_size = int(self.params['minibatch_size'])
    self._meanchangethresh = float(self.params['meanchangethresh'])
    self._topic_num = int(self.params['topic_num'])
    self._tau0 = float(self.params['tau0'])
    self._updatect = float(self.params['updatect'])
    self._kappa = float(self.params['kappa'])
    rhot = pow(self._tau0 + self._updatect, -self._kappa)
    self._rhot = rhot
    # Load parameter from distributed cache
    parameter_reader = SequenceFile.Reader('./_params')
    key_class = parameter_reader.getKeyClass()
    value_class = parameter_reader.getValueClass()
    key_instance = key_class()
    value_instance = value_class()
    while parameter_reader.next(key_instance, value_instance):
        key_instance_str = key_instance.toString()
        if 'new_alpha' == key_instance_str:
            # For alpha
            self._alpha = value_instance.toString()
            self._alpha = numpy.fromstring(self._alpha)
            self._alpha.shape = self._topic_num
        elif 'new_lambda' == key_instance_str:
            # For lambda
            self._lambda = value_instance.toString()
            self._lambda = numpy.fromstring(self._lambda)
            self._lambda.shape = (self._topic_num, self._word_num)
        elif 'new_eta' == key_instance_str:
            # For eta
            self._eta = value_instance.toString()
            self._eta = numpy.fromstring(self._eta)
            self._eta.shape = self._word_num
        else:
            # Error
            sys.stderr.write("Something wrong in parameter_reader\n")
            sys.exit(1)
    parameter_reader.close()
    self._Elogbeta = self.dirichlet_expectation(self._lambda)
    self._expElogbeta = numpy.exp(self._Elogbeta)
    # initialize sstats
    self.sstats = numpy.zeros((self._topic_num, self._word_num))
    self.gamma = numpy.zeros((self._minibatch_size, self._topic_num))
def seqReader(pathtpsaveimage):
    reader = SequenceFile.Reader(pathtpsaveimage)  # read from the path argument; the original used self.path inside a plain function
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    while reader.next(key, value):
        position = reader.getPosition()
        name, d1, d2, ext = key.toString().split(".")
        print len(value.getBytes())
        nparr = np.fromstring(value.getBytes(), np.uint8)
        img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
        print np.array(img).size
    reader.close()
def seqReader(path):
    reader = SequenceFile.Reader(path)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    while reader.next(key, value):
        position = reader.getPosition()
        name, d1, d2 = key.toString().split(".")
        nparr = np.array(value.toString().split(","), np.uint8).reshape(int(d1), int(d2))
        # img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
        print nparr.shape
    reader.close()
def SequenceFileIterator(path):
    reader = SequenceFile.Reader(path)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    while reader.next(key, value):
        yield (position, key.toString(), value.toString())
        position = reader.getPosition()
    reader.close()
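# Usage sketch for the SequenceFileIterator() generator above. 'part-00000'
# is an assumed local SequenceFile path, not taken from the original code.
for position, key_str, value_str in SequenceFileIterator('part-00000'):
    # each item is (byte position of the record, key as string, value as string)
    print('%d\t%s\t%s' % (position, key_str, value_str))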
def hadoop_input_stream(stream, size, url, params):
    stream.seek(0, 2)
    size = stream.tell()
    stream.seek(0)
    reader = SequenceFile.Reader(stream, length=size)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    while reader.next(key, value):
        yield key, value
    reader.close()
def exportSGY(rddFilename, sgyFilename):
    reader = SequenceFile.Reader(rddFilename)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    # reader.sync(4042)
    position = reader.getPosition()
    while reader.next(key, value):
        print('*' if reader.syncSeen() else ' ',
              '[%6s] %6s %6s' % (position, key.toString(), value.toString()))
        position = reader.getPosition()
    reader.close()
def seqReader(pathtpsaveimage):
    reader = SequenceFile.Reader(pathtpsaveimage)  # read from the path argument; the original used self.path inside a plain function
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    compression_codec = BZip2Codec()
    while reader.next(key, value):
        position = reader.getPosition()
        name, d1, d2, ext = key.toString().split(".")
        arr = compression_codec.decompress(value.getBytes())
        nparr = np.frombuffer(arr, np.uint8)
        try:
            img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
        except AttributeError:
            # newer OpenCV builds renamed the flag
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        print name, img.shape
    reader.close()
def testRead(filename):
    reader = SequenceFile.Reader(filename)
    metadata = reader.getMetadata()
    for meta_key, meta_value in metadata:
        print 'METADATA:', meta_key, meta_value
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    while reader.next(key, value):
        print '*' if reader.syncSeen() else ' ',
        print '[%6s] %6s %6s' % (position, key.toString(), value.toString())
        position = reader.getPosition()
    reader.close()
def main():
    inputfiles = sys.argv[1]
    call(['mkdir', os.path.join(options.tmpdir, 'tmp')])
    print "downloading inputfiles %s" % (inputfiles)
    check_call([
        'hadoop', 'fs', '-copyToLocal', inputfiles,
        os.path.join(options.tmpdir, 'tmp')
    ])
    order = {}
    values = []
    for fname in os.listdir(os.path.join(options.tmpdir, 'tmp')):
        reader = SequenceFile.Reader(os.path.join(options.tmpdir, 'tmp', fname))
        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()
        key = key_class()
        value = value_class()
        while reader.next(key, value):
            order[int(key.get())] = value.get()
        reader.close()
    var = []
    for key, val in sorted(order.iteritems()):
        var.extend(val)
    var2 = np.array(var)
    print "reading templatefile %s" % (options.template)
    templatefile = ep.ExoFile(options.template, 'r')
    print "writing outputfile %s" % (options.output)
    newfile = ep.ExoFile(options.output, 'w')
    result = insert_vars(templatefile, newfile, (options.varname, ), (var2, ))
    print "removing inputfiles %s" % (inputfiles)
    check_call(['rm', '-r', os.path.join(options.tmpdir, 'tmp')])
    print "Done!"
def __init__(self):
    numpy.random.seed(100000001)
    self._word_num = int(self.params['word_num'])
    self._meanchangethresh = float(self.params['meanchangethresh'])
    self._topic_num = int(self.params['topic_num'])
    # Load parameter from distributed cache
    parameter_reader = SequenceFile.Reader('./_params')
    key_class = parameter_reader.getKeyClass()
    value_class = parameter_reader.getValueClass()
    key_instance = key_class()
    value_instance = value_class()
    while parameter_reader.next(key_instance, value_instance):
        key_instance_str = key_instance.toString()
        if 'new_alpha' == key_instance_str:
            # For alpha
            self._alpha = value_instance.toString()
            self._alpha = numpy.fromstring(self._alpha)
            self._alpha.shape = self._topic_num
        elif 'new_lambda' == key_instance_str:
            # For lambda
            self._lambda = value_instance.toString()
            self._lambda = numpy.fromstring(self._lambda)
            self._lambda.shape = (self._topic_num, self._word_num)
        elif 'new_eta' == key_instance_str:
            # For eta: not needed here, skip loading it
            continue
        else:
            # Error
            sys.stderr.write("Something wrong in parameter_reader\n")
            sys.exit(1)
    parameter_reader.close()
    self._Elogbeta = self.dirichlet_expectation(self._lambda)
    self._expElogbeta = numpy.exp(self._Elogbeta)
def loadDatainES(filename, index, doctype, dataFileType, hostname="localhost", port=9200, mappingFilePath=None):
    try:
        print "Connecting to " + hostname + " at port:" + str(port)
        # es = Elasticsearch([{'host': hostname, 'port': port}])
        es = Elasticsearch(['https://*****:*****@' + hostname + ":" + str(port)],
                           show_ssl_warnings=False)
        if mappingFilePath:
            with open(mappingFilePath) as m:
                mapping = m.read()
                # print "Mapping file:" + mapping
                es.indices.create(index=index, body=mapping, ignore=400)
        if dataFileType == "1":
            with open(filename) as f:
                d = json.load(f)
                for wp in d:
                    res = es.index(index=index, doc_type=doctype, body=wp, id=wp["uri"])
                    print "indexing id: " + res["_id"] + " for uri: " + wp["uri"]
        elif dataFileType == "0":
            with open(filename) as f:
                lines = f.readlines()
                for line in lines:
                    if line.strip() != "":
                        jsonurlobj = json.loads(line.strip())
                        objkey = jsonurlobj['uri']
                        res = es.index(index=index, doc_type=doctype, body=line)
                        print "indexing id: " + res["_id"] + " for uri: " + objkey
        elif dataFileType == "2":
            reader = SequenceFile.Reader(filename)
            key_class = reader.getKeyClass()
            value_class = reader.getValueClass()
            key = key_class()
            value = value_class()
            position = reader.getPosition()
            counter = 0
            bulk_data = []
            while reader.next(key, value):
                if value.toString().strip() != "":
                    data_dict = {}
                    line = value.toString()
                    # 'header' is assumed to be defined at module level in the original file
                    for i in range(len(line)):
                        data_dict[header[i]] = line[i]
                    op_dict = {
                        "index": {
                            "_index": index,
                            "_type": doctype,
                            "_id": data_dict["uri"]
                        }
                    }
                    bulk_data.append(op_dict)
                    bulk_data.append(data_dict)
                    # res = es.index(index=index, doc_type=doctype, body=value.toString(), id=objkey)
                    counter += 1
                    # bulk index the data
                    if counter % 10000 == 0:
                        res = es.bulk(index=index, body=bulk_data, refresh=True)
                        bulk_data = []
                position = reader.getPosition()
            reader.close()
            print "Errors:" + str(i)
    except Exception, e:
        sys.stderr.write('ERROR: %s\n' % str(e))
        pass
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

from hadoop.io import SequenceFile

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: SequenceFileReader <filename>')
    else:
        reader = SequenceFile.Reader(sys.argv[1])

        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()

        key = key_class()
        value = value_class()

        # reader.sync(4042)
        position = reader.getPosition()
        while reader.next(key, value):
            print('*' if reader.syncSeen() else ' ', end=' ')
            print('[%6s] %6s %6s' % (position, key.toString(), value.toString()))
            position = reader.getPosition()
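# Running the reader script above from a shell. The module file name and the
# sample SequenceFile path are assumptions for illustration, not from the
# original code:
#
#   python SequenceFileReader.py part-00000
#
# This prints one "[position] key value" line per record, prefixed with '*'
# whenever a sync marker was seen at that record.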
redis_conn = redis.Redis(db=2)
ids = {line.rstrip().upper(): True for line in open('selected_ids_20_2')}
file_list = ["full/" + x for x in filter(lambda x: "part-" in x, os.listdir("full"))]
cores = multiprocessing.cpu_count()
chunked_list = chunker(file_list, cores)
pids = []
for x in range(0, cores):
    pid = os.fork()
    if pid == 0:
        for ef in chunked_list[x]:
            print("Proc %s doing %s" % (x, ef))
            reader = SequenceFile.Reader(ef)
            kc = reader.getKeyClass()
            vc = reader.getValueClass()
            k, v = kc(), vc()
            while reader.next(k, v):
                ks = k.toString()
                if ks in ids:
                    print("    setting %s" % ks)
                    redis_conn.set(ks, extract_important(v.toString()))
                else:
                    print("Not setting %s" % ks)
        sys.exit(0)
    else:
        pids.append(pid)  # parent process: remember the child pid (assumed completion; the snippet is truncated here)
from hadoop.io import SequenceFile
import time
import json
import pickle

# setimgkeys=set()
setvisualkeys = pickle.load(open("setvisualkeys.p", "r"))
visualvaluesdict = dict.fromkeys(list(setvisualkeys))
for visualkey in setvisualkeys:
    visualvaluesdict[visualkey] = set()

for part in xrange(1, 21):
    filename = "./trial01/part-r-000" + "%02d" % part
    reader = SequenceFile.Reader(filename)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    while reader.next(key, value):
        if not reader.syncSeen():
            thisKey = key.toString()
            thisValue = value.toString()
            tmpj = json.loads(thisValue)
            # print tmpj
            for visualkey in setvisualkeys:
                try:
                    # print list(tmpj['hasImagePart'].copy().keys())
                    visualvaluesdict[visualkey] = visualvaluesdict[visualkey].union(
                        [tmpj['hasImagePart'][visualkey]['featureValue']])
                except KeyError:
                    # assumed completion: the original snippet is truncated; skip
                    # records that do not carry this visual key
                    pass
def test_hadoop_fs_destination_sequence_files(sdc_builder, sdc_executor, cluster):
    """Test Hadoop FS destination configuring File Type to Sequence File.

    We use sequence files with an EL expression as the sequence file key,
    and the SequenceFile module to read the generated file after it has been
    copied from Hadoop to the local file system.
    """
    # Configure prefix, suffix and directory
    FILES_PREFIX, FILES_SUFFIX = 'tst', 'seq'
    hdfs_directory = f'/tmp/out/{get_random_string(string.ascii_letters, 10)}'

    # Get pipeline builder
    pipeline_builder = sdc_builder.get_pipeline_builder()

    # Create Dev Raw Data Source stage
    raw_data = '\n'.join(json.dumps(product) for product in PRODUCT_DATA_FIX)
    logger.info('Pipeline will write to HDFS directory %s ...', hdfs_directory)
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON', raw_data=raw_data, stop_after_first_batch=True)

    # Create Hadoop FS destination
    hadoop_fs = pipeline_builder.add_stage('Hadoop FS', type='destination')
    hadoop_fs.set_attributes(data_format='JSON',
                             directory_template=hdfs_directory,
                             files_prefix=FILES_PREFIX,
                             files_suffix=FILES_SUFFIX,
                             file_type='SEQUENCE_FILE',
                             compression_type='RECORD',
                             sequence_file_key='${record:value(\'/sequenceKey\')}')
    # Trigger the destination file to be closed after writing all data.
    hadoop_fs.set_attributes(max_records_in_file=len(PRODUCT_DATA_FIX))

    dev_raw_data_source >> hadoop_fs

    # Build and start the pipeline. It finishes after the first batch.
    pipeline = pipeline_builder.build('Hadoop FS Destination Sequence Key').configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished(timeout_sec=10)

    try:
        # Check that just one file is in the directory
        hdfs_fs_files = cluster.hdfs.client.list(hdfs_directory)
        assert len(hdfs_fs_files) == 1

        # Check the prefix and suffix
        hdfs_fs_filename = hdfs_fs_files[0]
        assert hdfs_fs_filename.startswith(FILES_PREFIX)
        assert hdfs_fs_filename.endswith(FILES_SUFFIX)

        # Download the file from HDFS to the local file system
        cluster.hdfs.client.download(f'{hdfs_directory}/{hdfs_fs_filename}', f'/tmp/{hdfs_fs_filename}')

        # Read the sequence file
        reader = SequenceFile.Reader(f'/tmp/{hdfs_fs_filename}')
        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()
        key = key_class()
        value = value_class()

        # Convert the list of dicts to a list of bytes
        product_data_expected = [json.dumps(row, separators=(',', ':')).encode()
                                 for row in PRODUCT_DATA_FIX]
        for i in range(2):
            # Read the next record
            reader.next(key, value)
            # Check that name, price and release are in the value
            assert product_data_expected[i] == value.toString()
        reader.close()
    finally:
        logger.info('Deleting Hadoop FS directory %s ...', hdfs_directory)
        cluster.hdfs.client.delete(hdfs_directory, recursive=True)