def processFile(filep):
    from DataMining.code.com import log, parallels  # , mongo_parallels
    import os
    from ujson import loads
    import gzip
    from pymongo import Connection  # legacy pymongo API (pre-3.0)

    locs = {}
    logger = log.logger('Parallel/' + os.path.basename(filep))
    try:
        c = Connection('localhost')
        db = c['tweets']
        f = gzip.open(filep)
        logger.log('finding all records with location for: ' + f.name)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        while line:
            # print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                # matching records go straight to MongoDB, so locs stays empty
                parallels.bdDoSomething2(rec, db, filep)
                loc_lines += 1
                if loc_lines % 10000 == 0:
                    logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
            line = f.readline()
        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return locs
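# A minimal sketch (not from the original source): pymongo's Connection class
# used above is the legacy pre-3.0 API. The modern equivalent, assuming the
# same localhost deployment and 'tweets' database, would be:
def _modern_mongo_db():
    from pymongo import MongoClient  # MongoClient replaced Connection in pymongo 3.x
    return MongoClient('localhost', 27017)['tweets']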
def processFile(filep):
    from DataMining.code.com import log, parallels  # , mongo_parallels
    import os
    from ujson import loads
    import gzip
    from redis import Redis

    c = Redis(host='dhcp2-240.si.umich.edu', port=6379, db=0)
    # c = Connection('localhost')
    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    times = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        # print line
        rec = loads(line)
        tot_lines += 1
        condition = parallels.bdCheckCondition(rec)
        if condition:
            parallels.bdDoSomethingMemory(rec, times)
            loc_lines += 1
            if loc_lines % 10000 == 0:
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    logger.send_final_stats(ret)
    return times
def processFile(filep):
    from DataMining.code.com import log, parallels  # , mongo_parallels
    import os
    from ujson import loads
    import gzip
    # from redis import Redis
    # c = Redis(host='dhcp2-240.si.umich.edu', port=6379, db=0)
    # c = Connection('localhost')
    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    times = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        # print line
        rec = loads(line)
        tot_lines += 1
        condition = parallels.bdCheckCondition(rec)
        if condition:
            parallels.bdDoSomethingMemory(rec, times)
            loc_lines += 1
            if loc_lines % 10000 == 0:
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    return times
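# Hypothetical illustration (not the real parallels.bdDoSomethingMemory, whose
# source is not shown here): an in-memory accumulator of this shape would
# bucket matching records into the dict that the variants above return. The
# 'user'/'location' fields are assumptions about the tweet schema.
def _accumulate_in_memory(rec, acc):
    # bucket by the tweet author's free-text location (assumed schema)
    user = rec.get('user') or {}
    key = user.get('location') or 'unknown'
    acc[key] = acc.get(key, 0) + 1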
def processFile(filep):
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads, dumps
    import gzip

    logger = log.logger('Parallel/' + 'sampleCreate_' + os.path.basename(filep))
    ret = {}
    try:
        f = gzip.open(filep)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        logger.log('finding all records with location for: ' + f.name)
        outf = open('./DataMining/sample_data/' + os.path.basename(filep) + '_10000.sample', 'wb')
        while line:
            # print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                # write rec to outfile
                outf.write(dumps(rec) + '\n')
                loc_lines += 1
                if loc_lines % 10000 == 0:
                    logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
                    break  # stop once the 10,000-record sample is written
            line = f.readline()
        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
        outf.close()
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return ret
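# A small usage sketch, assuming the sample file produced above: each line is
# one ujson-encoded record, so the sample can be re-read like this.
def _read_sample(path):
    from ujson import loads
    with open(path) as f:
        return [loads(line) for line in f]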
def processFile(filep):
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads
    import gzip

    locs = {}
    logger = log.logger('Parallel/AllLocsBigData_' + os.path.basename(filep))
    try:
        f = gzip.open(filep)
        # f = open(filep)
        logger.log('finding all records with location for: ' + f.name)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        while line:
            # print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                parallels.bdDoSomethingMemory(rec, locs)
                loc_lines += 1
                if loc_lines % 10000 == 0:
                    logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
            line = f.readline()
        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    # send the results to mongodb
    # logger.log('Sending to _ now..')
    # try:
    #     helpers.write_all_locs_to_file('', [locs])
    # except Exception as e:
    #     logger.log('Error log: ' + str(e))
    return locs
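# A minimal driver sketch (assumed, not from the original source) showing how
# any of the processFile variants above could be fanned out across a directory
# of gzipped tweet files; the 'Parallel/...' logger names suggest this usage.
def _run_parallel(data_dir):
    import glob
    import os
    from multiprocessing import Pool
    files = glob.glob(os.path.join(data_dir, '*.gz'))
    pool = Pool()
    results = pool.map(processFile, files)  # one worker per gzipped file
    pool.close()
    pool.join()
    return results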