def processFile(filep):
    """Aggregate data from every condition-matching record of a gzipped file.

    filep: path to a gzip-compressed file containing one JSON record per line.
    Returns the `times` dict populated in place by
    parallels.bdDoSomethingMemory for each record passing
    parallels.bdCheckCondition. Also reports final counts via the logger.
    """
    from DataMining.code.com import log, parallels  # , mongo_parallels
    import os
    from ujson import loads
    import gzip
    from redis import Redis

    # Fix: original called redis.Redis(...) but only the Redis class was
    # imported, so every invocation raised NameError.
    # NOTE(review): `c` is never used below — confirm the connection is
    # needed for its side effect or remove it.
    c = Redis(host='dhcp2-240.si.umich.edu', port=6379, db=0)
    # c = Connection('localhost')
    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    times = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        # print line
        rec = loads(line)
        tot_lines += 1
        condition = parallels.bdCheckCondition(rec)
        if condition:
            parallels.bdDoSomethingMemory(rec, times)
            loc_lines += 1
            # Progress report every 10000 matching records.
            if (loc_lines % 10000 == 0):
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    logger.send_final_stats(ret)
    return times
def processFile(filep):
    """Extract keyword matches from a gzipped file and dump them to JSON.

    filep: path to a gzip-compressed file containing one JSON record per line.
    Writes the accumulated `locs` dict as JSON to
    ./DataMining/uncompressed/sel_cities/<basename>.json and returns a stats
    dict with the source file name and line counts.
    """
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads, dumps
    import gzip
    outfilep = './DataMining/uncompressed/sel_cities/' + os.path.basename(filep) + '.json'
    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    locs = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        # print line
        rec = loads(line)
        tot_lines += 1
        condition = parallels.bdCheckCondition_keywords(rec, parallels.sel_cities)
        if condition:
            parallels.bdDoSomething_keywords(rec, locs, parallels.keywords)
            loc_lines += 1
            # Progress report every 1000 matching records.
            if (loc_lines % 1000 == 0):
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    logger.log('Writing json to file: ' + outfilep)
    # Fix: the gzip handle was previously rebound to the output file without
    # being closed, and the output file itself was never closed.
    f.close()
    outf = open(outfilep, 'wb')
    outf.write(dumps(locs))
    outf.close()
    del locs
    return ret
def processFile(filep):
    """Aggregate data from every condition-matching record of a gzipped file.

    filep: path to a gzip-compressed file containing one JSON record per line.
    Returns the `times` dict populated in place by
    parallels.bdDoSomethingMemory for each record passing
    parallels.bdCheckCondition.
    """
    from DataMining.code.com import log, parallels  # , mongo_parallels
    import os
    from ujson import loads
    import gzip
    # Fix: the redis import was commented out but the connection line was
    # left active, so every call raised NameError. The connection was never
    # used in this function, so it is disabled along with the import.
    # from redis import Redis
    # c = Redis(host='dhcp2-240.si.umich.edu', port=6379, db=0)
    # c = Connection('localhost')
    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    times = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        # print line
        rec = loads(line)
        tot_lines += 1
        condition = parallels.bdCheckCondition(rec)
        if condition:
            parallels.bdDoSomethingMemory(rec, times)
            loc_lines += 1
            # Progress report every 10000 matching records.
            if (loc_lines % 10000 == 0):
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    # NOTE(review): `ret` is built but never reported in this variant
    # (sibling variants call logger.send_final_stats(ret)) — confirm intent.
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    return times
def processFile(filep):
    """Store condition-matching records from a gzipped file into MongoDB.

    filep: path to a gzip-compressed file containing one JSON record per line.
    Each matching record is handed to parallels.bdDoSomething2 together with
    the 'tweets' database handle. Returns `locs` (left empty here;
    bdDoSomething2 writes to the database, not to this dict).
    """
    from DataMining.code.com import log, parallels  # , mongo_parallels
    import os
    from ujson import loads
    import gzip
    from pymongo import Connection

    # Hoisted out of the try block so the except handler and the final
    # return work even when the connection or file open fails.
    logger = log.logger('Parallel/' + os.path.basename(filep))
    locs = {}
    try:
        c = Connection('localhost')
        db = c['tweets']
        f = gzip.open(filep)
        logger.log('finding all records with location for: ' + f.name)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        while line:
            # print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                parallels.bdDoSomething2(rec, db, filep)
                loc_lines += 1
                # Progress report every 10000 matching records.
                if (loc_lines % 10000 == 0):
                    logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
            line = f.readline()
        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        # Fix: the original `try:` had no except/finally clause, which is a
        # SyntaxError; mirror the error handling used by sibling variants.
        logger.log('Error log: ' + str(e))
    return locs
def processLocs():
    """Run the all-locations search over every file in `input_dir`.

    Wires the module-level `ka` object into a BigData runner and returns it
    after all input files have been processed.
    """
    print('Starting all locations search:')
    run_logger = logger('AllLocs')
    runner = BigData(run_logger, status_line_count=10000)
    runner.obj = ka
    runner.processFiles(BigData.GetInputFiles(input_dir), None)
    return ka
def processLocs():
    """Run the keyword search over every file in `input_dir`.

    Wires the module-level `ka` object into a BigData runner and returns it
    after all input files have been processed.
    """
    print('Starting keyword search:')
    run_logger = logger('Keywords')
    runner = BigData(run_logger)
    runner.obj = ka
    runner.processFiles(BigData.GetInputFiles(input_dir), None)
    return ka
def __init__(self, params, outDir):
    '''
    Constructor

    params: a list of city names
    outDir: is the output directory path
    '''
    self.outDir = outDir
    # One City2 tracker per city name, keyed by the name itself.
    self.d = {}
    for city_name in params:
        self.d[city_name] = City2(city_name, self.getOutFile(city_name))
    self.logger = logger('Multiple_cities')
    self.curCity = ''
def __init__(self, params, outDir):
    """
    Constructor

    params: a list of city names
    outDir: is the output directory path
    """
    self.outDir = outDir
    # One City2 tracker per city name, keyed by the name itself.
    self.d = {name: City2(name, self.getOutFile(name)) for name in params}
    self.logger = logger("Multiple_cities")
    self.curCity = ""
def processFile(filep):
    """Write up to 10000 condition-matching records to a sample file.

    filep: path to a gzip-compressed file containing one JSON record per line.
    Matching records are re-serialized, one per line, to
    ./DataMining/sample_data/<basename>_10000.sample. Returns a stats dict
    ({} if an error occurred before the scan finished).
    """
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads, dumps
    import gzip
    logger = log.logger('Parallel/' + 'sampleCreate_' + os.path.basename(filep))
    ret = {}
    f = None
    outf = None
    try:
        f = gzip.open(filep)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        logger.log('finding all records with location for: ' + f.name)
        outf = open(
            './DataMining/sample_data/' + os.path.basename(filep) +
            '_10000.sample', 'wb')
        while line:
            # print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                # write rec to outfile
                outf.write(dumps(rec) + '\n')
                loc_lines += 1
                # Stop once the 10000-record sample is complete.
                if (loc_lines % 10000 == 0):
                    break
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
            line = f.readline()
        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    finally:
        # Fix: both handles leaked when an exception fired before the close
        # (the original only closed outf on the success path, and never f).
        if outf is not None:
            outf.close()
        if f is not None:
            f.close()
    return ret
def processFile(filep):
    """Collect data from every condition-matching record of a gzipped file.

    filep: path to a gzip-compressed file containing one JSON record per line.
    Returns the `locs` dict populated in place by
    parallels.bdDoSomethingMemory ({} if an error occurred).
    """
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads
    import gzip
    locs = {}
    logger = log.logger('Parallel/AllLocsBigData_' + os.path.basename(filep))
    try:
        handle = gzip.open(filep)
        logger.log('finding all records with location for: ' + handle.name)
        total = 0
        matched = 0
        for raw_line in handle:
            record = loads(raw_line)
            total += 1
            if parallels.bdCheckCondition(record):
                parallels.bdDoSomethingMemory(record, locs)
                matched += 1
                # Progress report every 10000 matching records.
                if matched % 10000 == 0:
                    logger.log('Count:' + str(matched) + '/' + str(total))
        stats = {'fname': handle.name, 'tot_lines': total, 'loc_lines': matched}
        logger.send_final_stats(stats)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return locs
def processFile(filep):
    """Write up to 10000 condition-matching records to a sample file.

    filep: path to a gzip-compressed file containing one JSON record per line.
    Matching records are re-serialized, one per line, to
    ./DataMining/sample_data/<basename>_10000.sample. Returns a stats dict
    ({} if an error occurred before the scan finished).
    """
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads, dumps
    import gzip
    logger = log.logger('Parallel/' + 'sampleCreate_' + os.path.basename(filep))
    ret = {}
    f = None
    outf = None
    try:
        f = gzip.open(filep)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        logger.log('finding all records with location for: ' + f.name)
        outf = open('./DataMining/sample_data/' + os.path.basename(filep) + '_10000.sample', 'wb')
        while line:
            # print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                # write rec to outfile
                outf.write(dumps(rec) + '\n')
                loc_lines += 1
                # Stop once the 10000-record sample is complete.
                if (loc_lines % 10000 == 0):
                    break
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
            line = f.readline()
        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    finally:
        # Fix: both handles leaked when an exception fired before the close
        # (the original only closed outf on the success path, and never f).
        if outf is not None:
            outf.close()
        if f is not None:
            f.close()
    return ret
def processFile(filep):
    """Collect data from every condition-matching record of a gzipped file.

    filep: path to a gzip-compressed file containing one JSON record per line.
    Returns the `locs` dict populated in place by
    parallels.bdDoSomethingMemory ({} if an error occurred).
    """
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads
    import gzip
    locs = {}
    logger = log.logger('Parallel/AllLocsBigData_' + os.path.basename(filep))
    try:
        handle = gzip.open(filep)
        logger.log('finding all records with location for: ' + handle.name)
        total = 0
        matched = 0
        for raw_line in handle:
            record = loads(raw_line)
            total += 1
            if parallels.bdCheckCondition(record):
                parallels.bdDoSomethingMemory(record, locs)
                matched += 1
                # Progress report every 10000 matching records.
                if matched % 10000 == 0:
                    logger.log('Count:' + str(matched) + '/' + str(total))
        stats = {'fname': handle.name, 'tot_lines': total, 'loc_lines': matched}
        logger.send_final_stats(stats)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return locs
@author: gparuthi
'''
from DataMining.code.com.log import logger
from DataMining.code.com.BigData import BigData
from DataMining.code.com.city import City
from DataMining.code.com.cities import Cities
import os

# City to extract; every path below is derived from this name.
CITY_NAME = 'london'

# Run configuration for start(). All paths are hard-coded to the author's
# local machine.
params = {
    'input_dir_path': '/Users/gaurav/Documents/Work/Projects/DataMining/data/',
    'out_file_path': '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/' + CITY_NAME + '/' + CITY_NAME + '.data',
    'timeline_path': '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/' + CITY_NAME + '/' + CITY_NAME + '.timeline.json',
    'logger': logger('OneCity')
}

def GetInputFiles(dir):
    # Return the full path of every entry directly inside `dir`
    # (no recursion, no filtering).
    paths = []
    for f in os.listdir(dir):
        paths.append(os.path.join(dir, f))
    return paths

def start(params):
    # crawl each data file and get data for the given location
    # store the data in the output file
    # NOTE(review): this function appears truncated in this chunk —
    # `input_files` is collected but never consumed here.
    bd = BigData(params)
    city = City(CITY_NAME, bd, params['out_file_path'])
    input_files = bd.GetInputFiles(params['input_dir_path'])
    # Generate the tdf for the city
import gzip
from ujson import loads, dumps
from datetime import datetime
import os
from pprint import pprint
from DataMining.code.com.log import logger
from dateutil.parser import parse

# Module-level logger used by the log() helper below.
logo = logger('BigDataLocations')
# LOGFILE_PATH = '/Users/gaurav/Documents/Work/Projects/DataMining/logs/' + 'BigData.' + str(datetime.now()) + '.log'
# LOGFILE = open(LOGFILE_PATH,'wb')

def log(log_str):
    """Forward a message to the module logger."""
    logo.log(log_str)
    # print str(log_str)
    # LOGFILE.write(str(log_str) + '\n')

def log_final_stats(res):
    """Log aggregate line counts for a batch of per-file results.

    res: an array of arrays; each element has the format
    ['filename', tot_lines, loc_lines].
    """
    log('----------------------------------------------------------------------')
    log('Final results:' + str(res))
    tot_lines = 0
    loc_lines = 0
    for r in res:
        tot_lines += r[1]
        loc_lines += r[2]
    log('Total Lines found: ' + str(tot_lines))
    log('Total lines with coordinates: ' + str(loc_lines))
    log('----------------------------------------------------------------------')
    # Fix: LOGFILE is commented out above, so LOGFILE.close() raised
    # NameError every time this function ran; disabled to match.
    # LOGFILE.close()
from DataMining.code.com.cities import Cities
import os

# City to extract; every path below is derived from this name.
CITY_NAME = 'london'

# Run configuration for start(). All paths are hard-coded to the author's
# local machine.
# NOTE(review): `logger` is not imported in this chunk — presumably imported
# earlier in the file; verify.
params = {
    'input_dir_path': '/Users/gaurav/Documents/Work/Projects/DataMining/data/',
    'out_file_path': '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/' + CITY_NAME + '/' + CITY_NAME + '.data',
    'timeline_path': '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/' + CITY_NAME + '/' + CITY_NAME + '.timeline.json',
    'logger': logger('OneCity')
}

def GetInputFiles(dir):
    # Return the full path of every entry directly inside `dir`
    # (no recursion, no filtering).
    paths = []
    for f in os.listdir(dir):
        paths.append(os.path.join(dir, f))
    return paths

def start(params):
    # crawl each data file and get data for the given location
    # store the data in the output file
    # NOTE(review): this function appears truncated in this chunk —
    # `BigData` and `City` are not imported in the visible lines.
    bd = BigData(params)
    city = City(CITY_NAME, bd, params['out_file_path'])