def processLocs():
    print 'Starting keyword search:'
    logo = logger('Keywords')
    bd = BigData(logo)
    bd.obj = ka
    # bd.processFile(open('/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/ny_11_1_to_11_15.data'), None)
    bd.processFiles(BigData.GetInputFiles(input_dir), None)
    return ka
def processAllLocs():  # renamed from processLocs to avoid shadowing the keyword variant above
    print 'Starting all locations search:'
    logo = logger('AllLocs')
    bd = BigData(logo, status_line_count=10000)
    bd.obj = ka
    # bd.processFile(open('/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/ny_11_1_to_11_15.data'), None)
    bd.processFiles(BigData.GetInputFiles(input_dir), None)
    return ka
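Both helpers hand a shared module-level object (`ka`, presumably a keyword-analysis accumulator) to BigData via `bd.obj`. Judging from the Cities class later in this set, BigData appears to call `CheckCondition(rec)` and `DoSomething(rec)` on that object for each parsed record. A minimal sketch of such an object, assuming that protocol (the class name, keyword list, and counting logic are illustrative, not from the source):

class KeywordAnalyzer(object):
    # hypothetical stand-in for the module-level `ka` object
    def __init__(self, keywords):
        self.keywords = [k.lower() for k in keywords]
        self.counts = {}

    def CheckCondition(self, rec):
        # process only records whose text mentions a tracked keyword
        return 'text' in rec and any(k in rec['text'].lower() for k in self.keywords)

    def DoSomething(self, rec):
        text = rec['text'].lower()
        for k in self.keywords:
            if k in text:
                self.counts[k] = self.counts.get(k, 0) + 1

ka = KeywordAnalyzer(['flood', 'storm'])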
def start(params):
    # crawl each data file, collect records for the given location,
    # and store them in the output file
    bd = BigData(params)
    city = City(CITY_NAME, bd, params['out_file_path'])
    input_files = bd.GetInputFiles(params['input_dir_path'])
    # generate the term-frequency dictionary (TDF) for the city
    city.generateTDF(input_files)
    # get nouns for the city
    city.getNounsTDF()
    # load another BigData obj for generating the timeline; rebinding
    # `params` below would drop 'timeline_path', so save it first
    timeline_path = params['timeline_path']
    params = {
        'input_dir_path': '',
        'input_file_path': city.filep,
        'out_file_path': None,
        'logger': params['logger'],
    }
    bd = BigData(params)
    # get timeline for city
    city.getTimeLine(bd)
    # write timeline to file
    city.writeTimelineToFile(timeline_path)
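start() reads several keys from its params dict; the ones visible in the body are 'input_dir_path', 'out_file_path', 'timeline_path', and 'logger' (BigData's constructor may expect more). A hypothetical invocation, with made-up paths:

params = {
    'input_dir_path': './DataMining/data/',
    'input_file_path': None,
    'out_file_path': './DataMining/out/' + CITY_NAME + '.data',
    'timeline_path': './DataMining/out/' + CITY_NAME + '_timeline.json',
    'logger': logger(CITY_NAME),
}
start(params)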
from IPython import parallel
from datetime import datetime
import os

from DataMining.code.com import log, parallels
from DataMining.code.com.BigData import BigData

rc = parallel.Client()
lview = rc.load_balanced_view()
lview.block = True

input_files = BigData.GetInputFiles('./DataMining/data/')


@lview.parallel()
def processFile(filep):
    # imports are repeated inside the function so each engine in the
    # cluster has them when the function is shipped over
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads, dumps
    import gzip

    outfilep = './DataMining/uncompressed/sel_cities/' + os.path.basename(filep) + '.json'
    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    locs = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        # the loop body continues in a later fragment of this script
# module-level imports assumed by this class (reconstructed): os, nltk,
# ujson.dumps, and the project's logger, BigData, and City2
import os
import nltk
from ujson import dumps


class Cities(object):
    '''
    Holds one City2 entry per tracked city and drives BigData over the
    input files to collect per-city records and term frequencies.
    '''

    def __init__(self, params, outDir):
        '''
        Constructor
        params: a list of city names
        outDir: the output directory path
        '''
        self.outDir = outDir
        self.d = dict((x, City2(x, self.getOutFile(x))) for x in params)
        self.logger = logger('Multiple_cities')
        self.curCity = ''

    def getOutFile(self, city_name):
        # return the output file path for the given city
        return os.path.join(self.outDir, city_name + '.data')

    def generateTDF(self, input_files):
        self.bd = BigData(self.logger)
        self.bd.obj = self
        # the original assigned to `self.bdCheckCondition` / `self.bdDoSomething`,
        # which only creates unused attributes on Cities; attaching the
        # callbacks to the BigData instance looks like what was intended
        self.bd.CheckCondition = self.CheckCondition
        self.bd.DoSomething = self.DoSomething
        self.bd.processFiles(input_files, None)

    def CheckCondition(self, rec):
        # keep only records whose user profile names one of our cities
        if 'user' in rec:
            user_data = rec['user']
            if 'location' in user_data:
                if user_data['location'] is not None:
                    loc = user_data['location'].lower()
                    for k in self.d:
                        if k in loc:
                            self.curCity = k
                            return True
                else:
                    return False
        return False

    def DoSomething(self, rec):
        text = rec['text'].encode('utf-8')
        loc = self.curCity
        sentences = nltk.sent_tokenize(text)  # NLTK default sentence segmenter
        sentences = [nltk.word_tokenize(sent) for sent in sentences]  # NLTK word tokenizer
        # sentences = nltk.pos_tag(sentences)
        for sent in sentences:
            for w in sent:
                if w in self.d[loc].tdf:
                    self.d[loc].tdf[w] += 1
                else:
                    self.d[loc].tdf[w] = 1
        # write the matching record to the city's output file
        self.d[loc].write(dumps(rec))

    def getNounsTDF(self):
        for k in self.d:
            self.d[k].getNounsTDF()

    def getTimeLine(self):
        bd = BigData(self.logger)
        for k in self.d:
            self.d[k].getTimeLine(bd)
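A sketch of driving Cities end to end (the city names and directories are made up; the method order mirrors start() above):

cities = Cities(['new york', 'chicago'], './DataMining/out/cities')
input_files = BigData.GetInputFiles('./DataMining/data/')
cities.generateTDF(input_files)  # filter records per city and build term frequencies
cities.getNounsTDF()             # reduce each city's TDF to nouns
cities.getTimeLine()             # build a per-city timeline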
        # continuation of processFile() from the earlier fragment; the first
        # lines of the loop body (parsing the record and the location check)
        # were lost, so that part is reconstructed here as a best guess
        tot_lines += 1
        try:
            rec = loads(line)
            # bdCheckCondition is an assumed helper name; only the
            # bdDoSomethingMemory call survives in the source
            if parallels.bdCheckCondition(rec):
                parallels.bdDoSomethingMemory(rec, locs)
                loc_lines += 1
                if loc_lines % 10000 == 0:
                    logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        except Exception as e:
            logger.log('Error log: ' + str(e))
        line = f.readline()
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    logger.send_final_stats(ret)
    # send the results to mongodb
    # logger.log('Sending to _ now..')
    # try:
    #     helpers.write_all_locs_to_file('', [locs])
    # except Exception as e:
    #     logger.log('Error log: ' + str(e))
    return locs


# def start(input_files):
#     print 'starting now..'
#     starttime = datetime.now()
#     res = processFile.map(input_files)
#     print 'time taken= ' + str(datetime.now() - starttime)

input_files = BigData.GetInputFiles(settings.INPUT_DIR)  # `settings` must be importable here

print 'starting now..'
starttime = datetime.now()
res = processFile.map(input_files)
print 'time taken= ' + str(datetime.now() - starttime)
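Because `lview.block` is True, `processFile.map(input_files)` blocks until every engine finishes, and `res` holds one `locs` dict per input file. The internal structure of `locs` is set by `parallels.bdDoSomethingMemory` and isn't shown here, so this merge step is only a sketch assuming location-name keys with record lists as values:

# combine the per-file results returned by the engines
merged = {}
for locs in res:
    for loc, recs in locs.items():
        merged.setdefault(loc, []).extend(recs)
print 'distinct locations found: ' + str(len(merged))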