Example #1
def processFile(filep):
    from DataMining.code.com import log, parallels  #, mongo_parallels
    import os
    from ujson import loads
    import gzip
    from redis import Redis

    c = Redis(host='dhcp2-240.si.umich.edu', port=6379, db=0)
    # c = Connection('localhost')

    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    times = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        #print line
        rec = loads(line)
        tot_lines += 1
        condition = parallels.bdCheckCondition(rec)
        if condition:
            parallels.bdDoSomethingMemory(rec, times)
            loc_lines += 1
            if (loc_lines % 10000 == 0):
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    logger.send_final_stats(ret)
    return times
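These processFile variants are written to be mapped over many gzipped input files; in the later examples they are driven through BigData.processFiles. Below is a minimal sketch of doing the same with the standard-library multiprocessing.Pool. The run_all helper, its input_dir and workers parameters, and the .gz filter are illustrative assumptions, not part of the original code.

# Illustrative sketch (not from the original project): fan processFile out
# over every .gz file in a directory and collect the per-file results.
import os
from multiprocessing import Pool

def run_all(input_dir, workers=4):
    # input_dir and workers are assumed parameters for this sketch
    paths = [os.path.join(input_dir, name)
             for name in os.listdir(input_dir) if name.endswith('.gz')]
    pool = Pool(workers)
    try:
        # each call returns the per-file dict built by processFile above
        results = pool.map(processFile, paths)
    finally:
        pool.close()
        pool.join()
    return results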
Example #2
def processFile(filep):
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads, dumps
    import gzip

    outfilep = './DataMining/uncompressed/sel_cities/' + os.path.basename(filep) + '.json'
    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    locs = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        #print line
        rec = loads(line)
        tot_lines += 1
        condition = parallels.bdCheckCondition_keywords(rec, parallels.sel_cities)
        if condition:
            parallels.bdDoSomething_keywords(rec, locs, parallels.keywords)
            loc_lines += 1
            if (loc_lines % 1000 == 0):
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    f.close()
    logger.log('Writing json to file: ' + outfilep)
    outf = open(outfilep, 'wb')
    outf.write(dumps(locs))
    outf.close()
    del locs
    return ret
Example #3
def processFile(filep):
    from DataMining.code.com import log, parallels  #, mongo_parallels
    import os
    from ujson import loads
    import gzip
    # from redis import Redis

    # c = Redis(host='dhcp2-240.si.umich.edu', port=6379, db=0)
    # c = Connection('localhost')

    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    times = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        #print line
        rec = loads(line)
        tot_lines += 1
        condition = parallels.bdCheckCondition(rec)
        if condition:
            parallels.bdDoSomethingMemory(rec, times)
            loc_lines += 1
            if (loc_lines % 10000 == 0):
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    # per-file stats (not sent to the logger in this variant)
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    return times
Example #4
def processFile(filep):
    from DataMining.code.com import log, parallels  #, mongo_parallels
    import os
    from ujson import loads
    import gzip
    from pymongo import Connection

    locs = {}
    logger = log.logger('Parallel/' + os.path.basename(filep))
    try:
        c = Connection('localhost')
        db = c['tweets']

        f = gzip.open(filep)
        logger.log('finding all records with location for: ' + f.name)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        while line:
            #print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                # bdDoSomething2 writes matching records to MongoDB, so locs stays empty here
                parallels.bdDoSomething2(rec, db, filep)
                loc_lines += 1
                if (loc_lines % 10000 == 0):
                    logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
            line = f.readline()
        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return locs
Example #5
def processLocs():
    print 'Starting all locations search:'
    logo = logger('AllLocs')
    bd = BigData(logo, status_line_count=10000)
    bd.obj = ka
    #bd.processFile(open('/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/ny_11_1_to_11_15.data'), None)
    bd.processFiles(BigData.GetInputFiles(input_dir), None)
    return ka
Example #6
def processLocs():
    print 'Starting keyword search:'
    logo = logger('Keywords')
    bd = BigData(logo)
    bd.obj = ka
    #bd.processFile(open('/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/ny_11_1_to_11_15.data'), None)
    bd.processFiles(BigData.GetInputFiles(input_dir), None)
    return ka
Example #7
 def __init__(self, params, outDir):
     '''
     Constructor
     params: a list of city names
     outDir: the output directory path
     '''
     self.outDir = outDir
     self.d = dict((x,City2(x, self.getOutFile(x))) for x in params)
     self.logger = logger('Multiple_cities')
     self.curCity = ''
Example #8
 def __init__(self, params, outDir):
     """
     Constructor
     params: a list of city names
     outDir: the output directory path
     """
     self.outDir = outDir
     self.d = dict((x, City2(x, self.getOutFile(x))) for x in params)
     self.logger = logger("Multiple_cities")
     self.curCity = ""
Example #9
def processFile(filep):
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads, dumps
    import gzip

    logger = log.logger('Parallel/' + 'sampleCreate_' +
                        os.path.basename(filep))

    ret = {}

    try:
        f = gzip.open(filep)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        logger.log('finding all records with location for: ' + f.name)
        outf = open(
            './DataMining/sample_data/' + os.path.basename(filep) +
            '_10000.sample', 'wb')
        while line:
            #print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                # write rec to outfile
                outf.write(dumps(rec) + '\n')
                loc_lines += 1
                if (loc_lines % 10000 == 0):
                    # log progress, then stop once the 10000-record sample is written
                    logger.log('Count:' + str(loc_lines) + '/' +
                               str(tot_lines))
                    break
            line = f.readline()

        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
        outf.close()
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return ret
Example #10
def processFile(filep):
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads
    import gzip
    locs = {}
    logger = log.logger('Parallel/AllLocsBigData_' + os.path.basename(filep))

    try:
        f = gzip.open(filep)
        # f = open(filep)
        logger.log('finding all records with location for: ' + f.name)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        while line:
            #print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                parallels.bdDoSomethingMemory(rec, locs)
                loc_lines += 1
                if (loc_lines % 10000 == 0):
                    logger.log('Count:' + str(loc_lines) + '/' +
                               str(tot_lines))
            line = f.readline()
        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    # send the results to mongodb
    # logger.log('Sending to _ now..')
    # try:
    #     helpers.write_all_locs_to_file('',[locs])
    # except Exception as e:
    #     logger.log('Error log: ' + str(e))
    return locs
Example #13
'''
@author: gparuthi
'''
from DataMining.code.com.log import logger
from DataMining.code.com.BigData import BigData
from DataMining.code.com.city import City
from DataMining.code.com.cities import Cities
import os

CITY_NAME = 'london'

params = {
    'input_dir_path': '/Users/gaurav/Documents/Work/Projects/DataMining/data/',
    'out_file_path': '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/' + CITY_NAME + '/' + CITY_NAME + '.data',
    'timeline_path': '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/' + CITY_NAME + '/' + CITY_NAME + '.timeline.json',
    'logger': logger('OneCity')
}

def GetInputFiles(dir):
    paths = []
    for f in os.listdir(dir):
        paths.append(os.path.join(dir,f))        
    return paths

def start(params):
    # crawl each data file and get data for the given location
    # store the data in the output file    
    bd = BigData(params)
    city = City(CITY_NAME,bd,params['out_file_path'])
    input_files = bd.GetInputFiles(params['input_dir_path'])
    # Generate the tdf for the city
import gzip
from ujson import loads,dumps
from datetime import datetime
import os
from pprint import pprint
from DataMining.code.com.log import logger
from dateutil.parser import parse

logo = logger('BigDataLocations')
#LOGFILE_PATH = '/Users/gaurav/Documents/Work/Projects/DataMining/logs/' + 'BigData.'+str(datetime.now())+'.log'
#LOGFILE = open(LOGFILE_PATH,'wb')

def log(log_str):
    logo.log(log_str)
#    print str(log_str)
#    LOGFILE.write(str(log_str) + '\n')

def log_final_stats(res):
    # res is an array of arrays
    # the element array is a list of format: '['filename', totlines, loc_lines ]'
    log('----------------------------------------------------------------------')
    log('Final results:' + str(res))
    tot_lines = 0
    loc_lines = 0
    for r in res:
        tot_lines += r[1]
        loc_lines += r[2]
    log('Total lines found: ' + str(tot_lines))
    log('Total lines with coordinates: ' + str(loc_lines))
    log('----------------------------------------------------------------------')
    # LOGFILE is commented out above, so there is nothing to close here
    # LOGFILE.close()
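Note that log_final_stats expects each entry of res as a ['filename', tot_lines, loc_lines] list, while the processFile variants above return dicts keyed 'fname', 'tot_lines' and 'loc_lines'. A small, hypothetical adapter (not part of the original module) can bridge the two shapes:

# Hypothetical helper, assuming the dict layout returned by the processFile
# examples above; converts the dicts into the rows log_final_stats expects.
def stats_rows(results):
    return [[r['fname'], r['tot_lines'], r['loc_lines']] for r in results]

# usage: log_final_stats(stats_rows(results))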
Example #15
from DataMining.code.com.log import logger
from DataMining.code.com.BigData import BigData
from DataMining.code.com.city import City
from DataMining.code.com.cities import Cities
import os

CITY_NAME = 'london'

params = {
    'input_dir_path':
    '/Users/gaurav/Documents/Work/Projects/DataMining/data/',
    'out_file_path':
    '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/'
    + CITY_NAME + '/' + CITY_NAME + '.data',
    'timeline_path':
    '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/'
    + CITY_NAME + '/' + CITY_NAME + '.timeline.json',
    'logger':
    logger('OneCity')
}


def GetInputFiles(dir):
    paths = []
    for f in os.listdir(dir):
        paths.append(os.path.join(dir, f))
    return paths


def start(params):
    # crawl each data file and get data for the given location
    # store the data in the output file
    bd = BigData(params)
    city = City(CITY_NAME, bd, params['out_file_path'])