Example #1
def processFile(filep):
    from DataMining.code.com import log, parallels  #, mongo_parallels
    import os
    from ujson import loads
    import gzip
    from redis import Redis

    c = Redis(host='dhcp2-240.si.umich.edu', port=6379, db=0)
    # c = Connection('localhost')

    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    times = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        #print line
        rec = loads(line)
        tot_lines += 1
        condition = parallels.bdCheckCondition(rec)
        if condition:
            parallels.bdDoSomethingMemory(rec, times)
            loc_lines += 1
            if (loc_lines % 10000 == 0):
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    logger.send_final_stats(ret)
    return times
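The worker above only handles a single gzip file. The following is a minimal driver sketch, not part of the original source, showing how such a worker might be fanned out over a directory of input files with multiprocessing.Pool; the directory path and the get_input_files helper are assumptions made for illustration (the later examples define a similar GetInputFiles).

from multiprocessing import Pool
import os

def get_input_files(input_dir):
    # hypothetical helper mirroring GetInputFiles from the later examples
    return [os.path.join(input_dir, name) for name in os.listdir(input_dir)]

if __name__ == '__main__':
    files = get_input_files('./DataMining/data/')  # assumed input directory
    pool = Pool(processes=4)
    results = pool.map(processFile, files)  # one times dict per input file
    pool.close()
    pool.join()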
Example #2
def processFile(filep):
        from DataMining.code.com import log, parallels
        import os
        from ujson import loads, dumps
        import gzip
        
        outfilep = './DataMining/uncompressed/sel_cities/'+ os.path.basename(filep) + '.json'
        f = gzip.open(filep)
        logger = log.logger('Parallel/'+os.path.basename(filep))
        logger.log( 'finding all records with location for: ' + f.name)
        locs = {}
        tot_lines =0
        loc_lines =0
        line = f.readline()
        while line:
            #print line                                                                                                                                                                                                                                                                                                       
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition_keywords(rec,parallels.sel_cities)
            if condition:
                parallels.bdDoSomething_keywords(rec,locs,parallels.keywords)
                loc_lines += 1
                if (loc_lines%1000==0):
                    logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
            line = f.readline()
        ret = {'fname':f.name,'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.log('Writing json to file: ' + outfilep)
        f.close()
        outf = open(outfilep, 'wb')
        outf.write(dumps(locs))
        outf.close()
        del locs
        return ret
Example #3
def processFile(filep):
        from DataMining.code.com import log, parallels #, mongo_parallels
        import os
        from ujson import loads
        import gzip
        import redis

        c = redis.Redis(host='dhcp2-240.si.umich.edu', port=6379, db=0)
        # c = Connection('localhost')

        f = gzip.open(filep)
        logger = log.logger('Parallel/'+os.path.basename(filep))
        logger.log( 'finding all records with location for: ' + f.name)
        times = {}
        tot_lines =0
        loc_lines =0
        line = f.readline()
        while line:
            #print line                                                                                               
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                parallels.bdDoSomethingMemory(rec, times)
                loc_lines += 1
                if (loc_lines%10000==0):
                    logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
            line = f.readline()
        ret = {'fname':f.name,'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
        return times
Example #4
def processFile(filep):
        from DataMining.code.com import log, parallels #, mongo_parallels
        import os
        from ujson import loads
        import gzip
        from pymongo import Connection
        
        try:
            c = Connection('localhost')
            db = c['tweets']


            f = gzip.open(filep)
            logger = log.logger('Parallel/'+os.path.basename(filep))
            logger.log( 'finding all records with location for: ' + f.name)
            locs = {}
            tot_lines =0
            loc_lines =0
            line = f.readline()
            while line:
                #print line                                                                                               
                rec = loads(line)
                tot_lines += 1
                condition = parallels.bdCheckCondition(rec)
                if condition:
                    parallels.bdDoSomething2(rec, db, filep)
                    loc_lines += 1
                    if (loc_lines%10000==0):
                        logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
                line = f.readline()
            ret = {'fname':f.name,'tot_lines': tot_lines, 'loc_lines': loc_lines}
            logger.send_final_stats(ret)
        except Exception as e:
            logger.log('Error log: ' + str(e))
        return locs
Example #5
def processLocs():
    print 'Starting all locations search:'
    logo = logger('AllLocs')
    bd = BigData(logo, status_line_count=10000)
    bd.obj = ka
    #bd.processFile(open('/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/ny_11_1_to_11_15.data'), None)
    bd.processFiles(BigData.GetInputFiles(input_dir), None)
    return ka
Example #6
def processLocs():
    print 'Starting keyword search:'
    logo = logger('Keywords')
    bd = BigData(logo)
    bd.obj = ka
    #bd.processFile(open('/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/ny_11_1_to_11_15.data'), None)
    bd.processFiles(BigData.GetInputFiles(input_dir), None)
    return ka
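Both processLocs variants rely on module-level names (logger, BigData, ka, input_dir) that are not shown in the snippet. A minimal sketch of the assumed surrounding setup follows, with ka guessed as a plain dict accumulator; the path reuses the input directory from Example #13 and is otherwise an assumption.

from DataMining.code.com.log import logger
from DataMining.code.com.BigData import BigData

# hypothetical module-level state assumed by the two functions above
input_dir = '/Users/gaurav/Documents/Work/Projects/DataMining/data/'
ka = {}  # accumulator handed to BigData via bd.obj and returned to the caller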
Example #7
def __init__(self, params, outDir):
    '''
    Constructor
    params: a list of city names
    outDir: the output directory path
    '''
    self.outDir = outDir
    self.d = dict((x, City2(x, self.getOutFile(x))) for x in params)
    self.logger = logger('Multiple_cities')
    self.curCity = ''
Example #8
def __init__(self, params, outDir):
    """
    Constructor
    params: a list of city names
    outDir: the output directory path
    """
    self.outDir = outDir
    self.d = dict((x, City2(x, self.getOutFile(x))) for x in params)
    self.logger = logger("Multiple_cities")
    self.curCity = ""
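Both constructors call self.getOutFile(x), which is not shown in these examples. A plausible sketch, purely an assumption about its behavior, maps a city name to a per-city output path under self.outDir:

def getOutFile(self, city_name):
    # hypothetical implementation: one output file per city inside self.outDir
    import os
    return os.path.join(self.outDir, city_name + '.data')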
Example #9
def processFile(filep):
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads, dumps
    import gzip

    logger = log.logger('Parallel/' + 'sampleCreate_' +
                        os.path.basename(filep))

    ret = {}

    try:
        f = gzip.open(filep)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        logger.log('finding all records with location for: ' + f.name)
        outf = open(
            './DataMining/sample_data/' + os.path.basename(filep) +
            '_10000.sample', 'wb')
        while line:
            #print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                # write rec to outfile
                outf.write(dumps(rec) + '\n')
                loc_lines += 1
                if (loc_lines % 10000 == 0):
                    logger.log('Count:' + str(loc_lines) + '/' +
                               str(tot_lines))
                    break
            line = f.readline()

        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
        outf.close()
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return ret
Example #10
def processFile(filep):
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads
    import gzip
    locs = {}
    logger = log.logger('Parallel/AllLocsBigData_' + os.path.basename(filep))

    try:
        f = gzip.open(filep)
        # f = open(filep)
        logger.log('finding all records with location for: ' + f.name)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        while line:
            #print line
            rec = loads(line)
            tot_lines += 1
            condition = parallels.bdCheckCondition(rec)
            if condition:
                parallels.bdDoSomethingMemory(rec, locs)
                loc_lines += 1
                if (loc_lines % 10000 == 0):
                    logger.log('Count:' + str(loc_lines) + '/' +
                               str(tot_lines))
            line = f.readline()
        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    # send the results to mongodb
    # logger.log('Sending to _ now..')
    # try:
    #     helpers.write_all_locs_to_file('',[locs])
    # except Exception as e:
    #     logger.log('Error log: ' + str(e))
    return locs
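Several examples (#1, #3, #10, #12) pass a shared dict (times or locs) into parallels.bdDoSomethingMemory, whose implementation is not shown. A hypothetical sketch of that accumulation pattern for the locs case, assuming the records are tweet-like JSON objects carrying a place field, could look like this:

def bdDoSomethingMemory(rec, acc):
    # hypothetical accumulator: count matching records per place name
    place = rec.get('place') or {}
    key = place.get('full_name', 'unknown')
    acc[key] = acc.get(key, 0) + 1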
Example #11
def processFile(filep):
        from DataMining.code.com import log, parallels
        import os
        from ujson import loads,dumps
        import gzip


        logger = log.logger('Parallel/'+'sampleCreate_'+os.path.basename(filep))
        
        ret = {}
            
        try:
            f = gzip.open(filep)
            tot_lines =0
            loc_lines =0
            line = f.readline()
            logger.log( 'finding all records with location for: ' + f.name)
            outf = open('./DataMining/sample_data/'+os.path.basename(filep)+'_10000.sample', 'wb')
            while line:
                #print line                                                                                               
                rec = loads(line)
                tot_lines += 1
                condition = parallels.bdCheckCondition(rec)
                if condition:
                    # write rec to outfile
                    outf.write(dumps(rec)+'\n')
                    loc_lines += 1
                    if (loc_lines%10000==0):
                        logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
                        break
                line = f.readline()

            ret = {'fname':f.name,'tot_lines': tot_lines, 'loc_lines': loc_lines}
            logger.send_final_stats(ret)
            outf.close()
        except Exception as e:
            logger.log('Error log: ' + str(e))
        return ret
Example #12
def processFile(filep):
        from DataMining.code.com import log, parallels
        import os
        from ujson import loads
        import gzip
        locs = {}
        logger = log.logger('Parallel/AllLocsBigData_'+os.path.basename(filep))
        
        try:
            f = gzip.open(filep)
            # f = open(filep)
            logger.log( 'finding all records with location for: ' + f.name)
            tot_lines =0
            loc_lines =0
            line = f.readline()
            while line:
                #print line                                                                                               
                rec = loads(line)
                tot_lines += 1
                condition = parallels.bdCheckCondition(rec)
                if condition:
                    parallels.bdDoSomethingMemory(rec,locs)
                    loc_lines += 1
                    if (loc_lines%10000==0):
                        logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
                line = f.readline()
            ret = {'fname':f.name,'tot_lines': tot_lines, 'loc_lines': loc_lines}
            logger.send_final_stats(ret)
        except Exception as e:
            logger.log('Error log: ' + str(e))
        # send the results to mongodb
        # logger.log('Sending to _ now..')
        # try:
        #     helpers.write_all_locs_to_file('',[locs])
        # except Exception as e:
        #     logger.log('Error log: ' + str(e))
        return locs
Example #13
'''
@author: gparuthi
'''
from DataMining.code.com.log import logger
from DataMining.code.com.BigData import BigData
from DataMining.code.com.city import City
from DataMining.code.com.cities import Cities
import os

CITY_NAME = 'london'

params = { 
              'input_dir_path': '/Users/gaurav/Documents/Work/Projects/DataMining/data/',
              'out_file_path': '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/'+CITY_NAME+'/'+CITY_NAME+'.data',
              'timeline_path':'/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/'+CITY_NAME+'/'+CITY_NAME+'.timeline.json',
              'logger' : logger('OneCity')  
              }

def GetInputFiles(dir):
    paths = []
    for f in os.listdir(dir):
        paths.append(os.path.join(dir,f))        
    return paths

def start(params):
    # crawl each data file and get data for the given location
    # store the data in the output file    
    bd = BigData(params)
    city = City(CITY_NAME,bd,params['out_file_path'])
    input_files = bd.GetInputFiles(params['input_dir_path'])
    # Generate the tdf for the city
Example #14
import gzip
from ujson import loads,dumps
from datetime import datetime
import os
from pprint import pprint
from DataMining.code.com.log import logger
from dateutil.parser import parse

logo = logger('BigDataLocations')
#LOGFILE_PATH = '/Users/gaurav/Documents/Work/Projects/DataMining/logs/' + 'BigData.'+str(datetime.now())+'.log'
#LOGFILE = open(LOGFILE_PATH,'wb')

def log(log_str):
    logo.log(log_str)
#    print str(log_str)
#    LOGFILE.write(str(log_str) + '\n')

def log_final_stats(res):
    # res is an array of arrays
    # the element array is a list of format: '['filename', totlines, loc_lines ]'
    log ('----------------------------------------------------------------------')
    log ('Final results:' + str(res))
    tot_lines = 0
    loc_lines = 0
    for r in res:
        tot_lines += r[1]
        loc_lines += r[2]
    log ('Total Lines found: ' + str (tot_lines))
    log ('Total lines with coordinates: ' +  str(loc_lines))
    log ('----------------------------------------------------------------------')
    # LOGFILE.close()
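log_final_stats expects a list of [filename, tot_lines, loc_lines] entries, while the processFile variants above return dicts. A small adapter, shown here only as an assumed glue step, could convert the per-file dicts before logging:

def collect_and_log(per_file_dicts):
    # convert the {'fname', 'tot_lines', 'loc_lines'} dicts returned by the
    # processFile variants into the list layout log_final_stats expects
    rows = [[d['fname'], d['tot_lines'], d['loc_lines']] for d in per_file_dicts]
    log_final_stats(rows)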
Example #15
from DataMining.code.com.log import logger
from DataMining.code.com.BigData import BigData
from DataMining.code.com.city import City
from DataMining.code.com.cities import Cities
import os

CITY_NAME = 'london'

params = {
    'input_dir_path':
    '/Users/gaurav/Documents/Work/Projects/DataMining/data/',
    'out_file_path':
    '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/'
    + CITY_NAME + '/' + CITY_NAME + '.data',
    'timeline_path':
    '/Users/gaurav/Documents/Work/Projects/DataMining/uncompressed/locations_cities/'
    + CITY_NAME + '/' + CITY_NAME + '.timeline.json',
    'logger':
    logger('OneCity')
}


def GetInputFiles(dir):
    paths = []
    for f in os.listdir(dir):
        paths.append(os.path.join(dir, f))
    return paths


def start(params):
    # crawl each data file and get data for the given location
    # store the data in the output file
    bd = BigData(params)
    city = City(CITY_NAME, bd, params['out_file_path'])