Example #1
0
def processFile(filep):
    """Scan a gzipped file of JSON tweet records and store matching ones in MongoDB.

    For every line that satisfies ``parallels.bdCheckCondition`` the record is
    handed to ``parallels.bdDoSomething2`` together with the ``tweets`` database
    handle. Progress is logged every 10000 matches and final stats are sent to
    the logger.

    :param filep: path to a gzip-compressed file with one JSON record per line.
    :return: dict ``locs`` (currently never populated here; kept for interface
             compatibility with the sibling variants that fill it).
    """
    from DataMining.code.com import log, parallels  # , mongo_parallels
    import os
    from ujson import loads
    import gzip
    from pymongo import Connection

    locs = {}
    # Create the logger outside the try-block so a failure during setup
    # (e.g. Mongo connection refused) can still be reported below.
    logger = log.logger('Parallel/' + os.path.basename(filep))

    try:
        c = Connection('localhost')
        db = c['tweets']

        f = gzip.open(filep)
        logger.log('finding all records with location for: ' + f.name)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        while line:
            rec = loads(line)
            tot_lines += 1
            if parallels.bdCheckCondition(rec):
                parallels.bdDoSomething2(rec, db, filep)
                loc_lines += 1
                if loc_lines % 10000 == 0:
                    logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
            line = f.readline()
        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        # Mirror the error-handling style of the other processFile variants.
        logger.log('Error log: ' + str(e))
    return locs
Example #2
0
def processFile(filep):
    """Scan a gzipped file of JSON records, aggregating matches in memory.

    Lines passing ``parallels.bdCheckCondition`` are folded into the ``times``
    dict via ``parallels.bdDoSomethingMemory``. Progress is logged every 10000
    matches; final counts are sent via ``logger.send_final_stats``.

    :param filep: path to a gzip-compressed file with one JSON record per line.
    :return: the ``times`` accumulator dict.
    """
    from DataMining.code.com import log, parallels  # , mongo_parallels
    import os
    from ujson import loads
    import gzip
    from redis import Redis

    # BUG FIX: the original called `redis.Redis(...)` but only the class
    # `Redis` is imported, which raised NameError at runtime.
    c = Redis(host='dhcp2-240.si.umich.edu', port=6379, db=0)
    # c = Connection('localhost')

    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    times = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        rec = loads(line)
        tot_lines += 1
        if parallels.bdCheckCondition(rec):
            parallels.bdDoSomethingMemory(rec, times)
            loc_lines += 1
            if loc_lines % 10000 == 0:
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
    logger.send_final_stats(ret)
    return times
Example #3
0
def processFile(filep):
    """Scan a gzipped file of JSON records, aggregating matches in memory.

    Lines passing ``parallels.bdCheckCondition`` are folded into the ``times``
    dict via ``parallels.bdDoSomethingMemory``. Progress is logged every 10000
    matches.

    :param filep: path to a gzip-compressed file with one JSON record per line.
    :return: the ``times`` accumulator dict.
    """
    from DataMining.code.com import log, parallels  # , mongo_parallels
    import os
    from ujson import loads
    import gzip
    # from redis import Redis

    # BUG FIX: the original executed `c = redis.Redis(...)` while the redis
    # import was commented out, raising NameError on every call. The handle
    # `c` was never used afterwards, so the line is disabled rather than the
    # import restored.
    # c = redis.Redis(host='dhcp2-240.si.umich.edu', port=6379, db=0)
    # c = Connection('localhost')

    f = gzip.open(filep)
    logger = log.logger('Parallel/' + os.path.basename(filep))
    logger.log('finding all records with location for: ' + f.name)
    times = {}
    tot_lines = 0
    loc_lines = 0
    line = f.readline()
    while line:
        rec = loads(line)
        tot_lines += 1
        if parallels.bdCheckCondition(rec):
            parallels.bdDoSomethingMemory(rec, times)
            loc_lines += 1
            if loc_lines % 10000 == 0:
                logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
        line = f.readline()
    # NOTE(review): unlike the sibling variants, this one never sent the
    # final stats; the unused `ret` dict has been dropped.
    return times
def processFile(filep):
    """Extract up to 10000 matching JSON records into a ``.sample`` file.

    Reads a gzip-compressed file line by line; each record that passes
    ``parallels.bdCheckCondition`` is re-serialized to
    ``./DataMining/sample_data/<basename>_10000.sample``. Stops after 10000
    matches. Any exception is caught and logged, and partial stats may be
    returned.

    :param filep: path to a gzip-compressed file with one JSON record per line.
    :return: stats dict with keys ``fname``/``tot_lines``/``loc_lines``,
             or ``{}`` if an error occurred before they were computed.
    """
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads, dumps
    import gzip

    logger = log.logger('Parallel/' + 'sampleCreate_' +
                        os.path.basename(filep))

    ret = {}

    try:
        f = gzip.open(filep)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        logger.log('finding all records with location for: ' + f.name)
        # `with` guarantees the sample file is closed even if loads()/write()
        # raises (the original leaked the handle on error).
        with open(
                './DataMining/sample_data/' + os.path.basename(filep) +
                '_10000.sample', 'wb') as outf:
            while line:
                rec = loads(line)
                tot_lines += 1
                if parallels.bdCheckCondition(rec):
                    # write rec to outfile
                    outf.write(dumps(rec) + '\n')
                    loc_lines += 1
                    if loc_lines % 10000 == 0:
                        # BUG FIX: the log call was placed after `break`
                        # and therefore unreachable; log first, then stop
                        # once the 10000-record sample is complete.
                        logger.log('Count:' + str(loc_lines) + '/' +
                                   str(tot_lines))
                        break
                line = f.readline()

        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return ret
Example #5
0
def processFile(filep):
    """Collect location-bearing records from a gzipped JSON-lines file.

    Every record accepted by ``parallels.bdCheckCondition`` is merged into the
    ``locs`` accumulator through ``parallels.bdDoSomethingMemory``. Progress is
    reported every 10000 accepted records and the final counts are pushed via
    ``logger.send_final_stats``. Errors are caught and logged, never raised.

    :param filep: path to a gzip-compressed file, one JSON record per line.
    :return: the ``locs`` accumulator dict (possibly empty on early failure).
    """
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads
    import gzip

    locs = {}
    logger = log.logger('Parallel/AllLocsBigData_' + os.path.basename(filep))

    try:
        fh = gzip.open(filep)
        # f = open(filep)
        logger.log('finding all records with location for: ' + fh.name)
        total = 0
        matched = 0
        # Iterating the file object yields lines exactly like the
        # readline() loop it replaces, stopping at EOF.
        for raw in fh:
            record = loads(raw)
            total += 1
            if parallels.bdCheckCondition(record):
                parallels.bdDoSomethingMemory(record, locs)
                matched += 1
                if matched % 10000 == 0:
                    logger.log('Count:' + str(matched) + '/' + str(total))
        stats = {'fname': fh.name, 'tot_lines': total, 'loc_lines': matched}
        logger.send_final_stats(stats)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return locs
def processFile(filep):
    """Extract up to 10000 matching JSON records into a ``.sample`` file.

    Reads a gzip-compressed file line by line; each record that passes
    ``parallels.bdCheckCondition`` is re-serialized to
    ``./DataMining/sample_data/<basename>_10000.sample``. Stops after 10000
    matches. Any exception is caught and logged, and partial stats may be
    returned.

    :param filep: path to a gzip-compressed file with one JSON record per line.
    :return: stats dict with keys ``fname``/``tot_lines``/``loc_lines``,
             or ``{}`` if an error occurred before they were computed.
    """
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads, dumps
    import gzip

    logger = log.logger('Parallel/' + 'sampleCreate_' + os.path.basename(filep))

    ret = {}

    try:
        f = gzip.open(filep)
        tot_lines = 0
        loc_lines = 0
        line = f.readline()
        logger.log('finding all records with location for: ' + f.name)
        # `with` guarantees the sample file is closed even if loads()/write()
        # raises (the original leaked the handle on error).
        with open('./DataMining/sample_data/' + os.path.basename(filep) +
                  '_10000.sample', 'wb') as outf:
            while line:
                rec = loads(line)
                tot_lines += 1
                if parallels.bdCheckCondition(rec):
                    # write rec to outfile
                    outf.write(dumps(rec) + '\n')
                    loc_lines += 1
                    if loc_lines % 10000 == 0:
                        # BUG FIX: the log call was placed after `break`
                        # and therefore unreachable; log first, then stop
                        # once the 10000-record sample is complete.
                        logger.log('Count:' + str(loc_lines) + '/' + str(tot_lines))
                        break
                line = f.readline()

        ret = {'fname': f.name, 'tot_lines': tot_lines, 'loc_lines': loc_lines}
        logger.send_final_stats(ret)
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return ret
Example #7
0
def processFile(filep):
    """Accumulate location-bearing records from a gzipped JSON-lines file.

    Each line is parsed with ujson; records accepted by
    ``parallels.bdCheckCondition`` are merged into ``locs`` via
    ``parallels.bdDoSomethingMemory``. A progress line is logged every 10000
    acceptances and the totals are sent with ``logger.send_final_stats``.
    All exceptions are trapped and logged rather than propagated.

    :param filep: path to a gzip-compressed file, one JSON record per line.
    :return: the ``locs`` accumulator dict (possibly empty on early failure).
    """
    from DataMining.code.com import log, parallels
    import os
    from ujson import loads
    import gzip

    locs = {}
    logger = log.logger('Parallel/AllLocsBigData_' + os.path.basename(filep))

    try:
        source = gzip.open(filep)
        # f = open(filep)
        logger.log('finding all records with location for: ' + source.name)
        seen, kept = 0, 0
        current = source.readline()
        while current:
            seen += 1
            entry = loads(current)
            if parallels.bdCheckCondition(entry):
                kept += 1
                parallels.bdDoSomethingMemory(entry, locs)
                if kept % 10000 == 0:
                    logger.log('Count:' + str(kept) + '/' + str(seen))
            current = source.readline()
        logger.send_final_stats(
            {'fname': source.name, 'tot_lines': seen, 'loc_lines': kept})
    except Exception as e:
        logger.log('Error log: ' + str(e))
    return locs