def wait_for_data(wait_for_config, options):
    """Wait for data before kicking off hive jobs."""
    # Step 1 - read the metadata.
    hive_mysql_connector.configure(options.hive_masternode,
                                   options.ssh_keyfile)
    if options.hive_init:
        hive_mysql_connector.run_hive_init()

    # Step 2 - wait until all the data partitions are available.
    boto_util.initialize_creds_from_file()
    s3conn = boto.connect_s3()
    s3bucket = s3conn.get_bucket('ka-mapreduce')
    max_wait = datetime.timedelta(hours=options.max_wait)
    start = datetime.datetime.now()

    for d in wait_for_config:
        table = d['table_name']
        table_location = hive_mysql_connector.get_table_location(table)
        for p in d['partitions']:
            partition_location = table_location + '/' + p
            # TODO(yunfang): abstract the following into a
            #                wait_for_partition helper in boto_util
            while True:
                if partition_available(s3bucket, partition_location):
                    g_logger.info("%s is available" % partition_location)
                    break
                if (datetime.datetime.now() - start) > max_wait:
                    # We've already waited too long; give up.
                    g_logger.fatal("Waited too long and the data is still "
                                   "not available. Exiting...")
                    sys.exit(1)
                # Wait a minute before checking again.
                g_logger.info("Waiting for %s to be available..." %
                              partition_location)
                time.sleep(60)
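
# NOTE: partition_available() is used above but not defined in this snippet.
# A minimal, hypothetical sketch is below; it assumes a partition counts as
# "available" once any object exists under its prefix in the bucket. The
# name matches the call site, but the behavior is our assumption.
def partition_available(s3bucket, partition_location):
    """Return True if at least one S3 key exists under partition_location."""
    # Convert the 's3://ka-mapreduce/...' URI into the key prefix boto
    # expects for bucket listings.
    prefix = partition_location[len('s3://ka-mapreduce/'):].lstrip('/')
    for _key in s3bucket.list(prefix=prefix):
        return True  # Any key under the prefix means the data has landed.
    return False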
def main(table_location, score_type, options):
    boto_util.initialize_creds_from_file()

    # TODO(benkomalo): factor some of this boilerplate out to boto_util
    # Open our input connections.
    s3conn = boto.connect_s3()
    bucket = s3conn.get_bucket('ka-mapreduce')
    s3keys = bucket.list(
            prefix=table_location[len('s3://ka-mapreduce/'):] + '/')

    # Mapping of video key to the info on the other videos which best match
    # that video.
    # vid_key -> VideoInfo
    video_infos = {}

    # Note: a table's data may be broken down into multiple files on disk.
    delimiter = '\01'
    lines_read = 0
    for key in s3keys:
        if key.name.endswith('_$folder$'):
            # S3 metadata - not useful.
            continue

        contents = key.get_contents_as_string()
        for line in contents.split('\n'):
            if not line:
                # EOF
                break

            lines_read += 1
            parts = line.rstrip().split(delimiter)
            if len(parts) != 3:
                # TODO(benkomalo): error handling
                continue

            vid1_key, vid2_key, score = parts
            try:
                score = float(score)
            except ValueError:
                # Some of the values were invalid - deal with it.
                # TODO(benkomalo): error handling.
                continue

            add_score(video_infos, vid1_key, vid2_key, score)
            if lines_read % 1000 == 0:
                print "Read %s lines..." % lines_read

    total_pairs = sum([len(info.best_matches)
                       for info in video_infos.values()])
    print "\nSummary of collected data:"
    print "\tScore type: [%s]" % score_type
    print ("\tDetected %d videos, with a total of %d video pairs" %
           (len(video_infos), total_pairs))
    print "Target: %s" % oauth_util.consts.SERVER_URL
    if raw_input("Proceed to upload? [Y/n]: ").lower() in ['', 'y', 'yes']:
        upload_to_gae(score_type, video_infos)
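
# NOTE: VideoInfo and add_score() are used above but not defined in this
# snippet. A minimal, hypothetical sketch consistent with the call sites is
# below; only the best_matches attribute is relied on by the summary code,
# and the cap of 10 matches per video is an assumed default.
import heapq


class VideoInfo(object):
    def __init__(self):
        self.best_matches = []  # min-heap of (score, other_video_key) pairs


def add_score(video_infos, vid1_key, vid2_key, score, max_matches=10):
    """Record that vid2_key matches vid1_key with the given score."""
    info = video_infos.setdefault(vid1_key, VideoInfo())
    if len(info.best_matches) < max_matches:
        heapq.heappush(info.best_matches, (score, vid2_key))
    else:
        # Evict the lowest-scoring match to keep only the best ones.
        heapq.heappushpop(info.best_matches, (score, vid2_key))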
def main(table_location, target_db, target_collection, column_info,
         partition_cols, options):
    boto_util.initialize_creds_from_file()

    # TODO(benkomalo): handle other formats? The data may also be compressed,
    # so we may have to do more work.
    # The default row format for the tables is delimited by this control
    # char when stored on disk.
    delimiter = '\01'
    key_index = options.key_index
    num_cols = len(column_info)
    col_names, col_types, _ = zip(*column_info)

    if key_index >= num_cols:
        raise Exception("Invalid key index (there aren't enough columns)")

    base_partition_values = {}
    for partition in partition_cols:
        key, value = partition.split('=')
        base_partition_values[key] = value
    table_location += _format_path_from_partition(partition_cols)

    # Open our target db connection.
    mongo_conn = pymongo.Connection(options.report_db_host,
                                    port=options.report_db_port)
    mongodb = mongo_conn[target_db]
    mongo_collection = mongodb[target_collection]

    # If the drop flag was set, nuke any pre-existing data.
    if options.drop:
        print "\nDropping existing data in collection %s." % target_collection
        # Even though the option is called 'drop', I'm calling 'remove'
        # instead, because it leaves any indexes in place while deleting
        # the data.
        mongo_collection.remove()

    # Open our input connections.
    s3conn = boto.connect_s3()
    bucket = s3conn.get_bucket('ka-mapreduce')
    path_prefix = table_location[len('s3://ka-mapreduce/'):] + '/'
    s3keys = bucket.list(prefix=path_prefix)

    # TODO(benkomalo): add a flag to bail on any errors so no partial data is
    #                  saved?
    # Counts of rows saved, and rows with errors.
    saved = 0
    errors = 0
    NULL_STRING = '\N'
    docs_to_insert = []
    batch_size = max(1, options.batch_size)
    lines_parsed = 0

    # Note: a table's data may be broken down into multiple files on disk.
    for key in s3keys:
        if key.name.endswith('_$folder$'):
            # S3 metadata - not useful.
            continue

        # Find out if files are under additional partition "directories",
        # e.g. "start_dt=2012-01-01/end_dt=2012-02-01/some_file".
        base_path = key.name[len(path_prefix):]
        path_parts = base_path.split('/')
        partition_values = base_partition_values.copy()
        for partition in path_parts[:-1]:
            name, value = partition.split('=')
            partition_values[name] = value

        contents = key.get_contents_as_string()
        for line in contents.split('\n'):
            if not line:
                # EOF
                break

            # HACK: some files are '\t' delimited? Switch delimiters.
            # TODO(benkomalo): read format from metadata in Hive master?
            if delimiter not in line:
                delimiter = '\t'

            parts = line.strip().split(delimiter)
            if len(parts) != num_cols:
                # TODO(benkomalo): properly handle? shouldn't happen though.
                sys.stderr.write(
                        "Unexpected number of columns in row "
                        "(expected [%s]):\n" % num_cols)
                print >> sys.stderr, parts
                continue

            doc = {}
            for i, (name, type) in enumerate(zip(col_names, col_types)):
                # TODO(benkomalo): deal with other types and possible UTF-8
                #                  issues?
                try:
                    if type == 'int':
                        value = int(parts[i])
                    elif type == 'boolean':
                        value = parts[i] == 'true'
                    elif type in ['double', 'float']:
                        value = float(parts[i])
                    else:
                        value = parts[i]
                except Exception:
                    if parts[i] == NULL_STRING:
                        # TODO(benkomalo): figure out why our data has this.
                        # It seems that sometimes Hive likes to put in NULL
                        # values for ints and booleans, and they don't parse
                        # well. This is unfortunate - just skip the row since
                        # it's pretty rare for now.
                        doc = None
                        break
                    raise
                doc[name] = value

            if key_index > -1 and doc:
                # Mongo primary keys are labelled as "_id".
                doc['_id'] = parts[key_index]

            if doc:
                doc.update(partition_values)
                # If we have doc IDs, perform a single upsert because bulk
                # upserts are not currently supported by pymongo according to
                # this old answer: http://stackoverflow.com/questions/5292370
                if '_id' in doc:
                    mongo_collection.save(doc)
                    saved += 1
                else:
                    docs_to_insert.append(doc)
            else:
                errors += 1

            # Bulk insert docs in batches.
            if len(docs_to_insert) >= batch_size:
                mongo_collection.insert(docs_to_insert)
                saved += len(docs_to_insert)
                docs_to_insert = []

            if lines_parsed % 100 == 0:
                print "\rSaved %s docs with %s errors..." % (saved, errors),
                sys.stdout.flush()
            lines_parsed += 1

    if docs_to_insert:
        mongo_collection.insert(docs_to_insert)
        saved += len(docs_to_insert)

    print "\nSummary of results:"
    print "\tSaved [%s] documents" % saved
    print "\tSkipped [%s] documents with errors" % errors
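
# NOTE: _format_path_from_partition() is used above but not defined in this
# snippet. A plausible minimal sketch, assuming it maps a partition spec like
# ['start_dt=2012-01-01'] to the Hive-style sub-path '/start_dt=2012-01-01':
def _format_path_from_partition(partition_cols):
    """Build the partition sub-path appended to a table's base S3 location."""
    if not partition_cols:
        return ''
    return '/' + '/'.join(partition_cols)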
try:
    hipchat.config.init_cfg("hipchat.cfg")
except Exception:
    # No hipchat.cfg file found - the token will be empty and handled below.
    pass

if not hipchat.config.token:
    print >> sys.stderr, (
        "Can't find HipChat token. Make a hipchat.cfg file "
        'with a single line "token = <token_value>" '
        "(don't forget to chmod 600) either in this directory "
        "or in your $HOME directory")
    sys.exit(-1)

boto_util.initialize_creds_from_file()


def popen_results(args):
    proc = subprocess.Popen(args, stdout=subprocess.PIPE)
    return proc.communicate()[0]


def parse_git_version():
    return popen_results(["git", "rev-parse", "HEAD"]).strip()


def parse_git_message():
    return popen_results(["git", "show", "-s", "--pretty=format:%s"]).strip()
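
# For reference, the hipchat.cfg the error message asks for is a single
# line; the token value below is a placeholder, not a real credential:
#
#   token = 0123456789abcdef0123456789abcdef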
def main(table_location, score_type, options):
    boto_util.initialize_creds_from_file()

    # TODO(benkomalo): factor some of this boilerplate out to boto_util
    # Open our input connections.
    s3conn = boto.connect_s3()
    bucket = s3conn.get_bucket('ka-mapreduce')
    path = table_location[len('s3://ka-mapreduce/'):]
    if not path.endswith('/'):
        path = path + '/'
    s3keys = bucket.list(path)

    # Mapping of video key to the info on the other videos which best match
    # that video.
    # vid_key -> VideoInfo
    video_infos = {}

    # Note: a table's data may be broken down into multiple files on disk.
    delimiter = '\01'
    lines_read = 0
    version = None  # Use a datestamp as a version.
    for key in s3keys:
        if key.name.endswith('_$folder$'):
            # S3 metadata - not useful.
            continue

        contents = key.get_contents_as_string()
        version = max(version, key.last_modified)
        for line in contents.split('\n'):
            if not line:
                # EOF
                break

            lines_read += 1
            parts = line.rstrip().split(delimiter)
            if len(parts) != 3:
                # TODO(benkomalo): error handling
                continue

            vid1_key, vid2_key, score = parts
            try:
                score = float(score)
            except ValueError:
                # Some of the values were invalid - deal with it.
                # TODO(benkomalo): error handling.
                continue

            add_score(video_infos, vid1_key, vid2_key, score)
            if lines_read % 1000 == 0:
                print "Read %s lines..." % lines_read

    # Convert the version datestamp from RFC822 to the saner ISO8601 format.
    version = rfc822.parsedate_tz(version)[:6]  # extract YMDHMS from tuple
    version = datetime.datetime(*version).isoformat()

    total_pairs = sum([len(info.best_matches)
                       for info in video_infos.values()])
    print "\nSummary of collected data:"
    print "\tScore type: [%s]" % score_type
    print "\tVersion: [%s]" % version
    print ("\tDetected %d videos, with a total of %d video pairs" %
           (len(video_infos), total_pairs))
    print "Target: %s" % oauth_util.consts.SERVER_URL
    if raw_input("Proceed to upload? [Y/n]: ").lower() in ['', 'y', 'yes']:
        upload_to_gae(score_type, version, video_infos)
        print "Success!"
        print "Run the following to make it live:"
        print "set_video_matrix_version.py '%s' '%s'" % (score_type, version)
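
# Worked example of the version conversion above, using a made-up timestamp
# in the RFC822 format that rfc822.parsedate_tz() parses:
#
#   >>> rfc822.parsedate_tz('Wed, 01 Feb 2012 12:34:56 GMT')[:6]
#   (2012, 2, 1, 12, 34, 56)
#   >>> datetime.datetime(2012, 2, 1, 12, 34, 56).isoformat()
#   '2012-02-01T12:34:56'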
def main(table_location, target_db, target_collection, column_info,
         partition_cols, options):
    boto_util.initialize_creds_from_file()

    # TODO(benkomalo): handle other formats? The data may also be compressed,
    # so we may have to do more work.
    # The default row format for the tables is delimited by this control
    # char when stored on disk.
    delimiter = '\01'
    key_index = options.key_index
    num_cols = len(column_info)
    col_names, col_types, _ = zip(*column_info)

    if key_index >= num_cols:
        raise Exception("Invalid key index (there aren't enough columns)")

    base_partition_values = {}
    for partition in partition_cols:
        key, value = partition.split('=')
        base_partition_values[key] = value
    table_location += _format_path_from_partition(partition_cols)

    # Open our target db connection.
    mongo_conn = pymongo.Connection(options.report_db_host,
                                    port=options.report_db_port)
    mongodb = mongo_conn[target_db]
    mongo_collection = mongodb[target_collection]

    # If the drop flag was set, nuke any pre-existing data.
    if options.drop:
        print "\nDropping existing data in collection %s." % target_collection
        # Even though the option is called 'drop', I'm calling 'remove'
        # instead, because it leaves any indexes in place while deleting
        # the data.
        mongo_collection.remove()
    elif options.drop_partition:
        print ("\nDropping existing data in collection {0} "
               "on partition spec {1}".format(target_collection,
                                              partition_cols))
        mongo_collection.remove(base_partition_values)

    # Open our input connections.
    s3conn = boto.connect_s3()
    bucket = s3conn.get_bucket('ka-mapreduce')
    path_prefix = table_location[len('s3://ka-mapreduce/'):] + '/'
    s3keys = bucket.list(prefix=path_prefix)

    # TODO(benkomalo): add a flag to bail on any errors so no partial data is
    #                  saved?
    # Counts of rows saved, and rows with errors.
    saved = 0
    errors = 0
    NULL_STRING = '\N'
    docs_to_insert = []
    batch_size = max(1, options.batch_size)
    lines_parsed = 0

    # Note: a table's data may be broken down into multiple files on disk.
    for key in s3keys:
        if key.name.endswith('_$folder$'):
            # S3 metadata - not useful.
            continue

        # Find out if files are under additional partition "directories",
        # e.g. "start_dt=2012-01-01/end_dt=2012-02-01/some_file".
        base_path = key.name[len(path_prefix):]
        path_parts = base_path.split('/')
        partition_values = base_partition_values.copy()
        for partition in path_parts[:-1]:
            name, value = partition.split('=')
            partition_values[name] = value

        contents = key.get_contents_as_string()
        for line in contents.split('\n'):
            if not line:
                # EOF
                break

            # HACK: some files are '\t' delimited? Switch delimiters.
            # TODO(benkomalo): read format from metadata in Hive master?
            if delimiter not in line:
                delimiter = '\t'

            parts = line.strip().split(delimiter)
            if len(parts) != num_cols:
                # TODO(benkomalo): properly handle? shouldn't happen though.
                sys.stderr.write(
                        "Unexpected number of columns in row "
                        "(expected [%s]):\n" % num_cols)
                print >> sys.stderr, parts
                continue

            doc = {}
            for i, (name, type) in enumerate(zip(col_names, col_types)):
                # TODO(benkomalo): deal with other types and possible UTF-8
                #                  issues?
                try:
                    if type == 'int':
                        value = int(parts[i])
                    elif type == 'boolean':
                        value = parts[i] == 'true'
                    elif type in ['double', 'float']:
                        value = float(parts[i])
                    else:
                        value = parts[i]
                except ValueError:
                    if parts[i] == NULL_STRING:
                        # Some queries (especially OUTER JOINs) can result in
                        # NULL values in non-string fields. In case the
                        # query's author doesn't replace the resultant NULLs
                        # with default values, the importer should still be
                        # able to copy the table.
                        value = None
                    else:
                        doc = None
                        break
                doc[name] = value

            if key_index > -1 and doc:
                # Mongo primary keys are labelled as "_id".
                doc['_id'] = parts[key_index]

            if doc:
                doc.update(partition_values)
                # If we have doc IDs, perform a single upsert because bulk
                # upserts are not currently supported by pymongo according to
                # this old answer: http://stackoverflow.com/questions/5292370
                if '_id' in doc:
                    mongo_collection.save(doc)
                    saved += 1
                else:
                    docs_to_insert.append(doc)
            else:
                errors += 1

            # Bulk insert docs in batches.
            if len(docs_to_insert) >= batch_size:
                mongo_collection.insert(docs_to_insert)
                saved += len(docs_to_insert)
                docs_to_insert = []

            if lines_parsed % 100 == 0:
                print "\rSaved %s docs with %s errors..." % (saved, errors),
                sys.stdout.flush()
            lines_parsed += 1

    if docs_to_insert:
        mongo_collection.insert(docs_to_insert)
        saved += len(docs_to_insert)

    print "\nSummary of results:"
    print "\tSaved [%s] documents" % saved
    print "\tSkipped [%s] documents with errors" % errors
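
# Illustration only (sample values invented): given columns
# [('vid', 'string'), ('count', 'int'), ('live', 'boolean')], key_index=0,
# and a partition spec of ['dt=2012-01-01'], the delimited row
# 'abc\x01' + '5\x01' + 'true' becomes the document:
#
#   {'_id': 'abc', 'vid': 'abc', 'count': 5, 'live': True,
#    'dt': '2012-01-01'}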
"python deploy.py geoip" For even more information about GeoIP refer to Makefile in this directory """ import optparse import os import subprocess import sys import boto sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src')) import boto_util import notify boto_util.initialize_creds_from_file() def popen_results(args): proc = subprocess.Popen(args, stdout=subprocess.PIPE) return proc.communicate()[0] def parse_git_version(): return popen_results(['git', 'rev-parse', 'HEAD']).strip() def parse_git_message(): return popen_results(['git', 'show', '-s', '--pretty=format:%s']).strip()