Example #1
def wait_for_data(wait_for_config, options):
    """Wait for data before kicking off hive jobs"""
    # Step 1 - read meta data.
    hive_mysql_connector.configure(options.hive_masternode,
        options.ssh_keyfile)

    if options.hive_init:
        hive_mysql_connector.run_hive_init()
    # Step 2 - wait until all the data partitions are available
    boto_util.initialize_creds_from_file()
    s3conn = boto.connect_s3()
    s3bucket = s3conn.get_bucket('ka-mapreduce')
    max_wait = datetime.timedelta(hours=options.max_wait)
    start = datetime.datetime.now()
    for d in wait_for_config:
        table = d['table_name']
        table_location = hive_mysql_connector.get_table_location(table)
        for p in d['partitions']:
            partition_location = table_location + '/' + p
            #TODO(yunfang): abstract the following to wait_for_partition
            #               for boto_util
            while True:
                if partition_available(s3bucket, partition_location):
                    g_logger.info("%s is available" % (partition_location))
                    break
                if (datetime.datetime.now() - start) > max_wait:
                    # We have waited long enough; give up.
                    g_logger.fatal("Waited too long. "
                                   "Data is still not available. "
                                   "Exiting...")
                    sys.exit(1)
                # Wait for a minute to check again
                g_logger.info("Waiting for %s to be available... " % (
                              partition_location))
                time.sleep(60)
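
The TODO above suggests moving this polling loop into boto_util. A minimal sketch of what such a helper could look like, reusing the same partition_available() check from this example (the name wait_for_partition and its signature are assumptions, not an existing boto_util API):

import datetime
import time


def wait_for_partition(s3bucket, partition_location, max_wait,
                       poll_interval_secs=60):
    """Poll S3 until partition_location exists or max_wait elapses.

    Hypothetical helper sketching the TODO above.  Returns True once the
    partition is available, or False if max_wait (a datetime.timedelta)
    is exceeded.  partition_available() is the same check used in
    wait_for_data.
    """
    start = datetime.datetime.now()
    while not partition_available(s3bucket, partition_location):
        if datetime.datetime.now() - start > max_wait:
            return False
        time.sleep(poll_interval_secs)
    return True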
Example #2
def wait_for_data(wait_for_config, options):
    """Wait for data before kicking off hive jobs"""
    # Step 1 - read meta data.
    hive_mysql_connector.configure(options.hive_masternode,
        options.ssh_keyfile)

    if options.hive_init:
        hive_mysql_connector.run_hive_init()
    # Step 2 - wait until all the data partitions are available
    boto_util.initialize_creds_from_file()
    s3conn = boto.connect_s3()
    s3bucket = s3conn.get_bucket('ka-mapreduce')
    max_wait = datetime.timedelta(hours=options.max_wait)
    start = datetime.datetime.now()
    for d in wait_for_config:
        table = d['table_name']
        table_location = hive_mysql_connector.get_table_location(table)
        for p in d['partitions']:
            partition_location = table_location + '/' + p
            #TODO(yunfang): abstract the following to wait_for_partition
            #               for boto_util
            while True:
                if partition_available(s3bucket, partition_location):
                    g_logger.info("%s is available" % (partition_location))
                    break
                if (datetime.datetime.now() - start) > max_wait:
                    # We have waited long enough; give up.
                    g_logger.fatal("Waited too long. "
                                   "Data is still not available. "
                                   "Exiting...")
                    sys.exit(1)
                # Wait for a minute to check again
                g_logger.info("Waiting for %s to be available... " % (
                              partition_location))
                time.sleep(60)
Example #3
def main(table_location, score_type, options):
    boto_util.initialize_creds_from_file()

    # TODO(benkomalo): factor some of this boilerplate out to boto_util
    # Open our input connections.
    s3conn = boto.connect_s3()
    bucket = s3conn.get_bucket('ka-mapreduce')
    s3keys = bucket.list(
        prefix=table_location[len('s3://ka-mapreduce/'):] + '/')

    # Mapping of video key to the info on the other videos which best match
    # that video
    # vid_key -> VideoInfo
    video_infos = {}

    # Note: a table's data may be broken down into multiple files on disk.
    delimiter = '\01'
    lines_read = 0
    for key in s3keys:
        if key.name.endswith('_$folder$'):
            # S3 meta data - not useful.
            continue

        contents = key.get_contents_as_string()
        for line in contents.split('\n'):

            if not line:
                # EOF
                break

            lines_read += 1
            parts = line.rstrip().split(delimiter)
            if len(parts) != 3:
                # TODO(benkomalo): error handling
                continue

            vid1_key, vid2_key, score = parts

            try:
                score = float(score)
            except ValueError:
                # Some of the values were invalid - deal with it.
                # TODO(benkomalo): error handling.
                continue

            add_score(video_infos, vid1_key, vid2_key, score)

            if lines_read % 1000 == 0:
                print "Read %s lines..." % lines_read

    total_pairs = sum([len(info.best_matches)
                       for info in video_infos.values()])
    print "\nSummary of collected data:"
    print "\tScore type: [%s]" % score_type
    print ("\tDetected %d videos, with a total of %d video pair data" %
           (len(video_infos), total_pairs))
    print "Target: %s" % oauth_util.consts.SERVER_URL
    if raw_input("Proceed to upload? [Y/n]: ").lower() in ['', 'y', 'yes']:
        upload_to_gae(score_type, video_infos)
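
The loop above depends on an add_score() helper and a VideoInfo structure that are not part of this snippet. A plausible minimal sketch, assuming VideoInfo just keeps a bounded list of (score, other video) pairs in best_matches; the class shape, the cap, and the method names are assumptions, not the original implementation:

import heapq


class VideoInfo(object):
    """Best-matching videos for a single video (assumed shape)."""

    MAX_MATCHES = 10  # hypothetical cap on matches kept per video

    def __init__(self):
        self.best_matches = []  # min-heap of (score, other_vid_key)

    def add_match(self, other_vid_key, score):
        heapq.heappush(self.best_matches, (score, other_vid_key))
        if len(self.best_matches) > self.MAX_MATCHES:
            heapq.heappop(self.best_matches)  # drop the weakest match


def add_score(video_infos, vid1_key, vid2_key, score):
    """Record that vid2_key matches vid1_key with the given score."""
    video_infos.setdefault(vid1_key, VideoInfo()).add_match(vid2_key, score)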
Example #4
def main(table_location,
         target_db,
         target_collection,
         column_info,
         partition_cols,
         options):

    boto_util.initialize_creds_from_file()

    # TODO(benkomalo): handle other formats? It may also be compressed, so
    #    we may have to do more work.
    # By default, table rows are delimited by this control character
    # when stored on disk.
    delimiter = '\01'
    key_index = options.key_index
    num_cols = len(column_info)
    col_names, col_types, _ = zip(*column_info)

    if key_index >= num_cols:
        raise Exception("Invalid key index (there aren't enough columns)")

    base_partition_values = {}
    for partition in partition_cols:
        key, value = partition.split('=')
        base_partition_values[key] = value
    table_location += _format_path_from_partition(partition_cols)

    # Open our target db connection
    mongo_conn = pymongo.Connection(options.report_db_host,
                                    port=options.report_db_port)
    mongodb = mongo_conn[target_db]
    mongo_collection = mongodb[target_collection]

    # If drop flag was set, nuke any pre-existing data
    if options.drop:
        print "\nDropping existing data in collection %s." % target_collection
        # Even though the option is called 'drop', call 'remove' instead,
        # since it preserves the collection's indexes while deleting the data.
        mongo_collection.remove()

    # Open our input connections.
    s3conn = boto.connect_s3()
    bucket = s3conn.get_bucket('ka-mapreduce')
    path_prefix = table_location[len('s3://ka-mapreduce/'):] + '/'
    s3keys = bucket.list(prefix=path_prefix)

    # TODO(benkomalo): add a flag to bail on any errors so no partial data is
    #    saved?
    # Counts of rows saved, and rows with errors.
    saved = 0
    errors = 0
    NULL_STRING = '\\N'  # Hive's on-disk representation of NULL

    docs_to_insert = []
    batch_size = max(1, options.batch_size)

    lines_parsed = 0

    # Note: a table's data may be broken down into multiple files on disk.
    for key in s3keys:
        if key.name.endswith('_$folder$'):
            # S3 meta data - not useful.
            continue

        # Find out if files are under additional partition "directories"
        # e.g. "start_dt=2012-01-01/end_dt=2012-02-01/some_file"
        base_path = key.name[len(path_prefix):]
        path_parts = base_path.split('/')
        partition_values = base_partition_values.copy()
        for partition in path_parts[:-1]:
            name, value = partition.split('=')
            partition_values[name] = value

        contents = key.get_contents_as_string()
        for line in contents.split('\n'):

            if not line:
                # EOF
                break

            # HACK: some files are '\t' delimited? Switch delimiters.
            # TODO(benkomalo): read format from metadata in Hive master?
            if delimiter not in line:
                delimiter = '\t'

            parts = line.strip().split(delimiter)
            if len(parts) != num_cols:
                # TODO(benkomalo): properly handle? shouldn't happen though.
                sys.stderr.write(
                    "Unexpected number of columns in row (expected [%s]):\n" %
                     num_cols)
                print >> sys.stderr, parts
                continue

            doc = {}
            for i, (name, type) in enumerate(zip(col_names, col_types)):
                # TODO(benkomalo): deal with other types and possible UTF-8
                #    issues?
                try:
                    if type == 'int':
                        value = int(parts[i])
                    elif type == 'boolean':
                        value = parts[i] == 'true'
                    elif type in ['double', 'float']:
                        value = float(parts[i])
                    else:
                        value = parts[i]
                except Exception:
                    if parts[i] == NULL_STRING:
                        # TODO(benkomalo): figure out why our data has this.
                        # It seems that sometimes Hive likes to put in
                        # NULL values for ints and booleans? They don't parse
                        # well. This is unfortunate - just skip the row since
                        # it's pretty rare for now.
                        doc = None
                        break
                    raise

                doc[name] = value

            if key_index > -1 and doc:
                # mongo primary keys are labelled as "_id"
                doc['_id'] = parts[key_index]

            if doc:

                doc.update(partition_values)

                # If we have doc IDs, perform a single upsert because bulk
                # upserts are not currently supported by pymongo according to
                # this old answer: http://stackoverflow.com/questions/5292370
                if '_id' in doc:
                    mongo_collection.save(doc)
                    saved += 1
                else:
                    docs_to_insert.append(doc)

            else:
                errors += 1

            # Bulk insert docs in batches
            if len(docs_to_insert) >= batch_size:
                mongo_collection.insert(docs_to_insert)
                saved += len(docs_to_insert)
                docs_to_insert = []

            if lines_parsed % 100 == 0:
                print "\rSaved %s docs with %s errors..." % (saved, errors),
                sys.stdout.flush()

            lines_parsed += 1

    if docs_to_insert:
        mongo_collection.insert(docs_to_insert)
        saved += len(docs_to_insert)

    print "\nSummary of results:"
    print "\tSaved [%s] documents" % saved
    print "\tSkipped [%s] documents with errors" % errors
Example #5
try:
    hipchat.config.init_cfg("hipchat.cfg")
except Exception:
    # No hipchat.cfg file found - the token will be empty and handled below.
    pass

if not hipchat.config.token:
    print >> sys.stderr, (
        "Can't find HipChat token. Make a hipchat.cfg file "
        + 'with a single line "token = <token_value>" '
        + "(don't forget to chmod 600) either in this directory "
        + "or in your $HOME directory"
    )
    sys.exit(-1)
boto_util.initialize_creds_from_file()


def popen_results(args):
    proc = subprocess.Popen(args, stdout=subprocess.PIPE)
    return proc.communicate()[0]


def parse_git_version():
    return popen_results(["git", "rev-parse", "HEAD"]).strip()


def parse_git_message():
    return popen_results(["git", "show", "-s", "--pretty=format:%s"]).strip()

Example #6
def main(table_location, score_type, options):
    boto_util.initialize_creds_from_file()

    # TODO(benkomalo): factor some of this boilerplate out to boto_util
    # Open our input connections.
    s3conn = boto.connect_s3()
    bucket = s3conn.get_bucket('ka-mapreduce')
    path = table_location[len('s3://ka-mapreduce/'):]
    if not path.endswith('/'):
        path = path + '/'
    s3keys = bucket.list(path)

    # Mapping of video key to the info on the other videos which best match
    # that video
    # vid_key -> VideoInfo
    video_infos = {}

    # Note: a table's data may be broken down into multiple files on disk.
    delimiter = '\01'
    lines_read = 0
    version = None  # Use a datestamp as a version.
    for key in s3keys:
        if key.name.endswith('_$folder$'):
            # S3 meta data - not useful.
            continue

        contents = key.get_contents_as_string()
        version = max(version, key.last_modified)
        for line in contents.split('\n'):

            if not line:
                # EOF
                break

            lines_read += 1
            parts = line.rstrip().split(delimiter)
            if len(parts) != 3:
                # TODO(benkomalo): error handling
                continue

            vid1_key, vid2_key, score = parts

            try:
                score = float(score)
            except ValueError:
                # Some of the values were invalid - deal with it.
                # TODO(benkomalo): error handling.
                continue

            add_score(video_infos, vid1_key, vid2_key, score)

            if lines_read % 1000 == 0:
                print "Read %s lines..." % lines_read

    # Convert the version datestamp from RFC 822 to the saner ISO 8601 format
    version = rfc822.parsedate_tz(version)[:6]  # extract YMDHMS from tuple
    version = datetime.datetime(*version).isoformat()

    total_pairs = sum(
        [len(info.best_matches) for info in video_infos.values()])
    print "\nSummary of collected data:"
    print "\tScore type: [%s]" % score_type
    print "\tVersion: [%s]" % version
    print("\tDetected %d videos, with a total of %d video pair data" %
          (len(video_infos), total_pairs))
    print "Target: %s" % oauth_util.consts.SERVER_URL
    if raw_input("Proceed to upload? [Y/n]: ").lower() in ['', 'y', 'yes']:
        upload_to_gae(score_type, version, video_infos)
        print "Success!"
        print "Run the following to make it live:"
        print "set_video_matrix_version.py '%s' '%s'" % (score_type, version)
Example #7
def main(table_location, score_type, options):
    boto_util.initialize_creds_from_file()

    # TODO(benkomalo): factor some of this boilerplate out to boto_util
    # Open our input connections.
    s3conn = boto.connect_s3()
    bucket = s3conn.get_bucket('ka-mapreduce')
    path = table_location[len('s3://ka-mapreduce/'):]
    if not path.endswith('/'):
        path = path + '/'
    s3keys = bucket.list(path)

    # Mapping of video key to the info on the other videos which best match
    # that video
    # vid_key -> VideoInfo
    video_infos = {}

    # Note: a table's data may be broken down into multiple files on disk.
    delimiter = '\01'
    lines_read = 0
    version = None  # Use a datestamp as a version.
    for key in s3keys:
        if key.name.endswith('_$folder$'):
            # S3 meta data - not useful.
            continue

        contents = key.get_contents_as_string()
        version = max(version, key.last_modified)
        for line in contents.split('\n'):

            if not line:
                # EOF
                break

            lines_read += 1
            parts = line.rstrip().split(delimiter)
            if len(parts) != 3:
                # TODO(benkomalo): error handling
                continue

            vid1_key, vid2_key, score = parts

            try:
                score = float(score)
            except ValueError:
                # Some of the values were invalid - deal with it.
                # TODO(benkomalo): error handling.
                continue

            add_score(video_infos, vid1_key, vid2_key, score)

            if lines_read % 1000 == 0:
                print "Read %s lines..." % lines_read

    # Convert the version datestamp from RFC 822 to the saner ISO 8601 format
    version = rfc822.parsedate_tz(version)[:6]  # extract YMDHMS from tuple
    version = datetime.datetime(*version).isoformat()

    total_pairs = sum([len(info.best_matches)
                       for info in video_infos.values()])
    print "\nSummary of collected data:"
    print "\tScore type: [%s]" % score_type
    print "\tVersion: [%s]" % version
    print ("\tDetected %d videos, with a total of %d video pair data" %
           (len(video_infos), total_pairs))
    print "Target: %s" % oauth_util.consts.SERVER_URL
    if raw_input("Proceed to upload? [Y/n]: ").lower() in ['', 'y', 'yes']:
        upload_to_gae(score_type, version, video_infos)
        print "Success!"
        print "Run the following to make it live:"
        print "set_video_matrix_version.py '%s' '%s'" % (
                score_type, version)
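
For reference, a small worked example of the RFC 822 to ISO 8601 conversion used above; the sample timestamp is made up, and S3 reports Last-Modified in RFC 822 form once the key has been fetched:

import datetime
import rfc822

raw = 'Wed, 01 Feb 2012 03:04:05 GMT'         # sample Last-Modified value
ymdhms = rfc822.parsedate_tz(raw)[:6]         # (2012, 2, 1, 3, 4, 5)
print datetime.datetime(*ymdhms).isoformat()  # prints 2012-02-01T03:04:05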
Example #8
def main(table_location, target_db, target_collection, column_info,
         partition_cols, options):

    boto_util.initialize_creds_from_file()

    # TODO(benkomalo): handle other formats? It may also be compressed, so
    #    we may have to do more work.
    # By default, table rows are delimited by this control character
    # when stored on disk.
    delimiter = '\01'
    key_index = options.key_index
    num_cols = len(column_info)
    col_names, col_types, _ = zip(*column_info)

    if key_index >= num_cols:
        raise Exception("Invalid key index (there aren't enough columns)")

    base_partition_values = {}
    for partition in partition_cols:
        key, value = partition.split('=')
        base_partition_values[key] = value
    table_location += _format_path_from_partition(partition_cols)

    # Open our target db connection
    mongo_conn = pymongo.Connection(options.report_db_host,
                                    port=options.report_db_port)
    mongodb = mongo_conn[target_db]
    mongo_collection = mongodb[target_collection]

    # If drop flag was set, nuke any pre-existing data
    if options.drop:
        print "\nDropping existing data in collection %s." % target_collection
        # Even though the option is called 'drop', call 'remove' instead,
        # since it preserves the collection's indexes while deleting the data.
        mongo_collection.remove()
    elif options.drop_partition:
        print(
            "\nDropping existing data in collection {0} "
            "on partition spec {1}".format(target_collection, partition_cols))

        mongo_collection.remove(base_partition_values)

    # Open our input connections.
    s3conn = boto.connect_s3()
    bucket = s3conn.get_bucket('ka-mapreduce')
    path_prefix = table_location[len('s3://ka-mapreduce/'):] + '/'
    s3keys = bucket.list(prefix=path_prefix)

    # TODO(benkomalo): add a flag to bail on any errors so no partial data is
    #    saved?
    # Counts of rows saved, and rows with errors.
    saved = 0
    errors = 0
    NULL_STRING = '\\N'  # Hive's on-disk representation of NULL

    docs_to_insert = []
    batch_size = max(1, options.batch_size)

    lines_parsed = 0

    # Note: a table's data may be broken down into multiple files on disk.
    for key in s3keys:
        if key.name.endswith('_$folder$'):
            # S3 meta data - not useful.
            continue

        # Find out if files are under additional partition "directories"
        # e.g. "start_dt=2012-01-01/end_dt=2012-02-01/some_file"
        base_path = key.name[len(path_prefix):]
        path_parts = base_path.split('/')
        partition_values = base_partition_values.copy()
        for partition in path_parts[:-1]:
            name, value = partition.split('=')
            partition_values[name] = value

        contents = key.get_contents_as_string()
        for line in contents.split('\n'):

            if not line:
                # EOF
                break

            # HACK: some files are '\t' delimited? Switch delimiters.
            # TODO(benkomalo): read format from metadata in Hive master?
            if delimiter not in line:
                delimiter = '\t'

            parts = line.strip().split(delimiter)
            if len(parts) != num_cols:
                # TODO(benkomalo): properly handle? shouldn't happen though.
                sys.stderr.write(
                    "Unexpected number of columns in row (expected [%s]):\n" %
                    num_cols)
                print >> sys.stderr, parts
                continue

            doc = {}
            for i, (name, type) in enumerate(zip(col_names, col_types)):
                # TODO(benkomalo): deal with other types and possible UTF-8
                #    issues?
                try:
                    if type == 'int':
                        value = int(parts[i])
                    elif type == 'boolean':
                        value = parts[i] == 'true'
                    elif type in ['double', 'float']:
                        value = float(parts[i])
                    else:
                        value = parts[i]
                except ValueError:
                    if parts[i] == NULL_STRING:
                        # Some queries (especially OUTER JOINs) can produce
                        # NULL values in non-string fields.  Even if the query
                        # author did not replace those NULLs with defaults,
                        # the importer should still be able to copy the table.
                        value = None
                    else:
                        doc = None
                        break

                doc[name] = value

            if key_index > -1 and doc:
                # mongo primary keys are labelled as "_id"
                doc['_id'] = parts[key_index]

            if doc:

                doc.update(partition_values)

                # If we have doc IDs, perform a single upsert because bulk
                # upserts are not currently supported by pymongo according to
                # this old answer: http://stackoverflow.com/questions/5292370
                if '_id' in doc:
                    mongo_collection.save(doc)
                    saved += 1
                else:
                    docs_to_insert.append(doc)

            else:
                errors += 1

            # Bulk insert docs in batches
            if len(docs_to_insert) >= batch_size:
                mongo_collection.insert(docs_to_insert)
                saved += len(docs_to_insert)
                docs_to_insert = []

            if lines_parsed % 100 == 0:
                print "\rSaved %s docs with %s errors..." % (saved, errors),
                sys.stdout.flush()

            lines_parsed += 1

    if docs_to_insert:
        mongo_collection.insert(docs_to_insert)
        saved += len(docs_to_insert)

    print "\nSummary of results:"
    print "\tSaved [%s] documents" % saved
    print "\tSkipped [%s] documents with errors" % errors
Example #9
    "python deploy.py geoip"
For even more information about GeoIP, refer to the Makefile in this directory
"""

import optparse
import os
import subprocess
import sys

import boto

sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
import boto_util
import notify

boto_util.initialize_creds_from_file()


def popen_results(args):
    proc = subprocess.Popen(args, stdout=subprocess.PIPE)
    return proc.communicate()[0]


def parse_git_version():
    return popen_results(['git', 'rev-parse', 'HEAD']).strip()


def parse_git_message():
    return popen_results(['git', 'show', '-s', '--pretty=format:%s']).strip()
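
A short illustration of how these git helpers might be used when announcing a deploy (illustrative only; the real notification call lives elsewhere in the original script):

if __name__ == '__main__':
    print "Deploying %s: %s" % (parse_git_version()[:8], parse_git_message())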