Example no. 1
0
def train(location='./train/'):
    """
    The training procedure is triggered here. OPTIONAL to run; everything that is required for testing the model
    must be saved to file (e.g., pickle) so that the test procedure can load, execute and report
    :param location: The location of the training data folder hierarchy
    :return: nothing
    """

    # run_vgg(location)
    labels_embedding = run_word_preprocessing()
    model.train_model(labels_embedding, location)
    trained_image_filenames, trained_image_vectors = get_all_trained_image_vectors()

    create_cluster(trained_image_vectors,
                   "preprocessing/image_vector_cluster.pickle")
def main():
    parser = argparse.ArgumentParser(prog='ephemeral-spark-submit.py')
    parser.add_argument(
        '--admin-username',
        default="admin",
        help=
        'Name of a user with administrative access (defaults to %(default)s)')
    parser.add_argument(
        '--admin-password',
        default="admin",
        help='Password for the administrative user (defaults to %(default)s)')
    parser.add_argument(
        '--server',
        default="http://localhost:7189",
        help="Cloudera Director server URL (defaults to %(default)s)")
    parser.add_argument(
        '--cm',
        help="The name of the Cloudera Manager server to use in Director")
    parser.add_argument('--environment',
                        help="The name of the Environment to use in Director")
    parser.add_argument(
        '--jar', help="JAR for Spark job you want to run on ephemeral cluster")
    parser.add_argument('--jarclass', help="The --class flag for spark-submit")
    parser.add_argument('--args', help="The arguments for the jar")
    parser.add_argument('--script', help="Script that runs after the Spark job (post script)")
    parser.add_argument('config_file',
                        help="Cluster configuration file (.ini)")
    args = parser.parse_args()

    if not isfile(args.config_file):
        print 'Error: "%s" not found or not a file' % args.config_file
        return -1

    config = ConfigParser.SafeConfigParser()
    config.read(args.config_file)

    #Create authenticated client
    client = cluster.get_authenticated_client(args)

    #Execute cluster creation
    cluster_name = cluster.create_cluster(client, args.environment, args.cm,
                                          config)
    print 'Waiting for the cluster to be ready. Check the web interface for details.'
    cluster.wait_for_cluster(client, args.environment, args.cm, cluster_name)
    client = ApiClient(args.server)
    AuthenticationApi(client).login(
        Login(username=args.admin_username, password=args.admin_password))
    clusters = ClustersApi(client)
    eph_cluster = clusters.get(args.environment, args.cm, cluster_name)
    instances = eph_cluster.instances
    #Find which is a gateway node
    for instance in instances:
        if str(instance.virtualInstance.template.name) == 'gateway':
            gateway = instance
    gateway = gateway.properties['publicDnsName']
    print("The Gateway url is: " + gateway)

    #Copy the JAR and postscript to the GW
    copy_jar(args.jar, gateway, config)
    #Copy script to the GW
    copy_script(args.script, gateway, config)
    #Create directory in HDFS with correct permissions
    configure_hdfs(gateway, config)
    #Execute the job
    execute_spark(args.jar, args.jarclass, args.args, gateway, config)
    #Run some post script
    execute_script(args.script, gateway, config)
    #Destroy the cluster
    print "Job complete, terminating the instance"
    clusters.delete(args.environment, args.cm, cluster_name)

    return 0
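
For reference, the command-line interface defined by the parser above could be exercised roughly as follows; every value below is a placeholder rather than something taken from the original project:

# Hypothetical invocation (all names, paths and URLs are placeholders):
#
#   python ephemeral-spark-submit.py \
#       --server http://localhost:7189 \
#       --environment MyEnvironment \
#       --cm MyClouderaManager \
#       --jar target/spark-job.jar \
#       --jarclass com.example.SparkJob \
#       --args "hdfs:///input hdfs:///output" \
#       --script post_job.sh \
#       cluster.ini
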
def main():
    parser = argparse.ArgumentParser(prog="ephemeral-spark-submit.py")
    parser.add_argument(
        "--admin-username", default="admin", help="Name of an user with administrative access (defaults to %(default)s)"
    )
    parser.add_argument(
        "--admin-password", default="admin", help="Password for the administrative user (defaults to %(default)s)"
    )
    parser.add_argument(
        "--server", default="http://localhost:7189", help="Cloudera Director server URL (defaults to %(default)s)"
    )
    parser.add_argument("--cm", help="The name of the Cloudera Manager server to use in Director")
    parser.add_argument("--environment", help="The name of the Environment to use in Director")
    parser.add_argument("--jar", help="JAR for Spark job you want to run on ephemeral cluster")
    parser.add_argument("--jarclass", help="The --class flag for spark-submit")
    parser.add_argument("--args", help="The arguments for the jar")
    parser.add_argument("--script", help="Script that runs before spark job")
    parser.add_argument("config_file", help="Cluster configuration file (.ini)")
    args = parser.parse_args()

    if not isfile(args.config_file):
        print 'Error: "%s" not found or not a file' % args.config_file
        return -1

    config = ConfigParser.SafeConfigParser()
    config.read(args.config_file)

    # Create authenticated client
    client = cluster.get_authenticated_client(args)

    # Execute cluster creation
    cluster_name = cluster.create_cluster(client, args.environment, args.cm, config)
    print "Waiting for the cluster to be ready. Check the web interface for details."
    cluster.wait_for_cluster(client, args.environment, args.cm, cluster_name)
    client = ApiClient(args.server)
    AuthenticationApi(client).login(Login(username=args.admin_username, password=args.admin_password))
    clusters = ClustersApi(client)
    eph_cluster = clusters.get(args.environment, args.cm, cluster_name)
    instances = eph_cluster.instances
    # Find which is a gateway node
    for instance in instances:
        if str(instance.virtualInstance.template.name) == "gateway":
            gateway = instance
    gateway = gateway.properties["publicDnsName"]
    print ("The Gateway url is: " + gateway)

    # Copy the JAR and postscript to the GW
    copy_jar(args.jar, gateway, config)
    # Copy script to the GW
    copy_script(args.script, gateway, config)
    # Create directory in HDFS with correct permissions
    configure_hdfs(gateway, config)
    # Execute the job
    execute_spark(args.jar, args.jarclass, args.args, gateway, config)
    # Run some post script
    execute_script(args.script, gateway, config)
    # Destroy the cluster
    print "Job complete, terminating the instance"
    clusters.delete(args.environment, args.cm, cluster_name)

    return 0
    return dm


if __name__ == "__main__":
    dir_name = sys.argv[1]
    in_dir = os.path.join(dir_name, 'in')
    gt_dir = os.path.join(dir_name, 'gt')

    image_files = os.listdir(in_dir)

    print("loading images")
    images = load_images(image_files, in_dir)

    sm = cluster.build_similarity_matrix(in_dir, image_files)
    dm = build_distance_matrix(in_dir, images)
    c = cluster.create_cluster(sm)

    print("Building RNG")
    count = 0
    for i in images:
        G.add_node(count)
        G.node[count]['im'] = i
        G.node[count]['cluster'] = c.labels_[count]
        count += 1

    for n1 in G:
        print("Finding neighbours for node %d/%d" % (n1 + 1, len(G)), end='\r')
        for n2 in G:
            if n1 < n2 and neighbors(dm, n1, n2):
                G.add_edge(n1, n2)
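
The loop above ("Building RNG") adds an edge whenever neighbors(dm, n1, n2) holds, but the helper itself is not shown in this excerpt. A minimal sketch of what such a relative-neighbourhood-graph test could look like, assuming dm is the pairwise distance matrix built earlier (this implementation is an assumption based on the standard RNG definition, not the original helper):

def neighbors(dm, n1, n2):
    # Assumed sketch of the relative neighbourhood graph criterion:
    # n1 and n2 are neighbours iff no third point is strictly closer
    # to both of them than they are to each other.
    for n3 in range(len(dm)):
        if n3 == n1 or n3 == n2:
            continue
        if max(dm[n1][n3], dm[n2][n3]) < dm[n1][n2]:
            return False
    return True
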
Example no. 5
0
    tf_matrix = getWordVec(clean_abstract)
    print "Vectors are ready.."
    print "First Vector", tf_matrix.todense()[0]
    print "Second Vector", tf_matrix.todense()[0]

    clean_stop = []
    for text in clean_abstract:
        text = ' '.join([
            word.lower() for word in text.split()
            if word.lower() not in stopWords
        ])
        text = text.decode('unicode_escape').encode('ascii', 'ignore')
        clean_stop.append(text)

    # Vectorise once, after all abstracts have been cleaned
    tf_mat_stop = getWordVec(clean_stop)

    kmeans1_stop = create_cluster(sparse_data=tf_mat_stop, nclust=6)
    trans_mat_stop = kmeans1_stop.transform(tf_mat_stop)

    clust_dict_stop = {}
    for i, label in enumerate(kmeans1_stop.labels_):
        if label in clust_dict_stop:
            clust_dict_stop[label].append(clean_stop[i])
        else:
            clust_dict_stop[label] = []
            clust_dict_stop[label].append(clean_stop[i])
    print "cluster_dict created"

    keywords = {}
    for key in clust_dict_stop:
        word_dict = {}
        for abstract in clust_dict_stop[key]: