def train(location='./train/'):
    """
    The training procedure is triggered here. OPTIONAL to run; everything that is
    required for testing the model must be saved to file (e.g., pickle) so that the
    test procedure can load, execute and report.
    :param location: The location of the training data folder hierarchy
    :return: nothing
    """
    # run_vgg(location)
    labels_embedding = run_word_preprocessing()
    model.train_model(labels_embedding, location)
    trained_image_filenames, trained_image_vectors = get_all_trained_image_vectors()
    create_cluster(trained_image_vectors, "preprocessing/image_vector_cluster.pickle")
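# Hedged sketch (not part of the original module): the docstring above requires that
# everything needed at test time be persisted to file. This is one way a test
# procedure could reload whatever create_cluster() pickled; the helper name and the
# assumption that the file holds a single pickled clustering object are illustrative.
import pickle


def load_image_vector_cluster(path="preprocessing/image_vector_cluster.pickle"):
    """Reload the clustering model that train() persisted to disk."""
    with open(path, "rb") as handle:
        return pickle.load(handle)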
def main():
    parser = argparse.ArgumentParser(prog='ephemeral-spark-submit.py')
    parser.add_argument(
        '--admin-username',
        default="admin",
        help='Name of a user with administrative access (defaults to %(default)s)')
    parser.add_argument(
        '--admin-password',
        default="admin",
        help='Password for the administrative user (defaults to %(default)s)')
    parser.add_argument(
        '--server',
        default="http://localhost:7189",
        help="Cloudera Director server URL (defaults to %(default)s)")
    parser.add_argument(
        '--cm',
        help="The name of the Cloudera Manager server to use in Director")
    parser.add_argument(
        '--environment',
        help="The name of the Environment to use in Director")
    parser.add_argument(
        '--jar',
        help="JAR for the Spark job you want to run on the ephemeral cluster")
    parser.add_argument('--jarclass', help="The --class flag for spark-submit")
    parser.add_argument('--args', help="The arguments for the jar")
    parser.add_argument('--script', help="Script that runs after the Spark job")
    parser.add_argument('config_file', help="Cluster configuration file (.ini)")
    args = parser.parse_args()

    if not isfile(args.config_file):
        print 'Error: "%s" not found or not a file' % args.config_file
        return -1

    config = ConfigParser.SafeConfigParser()
    config.read(args.config_file)

    # Create an authenticated client
    client = cluster.get_authenticated_client(args)

    # Execute cluster creation
    cluster_name = cluster.create_cluster(client, args.environment, args.cm, config)
    print 'Waiting for the cluster to be ready. Check the web interface for details.'
    cluster.wait_for_cluster(client, args.environment, args.cm, cluster_name)

    client = ApiClient(args.server)
    AuthenticationApi(client).login(
        Login(username=args.admin_username, password=args.admin_password))
    clusters = ClustersApi(client)
    eph_cluster = clusters.get(args.environment, args.cm, cluster_name)
    instances = eph_cluster.instances

    # Find which instance is the gateway node
    for instance in instances:
        if str(instance.virtualInstance.template.name) == 'gateway':
            gateway = instance
    gateway = gateway.properties['publicDnsName']
    print("The Gateway url is: " + gateway)

    # Copy the JAR to the gateway
    copy_jar(args.jar, gateway, config)
    # Copy the post-job script to the gateway
    copy_script(args.script, gateway, config)
    # Create a directory in HDFS with the correct permissions
    configure_hdfs(gateway, config)
    # Execute the job
    execute_spark(args.jar, args.jarclass, args.args, gateway, config)
    # Run the post-job script
    execute_script(args.script, gateway, config)

    # Destroy the cluster
    print "Job complete, terminating the instance"
    clusters.delete(args.environment, args.cm, cluster_name)
    return 0
    return dm


if __name__ == "__main__":
    dir_name = sys.argv[1]
    in_dir = os.path.join(dir_name, 'in')
    gt_dir = os.path.join(dir_name, 'gt')
    image_files = os.listdir(in_dir)

    print("loading images")
    images = load_images(image_files, in_dir)
    sm = cluster.build_similarity_matrix(in_dir, image_files)
    dm = build_distance_matrix(in_dir, images)
    c = cluster.create_cluster(sm)

    print("Building RNG")
    # Add every image to the graph, tagged with its cluster label
    count = 0
    for i in images:
        G.add_node(count)
        G.node[count]['im'] = i
        G.node[count]['cluster'] = c.labels_[count]
        count += 1

    # Connect every pair of nodes that are relative neighbours
    for n1 in G:
        print("Finding neighbours for node %d/%d" % (n1 + 1, len(G)), end='\r')
        for n2 in G:
            if n1 < n2 and neighbors(dm, n1, n2):
                G.add_edge(n1, n2)
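# Hedged sketch (illustrative only): the RNG construction above relies on a
# neighbors(dm, n1, n2) helper defined elsewhere in this module. In a relative
# neighbourhood graph, n1 and n2 are connected only if no third point is closer
# to both of them than they are to each other; assuming dm is a symmetric
# distance matrix, that test could look like the hypothetical function below.
def _rng_neighbors_sketch(dm, n1, n2):
    """Return True if n1 and n2 are relative neighbours under distance matrix dm."""
    for n3 in range(len(dm)):
        if n3 in (n1, n2):
            continue
        # A third point "blocks" the edge if it is closer to both endpoints
        # than the endpoints are to each other.
        if max(dm[n1][n3], dm[n2][n3]) < dm[n1][n2]:
            return False
    return True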
tf_matrix = getWordVec(clean_abstract)
print "Vectors are ready.."
print "First Vector", tf_matrix.todense()[0]
print "Second Vector", tf_matrix.todense()[1]

# Remove stop words and non-ASCII characters from every abstract
clean_stop = []
for text in clean_abstract:
    text = ' '.join([word.lower() for word in text.split()
                     if word.lower() not in stopWords])
    text = text.decode('unicode_escape').encode('ascii', 'ignore')
    clean_stop.append(text)

tf_mat_stop = getWordVec(clean_stop)
kmeans1_stop = create_cluster(sparse_data=tf_mat_stop, nclust=6)
trans_mat_stop = kmeans1_stop.transform(tf_mat_stop)

# Group the cleaned abstracts by their cluster label
clust_dict_stop = {}
for i, label in enumerate(kmeans1_stop.labels_):
    if label not in clust_dict_stop:
        clust_dict_stop[label] = []
    clust_dict_stop[label].append(clean_stop[i])
print "cluster_dict created"

keywords = {}
for key in clust_dict_stop:
    word_dict = {}
    for abstract in clust_dict_stop[key]: