Example #1
0
 def load_dictionary(self, path):
     """
     Load the dictionary from hdfs path.

     Resets self.dictionary and fills it, in file order, with the first
     space-separated field of every line read from the file.
     @param path: location of the dictionary file, handed to HDFSUtil.read_lines
     """
     # A single-argument parenthesised print produces the same output under
     # the Python 2 print statement and the Python 3 print function.
     print("Loading dictionary from: %s" % path)
     self.dictionary = []
     for line in HDFSUtil.read_lines(path, hadoop_prefix=HADOOP_PREFIX):
         # Strip the line terminator before splitting so that a line holding
         # only the term (no space) is not stored with a trailing newline.
         term = line.rstrip("\r\n").split(" ")[0]
         self.dictionary.append(term)
     print("Succeeded in loading dictionary.")
Example #2
0
 def load_dictionary(self, path):
     """
     Load the dictionary from hdfs path.

     Resets self.dictionary and fills it, in file order, with the first
     space-separated field of every line read from the file.
     @param path: location of the dictionary file, handed to HDFSUtil.read_lines
     """
     # A single-argument parenthesised print produces the same output under
     # the Python 2 print statement and the Python 3 print function.
     print("Loading dictionary from: %s" % path)
     self.dictionary = []
     for line in HDFSUtil.read_lines(path, hadoop_prefix=HADOOP_PREFIX):
         # Strip the line terminator before splitting so that a line holding
         # only the term (no space) is not stored with a trailing newline.
         term = line.rstrip("\r\n").split(" ")[0]
         self.dictionary.append(term)
     print("Succeeded in loading dictionary.")
Example #3
0
 def load_distance(self, path, initial=False):
     """
     Read distance records from the file at the given hdfs path.

     When initial is true every line is kept. Otherwise only lines that
     begin with "distance" are kept, and everything up to and including
     the first tab is dropped from them. Trailing whitespace is removed
     from each kept line.
     @param path: location of the file, handed to HDFSUtil.read_lines
     @param initial: whether the file holds plain, un-prefixed records
     @return list of the kept strings, in file order
     """
     collected = []
     for record in HDFSUtil.read_lines(path, hadoop_prefix=HADOOP_PREFIX):
         if initial:
             payload = record
         elif record.startswith("distance"):
             # Keep only the part that follows the first tab.
             payload = record.split("\t", 1)[1]
         else:
             # Record carries a different prefix; not a distance entry.
             continue
         collected.append(payload.rstrip())
     return collected
Example #4
0
 def load_distance(self, path, initial=False):
     """
     Read distance records from the file at the given hdfs path.

     When initial is true every line is kept. Otherwise only lines that
     begin with "distance" are kept, and everything up to and including
     the first tab is dropped from them. Trailing whitespace is removed
     from each kept line.
     @param path: location of the file, handed to HDFSUtil.read_lines
     @param initial: whether the file holds plain, un-prefixed records
     @return list of the kept strings, in file order
     """
     collected = []
     for record in HDFSUtil.read_lines(path, hadoop_prefix=HADOOP_PREFIX):
         if initial:
             payload = record
         elif record.startswith("distance"):
             # Keep only the part that follows the first tab.
             payload = record.split("\t", 1)[1]
         else:
             # Record carries a different prefix; not a distance entry.
             continue
         collected.append(payload.rstrip())
     return collected
Example #5
0
 def load_cluster(self, path, initial=False):
     """
     Read cluster centers from the file at the given hdfs path.

     When initial is true every line is used. Otherwise only lines that
     begin with "cluster" are used, and everything up to and including
     the first tab is dropped from them. Each used line is handed to the
     read() method of a freshly created Cluster.
     @param path: location of the file, handed to HDFSUtil.read_lines
     @param initial: whether the file holds plain, un-prefixed records
     @return list of the Cluster objects, in file order
     """
     centers = []
     for record in HDFSUtil.read_lines(path, hadoop_prefix=HADOOP_PREFIX):
         if initial:
             payload = record
         elif record.startswith("cluster"):
             # Keep only the part that follows the first tab.
             payload = record.split("\t", 1)[1]
         else:
             # Record carries a different prefix; not a cluster entry.
             continue
         center = Cluster()
         center.read(payload)
         centers.append(center)
     return centers
Example #6
0
 def load_cluster(self, path, initial=False):
     """
     Read cluster centers from the file at the given hdfs path.

     When initial is true every line is used. Otherwise only lines that
     begin with "cluster" are used, and everything up to and including
     the first tab is dropped from them. Each used line is handed to the
     read() method of a freshly created Cluster.
     @param path: location of the file, handed to HDFSUtil.read_lines
     @param initial: whether the file holds plain, un-prefixed records
     @return list of the Cluster objects, in file order
     """
     centers = []
     for record in HDFSUtil.read_lines(path, hadoop_prefix=HADOOP_PREFIX):
         if initial:
             payload = record
         elif record.startswith("cluster"):
             # Keep only the part that follows the first tab.
             payload = record.split("\t", 1)[1]
         else:
             # Record carries a different prefix; not a cluster entry.
             continue
         center = Cluster()
         center.read(payload)
         centers.append(center)
     return centers
 def main(self):
     """
     Load the cluster centers for the current iteration into
     self.clusters, then pass every item produced by
     self.read_input(sys.stdin) to self.map.

     For iteration 1 the centers come from cluster0/cluster0.txt and all
     lines are used; for later iterations they come from the previous
     iteration's part-00000 output, where only lines starting with
     "cluster" are used, with the text up to the first tab removed.
     """
     base = self.kmeans_hdfs_path
     if self.iteration == 1:
         path = base + "/cluster0/cluster0.txt"
     else:
         previous = str(self.iteration - 1)
         path = base + "/output/cluster" + previous + "/part-00000"

     for record in HDFSUtil.read_lines(path, hadoop_prefix=self.hadoop_prefix):
         if self.iteration > 1:
             if not record.startswith("cluster"):
                 # Record carries a different prefix; not a cluster entry.
                 continue
             # Keep only the part that follows the first tab.
             record = record.split("\t", 1)[1]
         center = Cluster()
         center.read(record)
         self.clusters.append(center)

     for item in self.read_input(sys.stdin):
         self.map(item)
Example #8
0
 def main(self):
     """
     Load the cluster centers for the current iteration into
     self.clusters, then pass every item produced by
     self.read_input(sys.stdin) to self.map.

     For iteration 1 the centers come from cluster0/cluster0.txt and all
     lines are used; for later iterations they come from the previous
     iteration's part-00000 output, where only lines starting with
     "cluster" are used, with the text up to the first tab removed.
     """
     base = self.kmeans_hdfs_path
     if self.iteration == 1:
         path = base + "/cluster0/cluster0.txt"
     else:
         previous = str(self.iteration - 1)
         path = base + "/output/cluster" + previous + "/part-00000"

     for record in HDFSUtil.read_lines(path, hadoop_prefix=self.hadoop_prefix):
         if self.iteration > 1:
             if not record.startswith("cluster"):
                 # Record carries a different prefix; not a cluster entry.
                 continue
             # Keep only the part that follows the first tab.
             record = record.split("\t", 1)[1]
         center = Cluster()
         center.read(record)
         self.clusters.append(center)

     for item in self.read_input(sys.stdin):
         self.map(item)