Esempio n. 1
0
def train(training_data):
  """
  Trains the model selected by ARGV.model on features extracted from
  training_data.

  Features come from fs.bernoulli when ARGV.features is "bernoulli", and
  from fs.mutualinfo otherwise. When ARGV.one_vs is set, the labels are
  collapsed in place to a binary problem: 1 for samples whose label equals
  labelMap[ARGV.one_vs], 0 for every other sample.

  Returns (model, featureMap, labelMap) for bernoulli features, and
  ((model, scores), featureMap, labelMap) otherwise, where scores is the
  extra value returned by fs.mutualinfo.
  """
  use_bernoulli = ARGV.features == "bernoulli"
  if use_bernoulli:
    features, featureMap, labels, labelMap = fs.bernoulli(training_data)
  else:
    features, scores, featureMap, labels, labelMap = fs.mutualinfo(training_data)
  learner = models[ARGV.model]()
  if ARGV.one_vs:
    # Build the mask before touching the labels: rewriting in two passes
    # against the live array marks every sample as positive whenever the
    # target label's own value is 0.
    is_target = labels == labelMap[ARGV.one_vs]
    labels[~is_target] = 0
    labels[is_target] = 1
  model = learner.train(features, labels)
  if use_bernoulli:
    return (model, featureMap, labelMap)
  return ((model, scores), featureMap, labelMap)
Esempio n. 2
0
def train(training_data):
    """
    Trains the model selected by ARGV.model on features extracted from
    training_data.

    Features come from fs.bernoulli when ARGV.features is "bernoulli", and
    from fs.mutualinfo otherwise. When ARGV.one_vs is set, the labels are
    collapsed in place to a binary problem: 1 for samples whose label
    equals labelMap[ARGV.one_vs], 0 for every other sample.

    Returns (model, featureMap, labelMap) for bernoulli features, and
    ((model, scores), featureMap, labelMap) otherwise, where scores is the
    extra value returned by fs.mutualinfo.
    """
    use_bernoulli = ARGV.features == "bernoulli"
    if use_bernoulli:
        features, featureMap, labels, labelMap = fs.bernoulli(training_data)
    else:
        features, scores, featureMap, labels, labelMap = fs.mutualinfo(
            training_data)
    learner = models[ARGV.model]()
    if ARGV.one_vs:
        # Build the mask before touching the labels: rewriting in two
        # passes against the live array marks every sample as positive
        # whenever the target label's own value is 0.
        is_target = labels == labelMap[ARGV.one_vs]
        labels[~is_target] = 0
        labels[is_target] = 1
    model = learner.train(features, labels)
    if use_bernoulli:
        return (model, featureMap, labelMap)
    return ((model, scores), featureMap, labelMap)
Esempio n. 3
0
def kmeans_summary():
    """Cluster the tweets in ARGV.data with k-means and report each cluster.

    Bernoulli features are extracted from the data and clustered with
    milk.unsupervised.repeated_kmeans, using one cluster per entry of the
    label map. Unless ARGV.no_print is set, the "Tweet" field of every
    item is written to clusters/cluster_<i>, one file per cluster. When
    ARGV.plot is set, a scatter plot of a PCA projection of the features
    is saved to clusters/plot.png.
    """
    print("---* KMeans clustering *---")
    data = DataReader(ARGV.data)
    features, featureMap, labels, labelMap = fs.bernoulli(data)
    # run kmeans with as many clusters as there are labels
    k = len(labelMap)
    cluster_ids, centroids = milk.unsupervised.repeated_kmeans(
        features, k, 3)
    # start outputting
    out_folder = "clusters"
    if not path.exists(out_folder):
        os.mkdir(out_folder)
    print("---* Results *---")
    # plot
    if ARGV.plot:
        import matplotlib.pyplot as plt
        # The projection is only needed for plotting, so it is computed
        # here; without this assignment the plotting code has no
        # pca_features to read.
        pca_features, _ = milk.unsupervised.pca(features)
        colors = "bgrcbgrc"
        marks = "xxxxoooo"
        # NOTE(review): columns 1 and 2 of the projection are plotted
        # (not 0 and 1) -- confirm this is intended.
        xmin = np.min(pca_features[:, 1])
        xmax = np.max(pca_features[:, 1])
        ymin = np.min(pca_features[:, 2])
        ymax = np.max(pca_features[:, 2])
        print([xmin, xmax, ymin, ymax])
        plt.axis([xmin, xmax, ymin, ymax])
    # printing
    for i in range(k):
        if not ARGV.no_print:
            out_file = path.join(out_folder, "cluster_{}".format(i))
            print("Writing to: {}".format(out_file))
            with open(out_file, 'w') as out:
                for j, tweetinfo in enumerate(data):
                    if cluster_ids[j] == i:
                        out.write(str(tweetinfo["Tweet"]) + "\n")
        if ARGV.plot:
            members = cluster_ids == i
            # Wrap around the style tables so more than eight clusters
            # reuse styles instead of raising IndexError.
            style = colors[i % len(colors)] + marks[i % len(marks)]
            plt.plot(pca_features[members, 1], pca_features[members, 2],
                     style)
    if ARGV.plot:
        plot_file = path.join(out_folder, "plot.png")
        print("Writing to: {}".format(plot_file))
        plt.savefig(plot_file)
Esempio n. 4
0
def kmeans_summary():
  """Cluster the tweets in ARGV.data with k-means and report each cluster.

  Bernoulli features are extracted from the data and clustered with
  milk.unsupervised.repeated_kmeans, using one cluster per entry of the
  label map. Unless ARGV.no_print is set, the "Tweet" field of every item
  is written to clusters/cluster_<i>, one file per cluster. When ARGV.plot
  is set, a scatter plot of a PCA projection of the features is saved to
  clusters/plot.png.
  """
  print("---* KMeans clustering *---")
  data = DataReader(ARGV.data)
  features, featureMap, labels, labelMap = fs.bernoulli(data)
  # run kmeans with as many clusters as there are labels
  k = len(labelMap)
  cluster_ids, centroids = milk.unsupervised.repeated_kmeans(features, k, 3)
  # start outputting
  out_folder = "clusters"
  if not path.exists(out_folder):
    os.mkdir(out_folder)
  print("---* Results *---")
  # plot
  if ARGV.plot:
    import matplotlib.pyplot as plt
    # The projection is only needed for plotting, so it is computed here;
    # without this assignment the plotting code has no pca_features to read.
    pca_features, _ = milk.unsupervised.pca(features)
    colors = "bgrcbgrc"
    marks = "xxxxoooo"
    # NOTE(review): columns 1 and 2 of the projection are plotted (not 0
    # and 1) -- confirm this is intended.
    xmin = np.min(pca_features[:, 1])
    xmax = np.max(pca_features[:, 1])
    ymin = np.min(pca_features[:, 2])
    ymax = np.max(pca_features[:, 2])
    print([xmin, xmax, ymin, ymax])
    plt.axis([xmin, xmax, ymin, ymax])
  # printing
  for i in range(k):
    if not ARGV.no_print:
      out_file = path.join(out_folder, "cluster_{}".format(i))
      print("Writing to: {}".format(out_file))
      with open(out_file, 'w') as out:
        for j, tweetinfo in enumerate(data):
          if cluster_ids[j] == i:
            out.write(str(tweetinfo["Tweet"]) + "\n")
    if ARGV.plot:
      members = cluster_ids == i
      # Wrap around the style tables so more than eight clusters reuse
      # styles instead of raising IndexError.
      style = colors[i % len(colors)] + marks[i % len(marks)]
      plt.plot(pca_features[members, 1], pca_features[members, 2], style)
  if ARGV.plot:
    plot_file = path.join(out_folder, "plot.png")
    print("Writing to: {}".format(plot_file))
    plt.savefig(plot_file)