Example no. 1
def extract_training_data(self):
    # Read the domain list and the per-domain questions from the XML input.
    obj = input_from_xml()
    domains, questions = obj.fetch_input_from_xml_questions()
    train_target_names = domains
    train_data = []
    train_target = []
    # Each question becomes one training sample; its target is the index
    # of the domain it belongs to.
    for i, domain in enumerate(domains):
        for quest in questions[domain]:
            train_data.append(quest)
            train_target.append(i)
    return train_data, train_target, train_target_names
Example no. 2
def data_from_svm(self):
    # Requires `import time` and `import zmq`; `threshold_score` is assumed
    # to be a module-level constant defined elsewhere.
    questions, predict, classes, scores, domains = main_function()
    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    socket.connect("tcp://127.0.0.1:5000")

    for i in range(len(questions)):
        quest = questions[i]
        train_questions_indices = []
        # Highest decision-function score for this question across classes.
        max_score = -4.0
        for score in scores[i]:
            max_score = max(score, max_score)
        if max_score >= threshold_score:
            print("i am here in one")
            # Confident prediction: keep only the indices of the training
            # questions that belong to the predicted domain.
            domain = domains[predict[i]]
            temp = input_from_xml()
            dmns, quests = temp.fetch_input_from_xml_questions()

            cnt = 1
            for k, v in quests.items():
                for _ in v:
                    if k == domain:
                        train_questions_indices.append(cnt)
                    cnt += 1
        else:
            print("i am here in two")
            # Low confidence: fall back to every parse tree in the file.
            with open("kernel_trees.txt", "r") as fname:
                for line_cnt, line in enumerate(fname, start=1):
                    train_questions_indices.append(line_cnt)

        temp2 = convert_training_data()
        parse_quest = temp2.convert_to_parse_tree(quest)
        print(len(parse_quest))
        msg = str(parse_quest)

        # "$"-separated list of the selected training-question indices.
        msg2 = "".join(str(idx) + "$" for idx in train_questions_indices)

        main_msg = msg + "\n" + msg2
        socket.send_string(main_msg)
        # A REQ socket must receive the reply before it can send again.
        socket.recv()
        time.sleep(1000)  # 1000 seconds; possibly intended as milliseconds
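For Example 2 to make progress, a matching REP endpoint must be listening on port 5000. The actual server is not shown in these examples; the following is only a hypothetical sketch of the receiving side, assuming the two-line message format built above (parse tree on the first line, "$"-separated indices on the second) and an arbitrary "ok" acknowledgement:

import zmq

context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind("tcp://127.0.0.1:5000")

while True:
    msg = socket.recv_string()
    # Line 1: the question's parse tree; line 2: "$"-separated indices.
    parse_tree, indices_line = msg.split("\n", 1)
    indices = [int(x) for x in indices_line.split("$") if x]
    # REQ/REP lockstep: the client blocks until it gets this reply.
    socket.send_string("ok")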
Example no. 3
def extract_testing_data(self):
    # SMS queries come back as (domain, question, indomain) triples;
    # domain may be empty when the query's class is unknown.
    obj = input_from_xml()
    questions = obj.fetch_sms_queries()

    test_data = []
    test_target = []
    test_target_names = []

    # First pass: collect the distinct, non-empty domain names.
    for domain, quest, indomain in questions:
        if domain and domain not in test_target_names:
            test_target_names.append(domain)

    # Second pass: map each question to its domain index, or -1 if unlabeled.
    for domain, quest, indomain in questions:
        if domain:
            test_target.append(test_target_names.index(domain))
        else:
            test_target.append(-1)
        test_data.append(quest)

    return test_data, test_target
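Callers can use the -1 sentinel to split labeled from unlabeled queries. A minimal usage sketch, assuming the extract_data wrapper class that appears in Example 4:

obj = extract_data()
test_data, test_target = obj.extract_testing_data()
labeled = [(q, t) for q, t in zip(test_data, test_target) if t != -1]
unlabeled = [q for q, t in zip(test_data, test_target) if t == -1]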
Example no. 4
def main_function():
  # Display progress logs on stdout
  logging.basicConfig(level=logging.INFO,
                      format='%(asctime)s %(levelname)s %(message)s')


  # parse commandline arguments
  op = OptionParser()
  op.add_option("--report",
                action="store_true", dest="print_report",
                help="Print a detailed classification report.")
  op.add_option("--chi2_select",
                action="store", type="int", dest="select_chi2",
                help="Select some number of features using a chi-squared test")
  op.add_option("--confusion_matrix",
                action="store_true", dest="print_cm",
                help="Print the confusion matrix.")
  op.add_option("--top10",
                action="store_true", dest="print_top10",
                help="Print ten most discriminative terms per class"
                     " for every classifier.")
  op.add_option("--all_categories",
                action="store_true", dest="all_categories",
                help="Whether to use all categories or not.")
  op.add_option("--use_hashing",
                action="store_true",
                help="Use a hashing vectorizer.")
  op.add_option("--n_features",
                action="store", type=int, default=2 ** 16,
                help="n_features when using the hashing vectorizer.")
  op.add_option("--filtered",
                action="store_true",
                help="Remove newsgroup information that is easily overfit: "
                     "headers, signatures, and quoting.")
  (opts, args) = op.parse_args()
  if len(args) > 0:
      op.error("this script takes no arguments.")
      sys.exit(1)

  print(__doc__)
  op.print_help()
  print()


  ###############################################################################
  # Load some categories from the training set
  if opts.all_categories:
      categories = None
  else:
      obj = input_from_xml()
      domains,questions = obj.fetch_input_from_xml_questions()
      categories = domains

  print("Loading Data from different categories....")
  print(categories if categories else "all")


  obj = extract_data()
  train_data,train_target,train_target_names = obj.extract_training_data()
  test_data,test_target = obj.extract_testing_data()


  print('data loaded')
  # print(train_target_names)

  def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

  data_train_size_mb = size_mb(train_data)
  data_test_size_mb = size_mb(test_data)

  print("%d documents - %0.3fMB (training set)" % (
     len(train_data), data_train_size_mb))
  print("%d documents - %0.3fMB (test set)" % (
     len(test_data), data_test_size_mb))
  print("%d categories" % len(categories))
  print()


  categories = train_target_names 
  y_train,y_test = train_target,test_target

  print("Extracting features from the training dataset using a sparse vectorizer")
  t0 = time()
  if opts.use_hashing:
      vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                     n_features=opts.n_features)
      X_train = vectorizer.transform(train_data)
  else:
      vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                   stop_words='english')
      X_train = vectorizer.fit_transform(train_data)
  duration = time() - t0
  print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
  print("n_samples: %d, n_features: %d" % X_train.shape)
  print()

  print("Extracting features from the test dataset using the same vectorizer")
  t0 = time()
  X_test = vectorizer.transform(test_data)
  duration = time() - t0
  print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
  print("n_samples: %d, n_features: %d" % X_test.shape)
  print()

  if opts.select_chi2:
      print("Extracting %d best features by a chi-squared test" %
            opts.select_chi2)
      t0 = time()
      ch2 = SelectKBest(chi2, k=opts.select_chi2)
      X_train = ch2.fit_transform(X_train, y_train)
      X_test = ch2.transform(X_test)
      print("done in %fs" % (time() - t0))
      print()

  def trim(s):
      """Trim string to fit on terminal (assuming 80-column display)"""
      return s if len(s) <= 80 else s[:77] + "..."

  if opts.use_hashing:
      feature_names = None
  else:
      feature_names = np.asarray(vectorizer.get_feature_names())

  loop = 1
  C = 1.0
  while True:
    print('_' * 80)
    print("Training the data: ")
    t0 = time()
    clf = LinearSVC(C=C)
    clf.fit(X_train,y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    
    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
#    return test_data, pred, y_test, scores, train_target_names
    # Self-training step: fold the test set into the training set and
    # re-vectorize. Note the same test data is appended on every pass.
    train_data = train_data + test_data
    train_target = train_target + test_target
    X_train = vectorizer.transform(train_data)
    y_train = train_target

    # Score only the labeled test items (y_test != -1): cnt counts correct
    # labeled predictions, cnt2 counts all labeled items.
    cnt = 0
    cnt2 = 0
    for i in range(len(y_test)):
      if y_test[i] != -1:
        if pred[i] == y_test[i]:
          cnt = cnt + 1
        cnt2 = cnt2 + 1

    print(cnt, cnt2)
    # Unlabeled items (len(y_test) - cnt2 of them) are counted as correct.
    print("efficiency of classification in loop %d is: %f"
          % (loop, (cnt + len(y_test) - cnt2) / (1.0 * len(y_test)) * 100))


    '''
    for i in range(0,10):
        print "quest: ",test_data[i]
        print "Given class: %s" %(train_target_names[y_test[i]])
        print "Predicted class: %s" %(train_target_names[pred[i]]) 
        print "#####################"
    '''
    loop = loop + 1
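The accuracy block near the end of Example 4 scores only the labeled test items and treats every unlabeled (-1) item as correct. An equivalent vectorized sketch with numpy, under the same pred/y_test shapes as above:

import numpy as np

y = np.asarray(y_test)
p = np.asarray(pred)
labeled = y != -1
correct = np.sum(p[labeled] == y[labeled])
# Unlabeled items count as correct, matching the in-loop formula.
efficiency = (correct + np.sum(~labeled)) / float(len(y)) * 100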
Example no. 5
def main_function():
  # Display progress logs on stdout
  logging.basicConfig(level=logging.INFO,
                      format='%(asctime)s %(levelname)s %(message)s')


  # parse commandline arguments
  op = OptionParser()
  op.add_option("--report",
                action="store_true", dest="print_report",
                help="Print a detailed classification report.")
  op.add_option("--chi2_select",
                action="store", type="int", dest="select_chi2",
                help="Select some number of features using a chi-squared test")
  op.add_option("--confusion_matrix",
                action="store_true", dest="print_cm",
                help="Print the confusion matrix.")
  op.add_option("--top10",
                action="store_true", dest="print_top10",
                help="Print ten most discriminative terms per class"
                     " for every classifier.")
  op.add_option("--all_categories",
                action="store_true", dest="all_categories",
                help="Whether to use all categories or not.")
  op.add_option("--use_hashing",
                action="store_true",
                help="Use a hashing vectorizer.")
  op.add_option("--n_features",
                action="store", type=int, default=2 ** 16,
                help="n_features when using the hashing vectorizer.")
  op.add_option("--filtered",
                action="store_true",
                help="Remove newsgroup information that is easily overfit: "
                     "headers, signatures, and quoting.")
  (opts, args) = op.parse_args()
  if len(args) > 0:
      op.error("this script takes no arguments.")
      sys.exit(1)

  print(__doc__)
  op.print_help()
  print()


  ###############################################################################
  # Load some categories from the training set
  if opts.all_categories:
      categories = None
  else:
      obj = input_from_xml()
      domains,questions = obj.fetch_input_from_xml_questions()
      categories = domains

  print("Loading Data from different categories....")
  print(categories if categories else "all")


  obj = extract_data()
  train_data,train_target,train_target_names = obj.extract_training_data()
  test_data,test_target = obj.extract_testing_data()


  print('data loaded')
  # print(train_target_names)

  def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

  data_train_size_mb = size_mb(train_data)
  data_test_size_mb = size_mb(test_data)

  print("%d documents - %0.3fMB (training set)" % (
     len(train_data), data_train_size_mb))
  print("%d documents - %0.3fMB (test set)" % (
     len(test_data), data_test_size_mb))
  print("%d categories" % len(categories))
  print()


  categories = train_target_names 
  y_train,y_test = train_target,test_target

  print("Extracting features from the training dataset using a sparse vectorizer")
  t0 = time()
  if opts.use_hashing:
      vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                     n_features=opts.n_features)
      X_train = vectorizer.transform(train_data)
  else:
      vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                   stop_words='english')
      X_train = vectorizer.fit_transform(train_data)
  duration = time() - t0
  print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
  print("n_samples: %d, n_features: %d" % X_train.shape)
  print()

  print("Extracting features from the test dataset using the same vectorizer")
  t0 = time()
  X_test = vectorizer.transform(test_data)
  duration = time() - t0
  print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
  print("n_samples: %d, n_features: %d" % X_test.shape)
  print()

  if opts.select_chi2:
      print("Extracting %d best features by a chi-squared test" %
            opts.select_chi2)
      t0 = time()
      ch2 = SelectKBest(chi2, k=opts.select_chi2)
      X_train = ch2.fit_transform(X_train, y_train)
      X_test = ch2.transform(X_test)
      print("done in %fs" % (time() - t0))
      print()

  def trim(s):
      """Trim string to fit on terminal (assuming 80-column display)"""
      return s if len(s) <= 80 else s[:77] + "..."

  if opts.use_hashing:
      feature_names = None
  else:
      feature_names = np.asarray(vectorizer.get_feature_names())


  loop = 1
  while True:
    results = []
    # `benchmark` is assumed to be defined elsewhere and to return a
    # (classifier name, score, training time, test time) tuple.
    results.append(benchmark(X_train, y_train, X_test, y_test))

    # As in Example 4, fold the test set back into the training set and
    # re-vectorize before the next iteration.
    train_data = train_data + test_data
    train_target = train_target + test_target
    X_train = vectorizer.transform(train_data)
    y_train = train_target

    if loop == 1:
      print('=' * 80)
      indices = np.arange(len(results))
      results = [[x[i] for x in results] for i in range(4)]

      clf_names, score, training_time, test_time = results
      training_time = np.array(training_time) / np.max(training_time)
      test_time = np.array(test_time) / np.max(test_time)

      # `pl` is assumed to be matplotlib's pylab (e.g. `import pylab as pl`).
      pl.figure(figsize=(8, 4))
      pl.title("Score")
      pl.barh(indices, score, .2, label="score", color='r')
      pl.barh(indices + .3, training_time, .2, label="training time", color='g')
      pl.barh(indices + .6, test_time, .2, label="test time", color='b')
      pl.yticks(())
      pl.legend(loc='best')
      pl.subplots_adjust(left=.25)
      pl.subplots_adjust(top=.95)
      pl.subplots_adjust(bottom=.05)

      for i, c in zip(indices, clf_names):
          pl.text(-.3, i, c)

      pl.show()
      loop = loop + 1
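The plotting code in Example 5 unpacks four fields per result, so benchmark, which is never defined in these examples, presumably returns a (classifier name, score, training time, test time) tuple. A hypothetical reconstruction consistent with that shape, reusing the LinearSVC setup from Example 4:

from time import time
from sklearn import metrics
from sklearn.svm import LinearSVC

def benchmark(X_train, y_train, X_test, y_test):
    # Hypothetical sketch: train a LinearSVC and time both phases.
    clf = LinearSVC(C=1.0)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    score = metrics.accuracy_score(y_test, pred)
    return clf.__class__.__name__, score, train_time, test_time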