Ejemplo n.º 1
0
    def classify(self, topic):
        """Classify a single topic record.

        The title and body text are concatenated and sent through the text
        hasher; the tags longer than one character are joined with spaces
        and sent through the tag hasher. The two results are then glued
        together row by row and handed to the analyser built from
        ``self.clf``.

        Args:
            topic: mapping providing the 'title', 'topic' and 'tags' entries.

        Returns:
            Whatever the analyser returns for the assembled feature rows.
        """
        # Build the processing callables up front (hashers with learn=False)
        text_to_vec = texthasher.hash(self.topicHasher, learn=False)
        tags_to_vec = taghasher.hash(self.tagHasher, learn=False)
        predict = cluster.analyze(self.clf)

        text_vec = text_to_vec(str(topic['title'] + topic['topic']))
        tag_line = ' '.join(tag for tag in topic['tags'] if len(tag) > 1)
        tag_vec = tags_to_vec(tag_line)

        # Glue every text row to the tag row at the same position
        features = []
        for text_row, tag_row in zip(list(text_vec), list(tag_vec)):
            features.append(list(text_row) + list(tag_row))

        return predict(features)
Ejemplo n.º 2
0
  def classify(self,topic):
    """
    Classify one topic.

    Text features come from hashing title + topic body, tag features from
    hashing the space-joined tags (tags of length <= 1 are dropped). Each
    text row is extended with the matching tag row before the rows are
    passed to the analyser created from `self.clf`.

    `topic` must provide the keys 'title', 'topic' and 'tags'.
    Returns the analyser's output unchanged.
    """
    # Processing functions, created before any data is touched
    vectorise_text = texthasher.hash(self.topicHasher,learn=False)
    vectorise_tags = taghasher.hash(self.tagHasher,learn=False)
    analyse = cluster.analyze(self.clf)

    full_text = str(topic['title'] + topic['topic'])
    hashed_text = vectorise_text(full_text)

    kept_tags = [tag for tag in topic['tags'] if len(tag)>1]
    hashed_tags = vectorise_tags(' '.join(kept_tags))

    # One combined row per (text row, tag row) pair
    paired = zip(list(hashed_text),list(hashed_tags))
    rows = [list(text_part)+list(tag_part) for text_part,tag_part in paired]

    return analyse(rows)
Ejemplo n.º 3
0
	def _classify(textsrc):
		"""
		Run the classification pipeline over a batch of records.

		For every record the text part (`take_x1`) is hashed and clustered,
		the tag part (`take_tags`) is hashed, and the three pieces are
		concatenated as [tags, cluster, text] before being handed to the
		analyser built from `clf`.

		`topicHasher`, `contentClf`, `tagHasher` and `clf` are taken from
		the enclosing scope.

		Args:
			textsrc: iterable of records; may be a one-shot iterator.

		Returns:
			zip of (prediction, feature row) pairs.
		"""
		print(colored('[Classifying]...','green'))
		# Prepare operations
		hashMe     = texthasher.hash(topicHasher,learn=False)
		clusterMe  = textcluster.classify(contentClf,learn=False)
		hashtagMe  = taghasher.hash(tagHasher,learn=False)
		classifyMe = cluster.analyze(clf)

		# The records are walked twice below (text, then tags). Materialise
		# them once so a generator/iterator input is not exhausted by the
		# first pass, which would silently leave the tag pass empty.
		records = list(textsrc)

		iterX = DP.pipe(
			[take_x1(x) for x in records],
			dests=None,
			transform=hashMe,
			title='Vectorisation'
		)

		# Keep the vectors: they feed both the clustering and the final join
		vecX = [x for x in iterX]

		clusters = DP.pipe(
			list(vecX),
			dests=None,
			transform=clusterMe,
			title='Clustering'
		)

		vectags   = DP.pipe(
			[take_tags(x) for x in records],
			dests=None,
			transform=hashtagMe,
			title='Tag Vectorising'
		)

		XS = zip(
			list(vectags),
			[[i] for i in clusters], # Make scalar a single-element vector
			list(vecX)
		)
		X = [list(a) + list(b) + list(c) for a,b,c in XS]

		# Analyse
		Y_ = classifyMe(X)

		# Returns the results as tuples
		return zip(Y_,X)
Ejemplo n.º 4
0
    def _classify(textsrc):
        """Run the classification pipeline over a batch of records.

        For every record the text part (``take_x1``) is hashed and
        clustered, the tag part (``take_tags``) is hashed, and the three
        pieces are concatenated as [tags, cluster, text] before being handed
        to the analyser built from ``clf``.

        ``topicHasher``, ``contentClf``, ``tagHasher`` and ``clf`` are taken
        from the enclosing scope.

        Args:
            textsrc: iterable of records; may be a one-shot iterator.

        Returns:
            zip of (prediction, feature row) pairs.
        """
        print(colored('[Classifying]...', 'green'))
        # Prepare operations
        hashMe = texthasher.hash(topicHasher, learn=False)
        clusterMe = textcluster.classify(contentClf, learn=False)
        hashtagMe = taghasher.hash(tagHasher, learn=False)
        classifyMe = cluster.analyze(clf)

        # The records are walked twice below (text, then tags). Materialise
        # them once so a generator/iterator input is not exhausted by the
        # first pass, which would silently leave the tag pass empty.
        records = list(textsrc)

        iterX = DP.pipe([take_x1(x) for x in records],
                        dests=None,
                        transform=hashMe,
                        title='Vectorisation')

        # Keep the vectors: they feed both the clustering and the final join
        vecX = [x for x in iterX]

        clusters = DP.pipe(list(vecX),
                           dests=None,
                           transform=clusterMe,
                           title='Clustering')

        vectags = DP.pipe([take_tags(x) for x in records],
                          dests=None,
                          transform=hashtagMe,
                          title='Tag Vectorising')

        XS = zip(
            list(vectags),
            [[i] for i in clusters],  # Make scalar a single-element vector
            list(vecX))
        X = [list(a) + list(b) + list(c) for a, b, c in XS]

        # Analyse
        Y_ = classifyMe(X)

        # Returns the results as tuples
        return zip(Y_, X)
Ejemplo n.º 5
0
def train_sentiment_capture(stopwords,save=False):
	"""
	Train the sentiment classifier and report its accuracy on the training set.

	STEP#1 :: Cluster topic with unsupervised classification

		X1 text ---> [@cluster] ----> (y1 group, X1 text)

	STEP#2 :: Combine topic, tags, and group to make feature vector

		X2 <--  [tags, y1, X1]
		Y2 <--  Sentiment score

	STEP#3 :: Train the classification

		(Y2,X2) -----> [@classification] ----> @model

	Args:
		stopwords: forwarded to `texthasher.safe_load` as `stop_words`.
		save: when true, the topic hasher, content clusterer, tag hasher
			and classifier are written back to their respective paths.

	Reads 'dim', 'kcluster' and 'tagdim' from the `args` mapping of the
	enclosing scope, prints progress and results, and appends one row to
	CSV_REPORT_PATH. Returns nothing.
	"""

	print(colored('==============================','cyan'))
	print(colored('  SENTIMENT TRAINING','cyan'))
	print()
	print(colored('  DIM   : {0}'.format(args['dim']),'cyan'))
	print(colored('  K     : {0}'.format(args['kcluster']),'cyan'))
	print(colored('  TAG   : {0}'.format(args['tagdim']),'cyan'))
	print(colored('==============================','cyan'))

	# STEP#1
	#------------------------------------
	# Vectorise the input topic (text only)
	mqx1     = rabbit.create('localhost','pantip-x1')
	topicHasher = texthasher.safe_load(
		TEXT_VECTORIZER_PATH,
		n_components=args['dim'],
		stop_words=stopwords,
		decomposition='SVD'
	)
	hashMe = texthasher.hash(topicHasher,learn=True)

	print(colored('#STEP-1 started ...','cyan'))
	print('hasher : {0}'.format(topicHasher))
	iterX = DP.pipe(
		rabbit.iter(mqx1,take_x1),
		dests=None,
		transform=hashMe,
		title='Vectorisation'
	)

	# NOTE(review): the queue is closed before `iterX` is consumed below;
	# this is only safe if DP.pipe has already drained the iterator -- confirm.
	rabbit.end(mqx1)

	vecX = [x for x in iterX]

	# Cluster the vectorised records with unsupervised clf
	contentClf = textcluster.safe_load(
		CONTENT_CLUSTER_PATH,
		n_labels=args['kcluster']
	)
	clusterMe  = textcluster.classify(contentClf,learn=True)

	# Classification doesn't accept a generator,
	# So we need to roll the matrix out of the MQ
	clusters = DP.pipe(
		[x for x in vecX],
		dests=None,
		transform=clusterMe,
		title='Clustering'
	)

	print(colored('#STEP-1 finished ...','cyan'))


	# STEP#2
	# ---------------------------------------------
	# Vectorise tags

	# Convert tags into a numeric vector
	tagHasher = taghasher.safe_load(
		TAG_HASHER_PATH,
		n_feature=args['tagdim']
	)
	mqx2      = rabbit.create('localhost','pantip-x2')
	hashtagMe = taghasher.hash(tagHasher,learn=True)
	vectags   = DP.pipe(
		[tag for tag in rabbit.iter(mqx2,take_tags)],
		dests=None,
		transform=hashtagMe,
		title='Tag Vectorising'
	)

	rabbit.end(mqx2)

	# STEP#3
	#----------------------------------------
	# Join each of the component together
	# Assemble a training vector
	mqy = rabbit.create('localhost','pantip-x3')
	Y = [y for y in rabbit.iter(mqy,take_sentiment_score)]

	XS = zip(
		list(vectags),
		[[i] for i in clusters], # Make scalar a single-element vector
		list(vecX)
	)

	X = [list(a) + list(b) + list(c) for a,b,c in XS]

	rabbit.end(mqy)

	# Train!
	print(colored('Training process started...','cyan'))


	clf     = cluster.safe_load(CLF_PATH)
	trainMe = cluster.analyze(clf,labels=Y)
	# The predictions are walked several times below; materialise them once
	# so a lazy result is not exhausted after the first pass.
	Y_      = list(trainMe(X))
	print(colored('[DONE]','yellow'))

	# Self-validation
	num_correct  = len([1 for y,y0 in zip(Y_,Y) if y==y0])
	predict_rate = 100*float(num_correct)/float(len(Y))
	print(colored('====== TRAINING LABELS =====','magenta'))
	print(Y)
	print(colored('========= PREDICTED ========','magenta'))
	print(Y_)
	print(colored('=========== RESULTS ========','magenta'))
	print('    overall accuracy:   {0:.2f} %'.format(predict_rate))

	# Report accuracy by each of the labels.
	# The classes are taken from the training labels (not the predictions):
	# every bucket then holds at least one sample, so the division below
	# cannot hit zero, and classes the model never predicted are still
	# reported.
	labels = list(set(Y))
	lbl_predict_rate = []
	for lbl in labels:
		samples = [(y,y0) for y,y0 in zip(Y_,Y) if y0==lbl]
		num_correct = len([1 for y,y0 in samples if y==y0])
		num_all     = len(samples)
		accuracy    = 100*float(num_correct)/float(num_all)

		print('    accuracy class #{0} :    {1:.2f} % (out of {2} cases)'.format(lbl,accuracy,num_all))
		lbl_predict_rate.append('{0:.2f}'.format(accuracy).center(7))


	# Record the training accuracy to the CSV
	with open(CSV_REPORT_PATH,'a') as report:
		report.write('{0},{1},{2},{3},{4}\n'.format(
			str(args['dim']).center(4), #0
			str(args['kcluster']).center(3), #1,
			str(args['tagdim']).center(5), #2
			'{0:.2f}'.format(predict_rate).center(7), #3
			','.join(lbl_predict_rate) #4
		))


	#Save the trained models
	if save:
		print(colored('Saving models...','cyan'))
		texthasher.save(topicHasher,TEXT_VECTORIZER_PATH)
		textcluster.save(contentClf,CONTENT_CLUSTER_PATH)
		taghasher.save(tagHasher,TAG_HASHER_PATH)
		cluster.save(clf,CLF_PATH)
		print(colored('[DONE]','green'))
Ejemplo n.º 6
0
def train_sentiment_capture(stopwords, save=False):
    """Train the sentiment classifier and report training-set accuracy.

    STEP#1 :: Cluster topic with unsupervised classification

        X1 text ---> [@cluster] ----> (y1 group, X1 text)

    STEP#2 :: Combine topic, tags, and group to make feature vector

        X2 <--  [tags, y1, X1]
        Y2 <--  Sentiment score

    STEP#3 :: Train the classification

        (Y2,X2) -----> [@classification] ----> @model

    Args:
        stopwords: forwarded to ``texthasher.safe_load`` as ``stop_words``.
        save: when true, the topic hasher, content clusterer, tag hasher
            and classifier are written back to their respective paths.

    Reads 'dim', 'kcluster' and 'tagdim' from the ``args`` mapping of the
    enclosing scope, prints progress and results, and appends one row to
    CSV_REPORT_PATH. Returns nothing.
    """

    print(colored('==============================', 'cyan'))
    print(colored('  SENTIMENT TRAINING', 'cyan'))
    print()
    print(colored('  DIM   : {0}'.format(args['dim']), 'cyan'))
    print(colored('  K     : {0}'.format(args['kcluster']), 'cyan'))
    print(colored('  TAG   : {0}'.format(args['tagdim']), 'cyan'))
    print(colored('==============================', 'cyan'))

    # STEP#1
    #------------------------------------
    # Vectorise the input topic (text only)
    mqx1 = rabbit.create('localhost', 'pantip-x1')
    topicHasher = texthasher.safe_load(TEXT_VECTORIZER_PATH,
                                       n_components=args['dim'],
                                       stop_words=stopwords,
                                       decomposition='SVD')
    hashMe = texthasher.hash(topicHasher, learn=True)

    print(colored('#STEP-1 started ...', 'cyan'))
    print('hasher : {0}'.format(topicHasher))
    iterX = DP.pipe(rabbit.iter(mqx1, take_x1),
                    dests=None,
                    transform=hashMe,
                    title='Vectorisation')

    # NOTE(review): the queue is closed before `iterX` is consumed below;
    # this is only safe if DP.pipe has already drained the iterator -- confirm.
    rabbit.end(mqx1)

    vecX = [x for x in iterX]

    # Cluster the vectorised records with unsupervised clf
    contentClf = textcluster.safe_load(CONTENT_CLUSTER_PATH,
                                       n_labels=args['kcluster'])
    clusterMe = textcluster.classify(contentClf, learn=True)

    # Classification doesn't accept a generator,
    # So we need to roll the matrix out of the MQ
    clusters = DP.pipe([x for x in vecX],
                       dests=None,
                       transform=clusterMe,
                       title='Clustering')

    print(colored('#STEP-1 finished ...', 'cyan'))

    # STEP#2
    # ---------------------------------------------
    # Vectorise tags

    # Convert tags into a numeric vector
    tagHasher = taghasher.safe_load(TAG_HASHER_PATH, n_feature=args['tagdim'])
    mqx2 = rabbit.create('localhost', 'pantip-x2')
    hashtagMe = taghasher.hash(tagHasher, learn=True)
    vectags = DP.pipe([tag for tag in rabbit.iter(mqx2, take_tags)],
                      dests=None,
                      transform=hashtagMe,
                      title='Tag Vectorising')

    rabbit.end(mqx2)

    # STEP#3
    #----------------------------------------
    # Join each of the component together
    # Assemble a training vector
    mqy = rabbit.create('localhost', 'pantip-x3')
    Y = [y for y in rabbit.iter(mqy, take_sentiment_score)]

    XS = zip(
        list(vectags),
        [[i] for i in clusters],  # Make scalar a single-element vector
        list(vecX))

    X = [list(a) + list(b) + list(c) for a, b, c in XS]

    rabbit.end(mqy)

    # Train!
    print(colored('Training process started...', 'cyan'))

    clf = cluster.safe_load(CLF_PATH)
    trainMe = cluster.analyze(clf, labels=Y)
    # The predictions are walked several times below; materialise them once
    # so a lazy result is not exhausted after the first pass.
    Y_ = list(trainMe(X))
    print(colored('[DONE]', 'yellow'))

    # Self-validation
    num_correct = len([1 for y, y0 in zip(Y_, Y) if y == y0])
    predict_rate = 100 * float(num_correct) / float(len(Y))
    print(colored('====== TRAINING LABELS =====', 'magenta'))
    print(Y)
    print(colored('========= PREDICTED ========', 'magenta'))
    print(Y_)
    print(colored('=========== RESULTS ========', 'magenta'))
    print('    overall accuracy:   {0:.2f} %'.format(predict_rate))

    # Report accuracy by each of the labels.
    # The classes are taken from the training labels (not the predictions):
    # every bucket then holds at least one sample, so the division below
    # cannot hit zero, and classes the model never predicted are still
    # reported.
    labels = list(set(Y))
    lbl_predict_rate = []
    for lbl in labels:
        samples = [(y, y0) for y, y0 in zip(Y_, Y) if y0 == lbl]
        num_correct = len([1 for y, y0 in samples if y == y0])
        num_all = len(samples)
        accuracy = 100 * float(num_correct) / float(num_all)

        print(
            '    accuracy class #{0} :    {1:.2f} % (out of {2} cases)'.format(
                lbl, accuracy, num_all))
        lbl_predict_rate.append('{0:.2f}'.format(accuracy).center(7))

    # Record the training accuracy to the CSV
    with open(CSV_REPORT_PATH, 'a') as report:
        report.write('{0},{1},{2},{3},{4}\n'.format(
            str(args['dim']).center(4),  #0
            str(args['kcluster']).center(3),  #1,
            str(args['tagdim']).center(5),  #2
            '{0:.2f}'.format(predict_rate).center(7),  #3
            ','.join(lbl_predict_rate)  #4
        ))

    #Save the trained models
    if save:
        print(colored('Saving models...', 'cyan'))
        texthasher.save(topicHasher, TEXT_VECTORIZER_PATH)
        textcluster.save(contentClf, CONTENT_CLUSTER_PATH)
        taghasher.save(tagHasher, TAG_HASHER_PATH)
        cluster.save(clf, CLF_PATH)
        print(colored('[DONE]', 'green'))
Ejemplo n.º 7
0
def train_sentiment_capture(stopwords, save=False):
    """Train the sentiment classifier and report held-out accuracy.

    Pipeline:
        1. topic text  -> numeric vectors (text hasher, learn=True)
        2. topic tags  -> numeric vectors (tag hasher, learn=True)
        3. X = [tag vector] + [text vector], Y = sentiment scores
        4. train via ``cluster.analyze`` with ``test_ratio=0.33`` and
           compare the returned actual/predicted labels

    Args:
        stopwords: forwarded to ``texthasher.safe_load`` as ``stop_words``.
        save: when true, the tag hasher, classifier and topic hasher are
            written back to their respective paths.

    Reads 'decom', 'n', 'feat', 'tagdim' and 'cluster' from the ``args``
    mapping of the enclosing scope, prints progress and results, and
    appends one row to CSV_REPORT_PATH. Returns nothing.
    """
    # ---- Banner ---------------------------------------------------------
    rule = '=============================='
    print(colored(rule, 'cyan'))
    print(colored('  SENTIMENT TRAINING', 'cyan'))
    print()
    decomposition_line = (
        '  DECOMPOSITION            : {0} => {1} components'.format(
            args['decom'], args['n']))
    print(colored(decomposition_line, 'cyan'))
    feature_line = '  DIMENSION OF FEATURE     : {0}'.format(args['feat'])
    print(colored(feature_line, 'cyan'))
    tag_line = '  MAX LENGTH OF TAG VECTOR : {0}'.format(args['tagdim'])
    print(colored(tag_line, 'cyan'))
    print(colored(rule, 'cyan'))

    # ---- Step 1: topic text => numeric vectors --------------------------
    topic_queue = rabbit.create('localhost', 'pantip-x1')
    topic_hasher = texthasher.safe_load(
        TEXT_VECTORIZER_PATH,
        stop_words=stopwords,
        decomposition=args['decom'],
        n_components=args['n'],
    )
    hash_topic = texthasher.hash(topic_hasher, learn=True)

    print(colored('#STEP-1 started ...', 'cyan'))
    print('hasher : {0}'.format(topic_hasher))
    topic_stream = DP.pipe(
        rabbit.iter(topic_queue, take_x1),
        dests=None,
        transform=hash_topic,
        title='Vectorisation',
    )

    rabbit.end(topic_queue)

    topic_vectors = list(topic_stream)

    print(colored('#STEP-1 finished ...', 'cyan'))

    # ---- Step 2: tags => numeric vectors --------------------------------
    tag_hasher = taghasher.safe_load(TAG_HASHER_PATH, n_feature=args['tagdim'])
    tag_queue = rabbit.create('localhost', 'pantip-x2')
    hash_tags = taghasher.hash(tag_hasher, learn=True)
    raw_tags = list(rabbit.iter(tag_queue, take_tags))
    tag_vectors = DP.pipe(
        raw_tags,
        dests=None,
        transform=hash_tags,
        title='Tag Vectorising',
    )

    rabbit.end(tag_queue)

    # ---- Step 3: X = [vectorised tags] + [vectorised text] ---------------
    score_queue = rabbit.create('localhost', 'pantip-x3')
    Y = list(rabbit.iter(score_queue, take_sentiment_score))

    X = []
    for tag_part, text_part in zip(list(tag_vectors), list(topic_vectors)):
        X.append(list(tag_part) + list(text_part))

    rabbit.end(score_queue)

    # ---- Train ----------------------------------------------------------
    print(colored('Training process started...', 'cyan'))
    clf = cluster.safe_load(CLF_PATH, args['cluster'], args['feat'])
    train = cluster.analyze(clf, labels=Y)
    actual, predicted = train(X, test_ratio=0.33)
    print(colored('[DONE]', 'yellow'))

    # ---- Cross validation: accuracy per label ---------------------------
    total_hits = 0
    per_label_cells = []
    for label in list(set(actual)):
        # (predicted, actual) pairs whose true class is this label
        bucket = []
        for guess, truth in zip(predicted, actual):
            if truth == label:
                bucket.append((guess, truth))

        hits = 0
        for guess, truth in bucket:
            if guess == truth:
                hits += 1

        bucket_size = len(bucket)
        accuracy = 100 * float(hits) / float(bucket_size)
        total_hits += hits

        message = '    accuracy class #{0} :    {1:.2f} % (out of {2} cases)'
        print(message.format(label, accuracy, bucket_size))
        per_label_cells.append('{0:.2f}'.format(accuracy).center(7))

    # ---- Overall performance --------------------------------------------
    predict_rate = 100 * float(total_hits) / float(len(actual))
    print(colored('=========== CV PERFORMANCE ========', 'magenta'))
    print('    overall accuracy:   {0:.2f} %'.format(predict_rate))

    # ---- Append one row to the CSV report -------------------------------
    with open(CSV_REPORT_PATH, 'a') as report:
        cells = [
            str(args['cluster']).center(11),
            str(args['decom']).center(7),
            str(args['n']).center(5),
            str(args['feat']).center(5),
            str(args['tagdim']).center(5),
            '{0:.2f}'.format(predict_rate).center(7),
            ','.join(per_label_cells),
        ]
        report.write(','.join(cells) + '\n')

    # ---- Persist the trained models -------------------------------------
    if not save:
        return

    print(colored('Saving models...', 'cyan'))
    taghasher.save(tag_hasher, TAG_HASHER_PATH)
    cluster.save(clf, CLF_PATH)
    texthasher.save(topic_hasher, TEXT_VECTORIZER_PATH)
    print(colored('[DONE]', 'green'))