コード例 #1
0
def train_sentiment_capture(stopwords, save=False):
    """
    Train the sentiment classifier from the records waiting in the MQs.

    STEP#1 :: Cluster topic with unsupervised classification

        X1 text ---> [@cluster] ----> (y1 group, X1 text)

    STEP#2 :: Combine topic, tags, and group to make feature vector

        X2 <--  [tags, y1, X1]
        Y2 <--  Sentiment score

    STEP#3 :: Train the classification

        (Y2,X2) -----> [@classification] ----> @model

    The run is configured by the module-level `args` mapping (keys
    'dim', 'kcluster' and 'tagdim'). Input is read from the queues
    'pantip-x1' (text), 'pantip-x2' (tags) and 'pantip-x3' (sentiment
    score) on localhost. After training, the predictions are compared
    with the training labels themselves, the accuracy is printed and
    one line is appended to CSV_REPORT_PATH.

    Args:
        stopwords: Forwarded to the text hasher as `stop_words`.
        save: When true, the text hasher, content cluster, tag hasher
            and classifier are written back to their model paths.
    """

    print(colored('==============================', 'cyan'))
    print(colored('  SENTIMENT TRAINING', 'cyan'))
    print()
    print(colored('  DIM   : {0}'.format(args['dim']), 'cyan'))
    print(colored('  K     : {0}'.format(args['kcluster']), 'cyan'))
    print(colored('  TAG   : {0}'.format(args['tagdim']), 'cyan'))
    print(colored('==============================', 'cyan'))

    # STEP#1
    #------------------------------------
    # Vectorise the input topic (text only)
    mqx1 = rabbit.create('localhost', 'pantip-x1')
    try:
        topicHasher = texthasher.safe_load(TEXT_VECTORIZER_PATH,
                                           n_components=args['dim'],
                                           stop_words=stopwords,
                                           decomposition='SVD')
        hashMe = texthasher.hash(topicHasher, learn=True)

        print(colored('#STEP-1 started ...', 'cyan'))
        print('hasher : {0}'.format(topicHasher))
        iterX = DP.pipe(rabbit.iter(mqx1, take_x1),
                        dests=None,
                        transform=hashMe,
                        title='Vectorisation')

        # Drain the pipe while the queue is still open: if the pipe is
        # lazy, closing the MQ first would cut the stream short.
        vecX = list(iterX)
    finally:
        rabbit.end(mqx1)

    # Cluster the vectorised records with unsupervised clf
    contentClf = textcluster.safe_load(CONTENT_CLUSTER_PATH,
                                       n_labels=args['kcluster'])
    clusterMe = textcluster.classify(contentClf, learn=True)

    # Classification doesn't accept a generator,
    # so it is handed a (copied) list of the vectors
    clusters = DP.pipe(list(vecX),
                       dests=None,
                       transform=clusterMe,
                       title='Clustering')

    print(colored('#STEP-1 finished ...', 'cyan'))

    # STEP#2
    # ---------------------------------------------
    # Vectorise tags

    # Convert tags into a numeric vector
    tagHasher = taghasher.safe_load(TAG_HASHER_PATH, n_feature=args['tagdim'])
    mqx2 = rabbit.create('localhost', 'pantip-x2')
    try:
        hashtagMe = taghasher.hash(tagHasher, learn=True)
        # The tags are rolled out of the MQ into a list up front, so the
        # queue can be released as soon as the pipe has been built.
        vectags = DP.pipe(list(rabbit.iter(mqx2, take_tags)),
                          dests=None,
                          transform=hashtagMe,
                          title='Tag Vectorising')
    finally:
        rabbit.end(mqx2)

    # STEP#3
    #----------------------------------------
    # Join each of the components together
    # Assemble a training vector
    mqy = rabbit.create('localhost', 'pantip-x3')
    try:
        Y = list(rabbit.iter(mqy, take_sentiment_score))

        XS = zip(
            list(vectags),
            [[i] for i in clusters],  # Make scalar a single-element vector
            list(vecX))

        # One flat feature row per record: tags + cluster id + text vector
        X = [list(a) + list(b) + list(c) for a, b, c in XS]
    finally:
        rabbit.end(mqy)

    # Train!
    print(colored('Training process started...', 'cyan'))

    clf = cluster.safe_load(CLF_PATH)
    trainMe = cluster.analyze(clf, labels=Y)
    Y_ = trainMe(X)
    print(colored('[DONE]', 'yellow'))

    # Self-validation
    predicted = list(Y_)
    pairs = list(zip(predicted, Y))  # (predicted, actual)
    num_correct = sum(1 for y, y0 in pairs if y == y0)
    num_labelled = len(Y)
    # Guard against an empty training set instead of dividing by zero
    predict_rate = (100 * float(num_correct) / float(num_labelled)
                    if num_labelled else 0.0)
    print(colored('====== TRAINING LABELS =====', 'magenta'))
    print(Y)
    print(colored('========= PREDICTED ========', 'magenta'))
    print(predicted)
    print(colored('=========== RESULTS ========', 'magenta'))
    print('    overall accuracy:   {0:.2f} %'.format(predict_rate))

    # Report accuracy by each of the labels.
    # The classes are taken from the actual labels, because the samples
    # below are grouped by actual label: a class that was only ever
    # predicted has no samples at all.
    labels = list(set(Y))
    lbl_predict_rate = []
    for lbl in labels:
        samples = [(y, y0) for y, y0 in pairs if y0 == lbl]
        num_correct = sum(1 for y, y0 in samples if y == y0)
        num_all = len(samples)
        accuracy = (100 * float(num_correct) / float(num_all)
                    if num_all else 0.0)

        print(
            '    accuracy class #{0} :    {1:.2f} % (out of {2} cases)'.format(
                lbl, accuracy, num_all))
        lbl_predict_rate.append('{0:.2f}'.format(accuracy).center(7))

    # Record the training accuracy to the CSV
    with open(CSV_REPORT_PATH, 'a') as report:
        report.write('{0},{1},{2},{3},{4}\n'.format(
            str(args['dim']).center(4),  #0
            str(args['kcluster']).center(3),  #1,
            str(args['tagdim']).center(5),  #2
            '{0:.2f}'.format(predict_rate).center(7),  #3
            ','.join(lbl_predict_rate)  #4
        ))

    # Save the trained models
    if save:
        print(colored('Saving models...', 'cyan'))
        texthasher.save(topicHasher, TEXT_VECTORIZER_PATH)
        textcluster.save(contentClf, CONTENT_CLUSTER_PATH)
        taghasher.save(tagHasher, TAG_HASHER_PATH)
        cluster.save(clf, CLF_PATH)
        print(colored('[DONE]', 'green'))
コード例 #2
0
ファイル: textprocess.py プロジェクト: kaorism/pantip-libr
def train_sentiment_capture(stopwords, save=False):
    """
    Train the sentiment classifier from the records waiting in the MQs.

    STEP#1 :: Cluster topic with unsupervised classification

        X1 text ---> [@cluster] ----> (y1 group, X1 text)

    STEP#2 :: Combine topic, tags, and group to make feature vector

        X2 <--  [tags, y1, X1]
        Y2 <--  Sentiment score

    STEP#3 :: Train the classification

        (Y2,X2) -----> [@classification] ----> @model

    The run is configured by the module-level `args` mapping (keys
    'dim', 'kcluster' and 'tagdim'). Input is read from the queues
    'pantip-x1' (text), 'pantip-x2' (tags) and 'pantip-x3' (sentiment
    score) on localhost. After training, the predictions are compared
    with the training labels themselves, the accuracy is printed and
    one line is appended to CSV_REPORT_PATH.

    Args:
        stopwords: Forwarded to the text hasher as `stop_words`.
        save: When true, the text hasher, content cluster, tag hasher
            and classifier are written back to their model paths.
    """

    print(colored('==============================', 'cyan'))
    print(colored('  SENTIMENT TRAINING', 'cyan'))
    print()
    print(colored('  DIM   : {0}'.format(args['dim']), 'cyan'))
    print(colored('  K     : {0}'.format(args['kcluster']), 'cyan'))
    print(colored('  TAG   : {0}'.format(args['tagdim']), 'cyan'))
    print(colored('==============================', 'cyan'))

    # STEP#1
    #------------------------------------
    # Vectorise the input topic (text only)
    mqx1 = rabbit.create('localhost', 'pantip-x1')
    try:
        topicHasher = texthasher.safe_load(
            TEXT_VECTORIZER_PATH,
            n_components=args['dim'],
            stop_words=stopwords,
            decomposition='SVD'
        )
        hashMe = texthasher.hash(topicHasher, learn=True)

        print(colored('#STEP-1 started ...', 'cyan'))
        print('hasher : {0}'.format(topicHasher))
        iterX = DP.pipe(
            rabbit.iter(mqx1, take_x1),
            dests=None,
            transform=hashMe,
            title='Vectorisation'
        )

        # Drain the pipe while the queue is still open: if the pipe is
        # lazy, closing the MQ first would cut the stream short.
        vecX = list(iterX)
    finally:
        rabbit.end(mqx1)

    # Cluster the vectorised records with unsupervised clf
    contentClf = textcluster.safe_load(
        CONTENT_CLUSTER_PATH,
        n_labels=args['kcluster']
    )
    clusterMe = textcluster.classify(contentClf, learn=True)

    # Classification doesn't accept a generator,
    # so it is handed a (copied) list of the vectors
    clusters = DP.pipe(
        list(vecX),
        dests=None,
        transform=clusterMe,
        title='Clustering'
    )

    print(colored('#STEP-1 finished ...', 'cyan'))

    # STEP#2
    # ---------------------------------------------
    # Vectorise tags

    # Convert tags into a numeric vector
    tagHasher = taghasher.safe_load(
        TAG_HASHER_PATH,
        n_feature=args['tagdim']
    )
    mqx2 = rabbit.create('localhost', 'pantip-x2')
    try:
        hashtagMe = taghasher.hash(tagHasher, learn=True)
        # The tags are rolled out of the MQ into a list up front, so the
        # queue can be released as soon as the pipe has been built.
        vectags = DP.pipe(
            list(rabbit.iter(mqx2, take_tags)),
            dests=None,
            transform=hashtagMe,
            title='Tag Vectorising'
        )
    finally:
        rabbit.end(mqx2)

    # STEP#3
    #----------------------------------------
    # Join each of the components together
    # Assemble a training vector
    mqy = rabbit.create('localhost', 'pantip-x3')
    try:
        Y = list(rabbit.iter(mqy, take_sentiment_score))

        XS = zip(
            list(vectags),
            [[i] for i in clusters],  # Make scalar a single-element vector
            list(vecX)
        )

        # One flat feature row per record: tags + cluster id + text vector
        X = [list(a) + list(b) + list(c) for a, b, c in XS]
    finally:
        rabbit.end(mqy)

    # Train!
    print(colored('Training process started...', 'cyan'))

    clf = cluster.safe_load(CLF_PATH)
    trainMe = cluster.analyze(clf, labels=Y)
    Y_ = trainMe(X)
    print(colored('[DONE]', 'yellow'))

    # Self-validation
    predicted = list(Y_)
    pairs = list(zip(predicted, Y))  # (predicted, actual)
    num_correct = sum(1 for y, y0 in pairs if y == y0)
    num_labelled = len(Y)
    # Guard against an empty training set instead of dividing by zero
    predict_rate = (100 * float(num_correct) / float(num_labelled)
                    if num_labelled else 0.0)
    print(colored('====== TRAINING LABELS =====', 'magenta'))
    print(Y)
    print(colored('========= PREDICTED ========', 'magenta'))
    print(predicted)
    print(colored('=========== RESULTS ========', 'magenta'))
    print('    overall accuracy:   {0:.2f} %'.format(predict_rate))

    # Report accuracy by each of the labels.
    # The classes are taken from the actual labels, because the samples
    # below are grouped by actual label: a class that was only ever
    # predicted has no samples at all.
    labels = list(set(Y))
    lbl_predict_rate = []
    for lbl in labels:
        samples = [(y, y0) for y, y0 in pairs if y0 == lbl]
        num_correct = sum(1 for y, y0 in samples if y == y0)
        num_all = len(samples)
        accuracy = (100 * float(num_correct) / float(num_all)
                    if num_all else 0.0)

        print('    accuracy class #{0} :    {1:.2f} % (out of {2} cases)'.format(lbl, accuracy, num_all))
        lbl_predict_rate.append('{0:.2f}'.format(accuracy).center(7))

    # Record the training accuracy to the CSV
    with open(CSV_REPORT_PATH, 'a') as report:
        report.write('{0},{1},{2},{3},{4}\n'.format(
            str(args['dim']).center(4),  #0
            str(args['kcluster']).center(3),  #1,
            str(args['tagdim']).center(5),  #2
            '{0:.2f}'.format(predict_rate).center(7),  #3
            ','.join(lbl_predict_rate)  #4
        ))

    # Save the trained models
    if save:
        print(colored('Saving models...', 'cyan'))
        texthasher.save(topicHasher, TEXT_VECTORIZER_PATH)
        textcluster.save(contentClf, CONTENT_CLUSTER_PATH)
        taghasher.save(tagHasher, TAG_HASHER_PATH)
        cluster.save(clf, CLF_PATH)
        print(colored('[DONE]', 'green'))
コード例 #3
0
ファイル: requeue.py プロジェクト: kaorism/pantip-libr
"""
Source MQ requeue task
@starcolon projects
"""

from pypipe import pipe as Pipe
from pypipe.operations import rabbit
import json

if __name__ == '__main__':
    # Recycle the input queues: each message read from 'pantip-x0' is fed
    # to the three working queues plus the temporary queue 'pantip-x00',
    # and the temporary queue is then fed back into 'pantip-x0'.
    qsrc = rabbit.create('localhost', 'pantip-x0')
    qdst = [
        rabbit.create('localhost', q)
        for q in ['pantip-x1', 'pantip-x2', 'pantip-x3', 'pantip-x00']
    ]

    try:
        # Requeue!
        print('Requeuing ...')
        fan_out = rabbit.feed(qdst)  # built once instead of once per message
        for m in rabbit.iter(qsrc):
            fan_out(m)
    finally:
        # Bye all queues! (released even when requeuing fails)
        rabbit.end_multiple(qdst)
        rabbit.end(qsrc)

    # Transfer from temp MQ#00 to MQ#0
    q00 = rabbit.create('localhost', 'pantip-x00')
    q0 = rabbit.create('localhost', 'pantip-x0')
    try:
        refill = rabbit.feed([q0])
        for m in rabbit.iter(q00):
            refill(m)
    finally:
        # Bye all queues!
        rabbit.end_multiple([q0, q00])

    print('[DONE] All input queues are recycled.')
コード例 #4
0
ファイル: process.py プロジェクト: tao-pr/pantip-libr
    # Pause for one second before connecting to the MQs.
    # NOTE(review): presumably gives the background services time to
    # come up -- confirm against the code above this fragment.
    time.sleep(1)

    # These are MQs we'll push preprocessed records to
    qs = ['pantip-x1', 'pantip-x2', 'pantip-x3', 'pantip-x0']
    mqs = [rabbit.create('localhost', q) for q in qs]

    # Prepare the processing pipeline (order matters):
    # take the record, feed it to every MQ above, then feed the word bag.
    pipe = Pipe.new('preprocess', [])
    Pipe.push(pipe, preprocess.take)
    Pipe.push(pipe, rabbit.feed(mqs))
    Pipe.push(pipe, wordbag.feed(bag))
    Pipe.then(pipe, lambda out: print(colored('[DONE!]', 'cyan')))

    # Iterate through the records (at most 40000) and process each one
    couch.each_do(db, process_with(pipe), limit=40000)

    # Disconnect from the MQs
    # NOTE(review): the list comprehension is used only for its side
    # effect; the resulting list is discarded.
    [rabbit.end(mq) for mq in mqs]

    # Wait for the background services
    # and kill them
    terminate_background_services(workers)

    # Report the collected word bag:
    # keep the first 50 items when ordered by descending second element
    # (presumably the word count -- verify against wordbag.feed).
    print(colored('[Word bag]', 'green'))
    words = sorted(bag.items(), key=lambda b: -b[1])[:50]
    pprint(words)
    # Write those words to file, one per line ('w+' truncates the file)
    with open(WORD_BAG_DIR, 'w+') as txt:
        txt.writelines([w[0] + "\n" for w in words])
コード例 #5
0
ファイル: process.py プロジェクト: starcolon/pantip-libr
  # These are MQs we'll push preprocessed records to
  qs = ['pantip-x1','pantip-x2','pantip-x3','pantip-x0']
  mqs = [rabbit.create('localhost',q) for q in qs]

  # Prepare the processing pipeline (order matters):
  # take the record, feed it to every MQ above, then feed the word bag.
  pipe = Pipe.new('preprocess',[])
  Pipe.push(pipe,preprocess.take)
  Pipe.push(pipe,rabbit.feed(mqs))
  Pipe.push(pipe,wordbag.feed(bag))
  Pipe.then(pipe,lambda out: print(colored('[DONE!]','cyan')))

  # Iterate through the records (at most 40000) and process each one
  couch.each_do(db,process_with(pipe),limit=40000)

  # Disconnect from the MQs
  # NOTE(review): the list comprehension is used only for its side
  # effect; the resulting list is discarded.
  [rabbit.end(mq) for mq in mqs]

  # Wait for the background services
  # and kill them
  terminate_background_services(workers)

  # Report the collected word bag:
  # keep the first 50 items when ordered by descending second element
  # (presumably the word count -- verify against wordbag.feed).
  print(colored('[Word bag]','green'))
  words = sorted(bag.items(),key=lambda b: -b[1])[:50]
  pprint(words)
  # Write those words to file, one per line ('w+' truncates the file)
  with open(WORD_BAG_DIR,'w+') as txt:
    txt.writelines([w[0] + "\n" for w in words])

コード例 #6
0
def train_sentiment_capture(stopwords, save=False):
    """
    Train the sentiment classifier and report its accuracy.

    STEP#1 :: Vectorise the topic text read from 'pantip-x1'
    STEP#2 :: Vectorise the tags read from 'pantip-x2'
    STEP#3 :: Join [tag vector] + [text vector] per record and train
              against the sentiment scores read from 'pantip-x3'

    The run is configured by the module-level `args` mapping (keys
    'decom', 'n', 'feat', 'tagdim' and 'cluster'). The trainer is called
    with `test_ratio=0.33` and returns (actual, predicted) labels; the
    per-label and overall accuracy over those pairs are printed and one
    line is appended to CSV_REPORT_PATH.

    Args:
        stopwords: Forwarded to the text hasher as `stop_words`.
        save: When true, the tag hasher, classifier and text hasher are
            written back to their model paths.
    """

    print(colored('==============================', 'cyan'))
    print(colored('  SENTIMENT TRAINING', 'cyan'))
    print()
    print(
        colored(
            '  DECOMPOSITION            : {0} => {1} components'.format(
                args['decom'], args['n']), 'cyan'))
    print(
        colored('  DIMENSION OF FEATURE     : {0}'.format(args['feat']),
                'cyan'))
    print(
        colored('  MAX LENGTH OF TAG VECTOR : {0}'.format(args['tagdim']),
                'cyan'))
    print(colored('==============================', 'cyan'))

    # STEP#1 : [text] => [numeric vectors]
    #------------------------------------
    # Vectorise the input topic (text only)
    mqx1 = rabbit.create('localhost', 'pantip-x1')
    try:
        topicHasher = texthasher.safe_load(TEXT_VECTORIZER_PATH,
                                           stop_words=stopwords,
                                           decomposition=args['decom'],
                                           n_components=args['n'])
        hashMe = texthasher.hash(topicHasher, learn=True)

        print(colored('#STEP-1 started ...', 'cyan'))
        print('hasher : {0}'.format(topicHasher))
        iterX = DP.pipe(rabbit.iter(mqx1, take_x1),
                        dests=None,
                        transform=hashMe,
                        title='Vectorisation')

        # Drain the pipe while the queue is still open: if the pipe is
        # lazy, closing the MQ first would cut the stream short.
        vecX = list(iterX)
    finally:
        rabbit.end(mqx1)

    print(colored('#STEP-1 finished ...', 'cyan'))

    # STEP#2 : [tags] => [numeric vectors]
    # ---------------------------------------------
    # Vectorise tags

    # Convert tags into a numeric vector
    tagHasher = taghasher.safe_load(TAG_HASHER_PATH, n_feature=args['tagdim'])
    mqx2 = rabbit.create('localhost', 'pantip-x2')
    try:
        hashtagMe = taghasher.hash(tagHasher, learn=True)
        # The tags are rolled out of the MQ into a list up front, so the
        # queue can be released as soon as the pipe has been built.
        vectags = DP.pipe(list(rabbit.iter(mqx2, take_tags)),
                          dests=None,
                          transform=hashtagMe,
                          title='Tag Vectorising')
    finally:
        rabbit.end(mqx2)

    # STEP#3 : [X] = [vectorised text] : [vectorised tags]
    #----------------------------------------
    # Join each of the components together
    # Assemble a training vector
    mqy = rabbit.create('localhost', 'pantip-x3')
    try:
        Y = list(rabbit.iter(mqy, take_sentiment_score))

        XS = zip(list(vectags), list(vecX))

        # One flat feature row per record: tag vector + text vector
        X = [list(a) + list(b) for a, b in XS]
    finally:
        rabbit.end(mqy)

    # Train!
    print(colored('Training process started...', 'cyan'))
    clf = cluster.safe_load(CLF_PATH, args['cluster'], args['feat'])
    trainMe = cluster.analyze(clf, labels=Y)
    (Yact, Ypred) = trainMe(X, test_ratio=0.33)
    print(colored('[DONE]', 'yellow'))

    # Cross validation
    num_correct_all = 0
    pairs = list(zip(Ypred, Yact))  # (predicted, actual)

    # Report accuracy by each of the labels
    labels = list(set(Yact))
    lbl_predict_rate = []
    for lbl in labels:
        samples = [(y, y0) for y, y0 in pairs if y0 == lbl]
        num_correct = sum(1 for y, y0 in samples if y == y0)
        num_all = len(samples)
        # num_all can only be zero when Ypred is shorter than Yact
        accuracy = (100 * float(num_correct) / float(num_all)
                    if num_all else 0.0)
        num_correct_all += num_correct

        print(
            '    accuracy class #{0} :    {1:.2f} % (out of {2} cases)'.format(
                lbl, accuracy, num_all))
        lbl_predict_rate.append('{0:.2f}'.format(accuracy).center(7))

    # Report overall performance
    # (len() rather than truthiness: Yact may be an array-like)
    num_validated = len(Yact)
    predict_rate = (100 * float(num_correct_all) / float(num_validated)
                    if num_validated else 0.0)
    print(colored('=========== CV PERFORMANCE ========', 'magenta'))
    print('    overall accuracy:   {0:.2f} %'.format(predict_rate))

    # Record the training accuracy to the CSV
    with open(CSV_REPORT_PATH, 'a') as report:
        report.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
            str(args['cluster']).center(11),  #0
            str(args['decom']).center(7),  #1
            str(args['n']).center(5),  #2
            str(args['feat']).center(5),  #3
            str(args['tagdim']).center(5),  #4
            '{0:.2f}'.format(predict_rate).center(7),  #5
            ','.join(lbl_predict_rate)  #6
        ))

    # Save the trained models
    if save:
        print(colored('Saving models...', 'cyan'))
        taghasher.save(tagHasher, TAG_HASHER_PATH)
        cluster.save(clf, CLF_PATH)
        texthasher.save(topicHasher, TEXT_VECTORIZER_PATH)
        print(colored('[DONE]', 'green'))
コード例 #7
0
ファイル: requeue.py プロジェクト: kaorism/pantip-libr
from pypipe import pipe as Pipe
from pypipe.operations import rabbit
import json

if __name__ == '__main__':
    # Recycle the input queues: each message read from 'pantip-x0' is fed
    # to the three working queues plus the temporary queue 'pantip-x00',
    # and the temporary queue is then fed back into 'pantip-x0'.
    qsrc = rabbit.create('localhost', 'pantip-x0')
    qdst = [
        rabbit.create('localhost', q)
        for q in ['pantip-x1', 'pantip-x2', 'pantip-x3', 'pantip-x00']
    ]

    try:
        # Requeue!
        print('Requeuing ...')
        fan_out = rabbit.feed(qdst)  # built once instead of once per message
        for m in rabbit.iter(qsrc):
            fan_out(m)
    finally:
        # Bye all queues! (released even when requeuing fails)
        rabbit.end_multiple(qdst)
        rabbit.end(qsrc)

    # Transfer from temp MQ#00 to MQ#0
    q00 = rabbit.create('localhost', 'pantip-x00')
    q0 = rabbit.create('localhost', 'pantip-x0')
    try:
        refill = rabbit.feed([q0])
        for m in rabbit.iter(q00):
            refill(m)
    finally:
        # Bye all queues!
        rabbit.end_multiple([q0, q00])

    print('[DONE] All input queues are recycled.')