def normal_test(data, type):
    """Train a fixed set of classifiers on the first half of ``data`` and print their scores.

    The first ``len(data) // 2`` items are used for training; the next
    ``len(data) // 2`` items are used for testing (with an odd number of
    items the last one is left unused, as before).

    Args:
        data: Sliceable sequence of training/test examples accepted by the
            classifiers' ``train=`` argument and ``test()`` method.
        type: Label (a string) used only in the progress messages.

    Returns:
        None. Results are written to standard output.
    """
    print('----------------------------------------------------')
    print('TEST FUNCTION STARTED FOR ' + type + '!')
    total_data_size = len(data)
    # Explicit floor division: int(round(n / 2)) only behaved like this under
    # Python 2 integer division; with true division it rounds half-to-even and
    # the reported test size can exceed the slice that is actually available.
    training_size = total_data_size // 2
    test_size = training_size
    print('Total Size: ' + str(total_data_size))
    print('Training Size: ' + str(training_size))
    print('Test Size: ' + str(test_size))
    print('Training Started for ' + type + '!')

    # Slice once instead of once per classifier; the test slice is also built
    # here so that the timings below cover classification only.
    training_data = data[:training_size]
    test_data = data[training_size:training_size + test_size]

    classification_methods = {
        # comment out the classification algorithms you do not want to test
        'NB': NB(train=training_data, baseline=MAJORITY, method=MULTINOMIAL),
        'KNN2': KNN(train=training_data, baseline=MAJORITY, k=2, distance=COSINE),
        'KNN3': KNN(train=training_data, baseline=MAJORITY, k=3, distance=COSINE),
        'KNN4': KNN(train=training_data, baseline=MAJORITY, k=4, distance=COSINE),
        'KNN5': KNN(train=training_data, baseline=MAJORITY, k=5, distance=COSINE),
        'KNN6': KNN(train=training_data, baseline=MAJORITY, k=6, distance=COSINE),
        'KNN7': KNN(train=training_data, baseline=MAJORITY, k=7, distance=COSINE),
        'KNN8': KNN(train=training_data, baseline=MAJORITY, k=8, distance=COSINE),
        'KNN9': KNN(train=training_data, baseline=MAJORITY, k=9, distance=COSINE),
        'KNN10': KNN(train=training_data, baseline=MAJORITY, k=10, distance=COSINE),
        'SLP1': SLP(train=training_data, baseline=MAJORITY, iterations=1),
        'SLP2': SLP(train=training_data, baseline=MAJORITY, iterations=2),
        'SLP3': SLP(train=training_data, baseline=MAJORITY, iterations=3),
        'SVM': SVM(train=training_data, type=CLASSIFICATION, kernel=POLYNOMIAL),
    }

    print('Normal Testing Started!')
    for classification, classifier in classification_methods.items():
        # measure the time it takes to classify the test half
        start = timeit.default_timer()
        accuracy, precision, recall, f1 = classifier.test(test_data)
        stop = timeit.default_timer()
        print('*' + classification + '*')
        print('Accuracy: ' + str(accuracy))
        print('Precision: ' + str(precision))
        print('Recall: ' + str(recall))
        print('F1-score: ' + str(f1))
        print('Time: ' + str(stop - start))
        # blank separator line between classifiers
        # NOTE(review): assumed to belong to the loop body — confirm against the original layout
        print('')
def report(model_file, text_file):
    """Print a text file and then the sentences in it that the model labels 'yes'.

    Args:
        model_file: Path passed to ``KNN.load`` to restore the trained model.
        text_file: Path of the text to analyse; blank lines are ignored.

    Returns:
        None. Output is written to standard output.
    """
    pl = Pipeline()
    model = KNN.load(model_file)
    nlp = en_core_web_sm.load()

    # Read inside a context manager so the handle is closed even if parsing
    # fails later (the original left the file open).
    with open(text_file) as handle:
        text = [line for line in handle if line.strip()]

    docs = [nlp(unidecode(line).strip()) for line in text]
    sentences = [sent.text for doc in docs for sent in doc.sents]

    # Lines keep their trailing newline, so the echo is separated by blank
    # lines exactly as before.
    print('\n'.join(text))
    predictions = pl.predict(model, sentences, print_pred=False)
    print('\n \n ############### Soft Skills ############\n')
    print(
        *[sent for sent, pred in zip(sentences, predictions) if 'yes' == pred],
        sep='\n')
def train(cls, train_file, model_file):
    """Fit a KNN model on the records of ``train_file`` and save it to ``model_file``.

    Each line produced by ``SoftSkills.load(train_file)`` is parsed as JSON;
    its 'text' is turned into a word-count vector and trained under the
    label stored in 'soft skill'. Records with an empty vector are skipped.
    ``cls`` is not used by the body.

    Returns:
        The trained KNN instance (after it has been saved).
    """
    raw_lines = SoftSkills.load(train_file)
    classifier = KNN()
    for raw_line in raw_lines:
        record = json.loads(raw_line)
        content = record['text']
        words = [token for token, _pos in tag(content)]
        vector = count(words)  # e.g. {'sweet': 1}
        if not vector:
            continue
        classifier.train(vector, type=record['soft skill'])
    classifier.save(model_file)
    return classifier
# Measure how well the model works as a classifier, before and after LSA.
# Documents may carry a label (type/class); in the movie reviews corpus a
# review is positive (score > 0) or negative (score < 0). A classifier uses
# the model as "training" data to predict the label of unlabeled documents,
# here: whether a new movie review is positive or negative.
# The details matter less than the numbers: accuracy should stay the same
# after the reduction while the running time should go down.
for with_lsa in (False, True):
    if with_lsa:
        # Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
        print("LSA reduction...")
        print()
        m.reduce(10)
    t = time.time()
    score = KNN.test(m, folds=10)[-1]
    print("accuracy:", score)
    print("time:", time.time() - t)
    print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
# NOTE(review): excerpt from inside a tweet loop whose header is not visible here. It labels a tweet 'WIN' when its lowercased text contains '#win' (else 'FAIL'), appends its adjectives as a Document to m, then trains a KNN classifier on m and classifies two sample words.
s = tweet.text.lower() # tweet in lowercase p = '#win' in s and 'WIN' or 'FAIL' # document labels s = Sentence(parse(s)) # parse tree with part-of-speech tags s = search('JJ', s) # adjectives in the tweet s = [match[0].string for match in s] # adjectives as a list of strings s = " ".join(s) # adjectives as string if len(s) > 0: m.append(Document(s, type=p, stemmer=None)) # Train k-Nearest Neighbor on the model. # Note that this is only a simple example: to build a robust classifier # you would need a lot more training data (e.g., tens of thousands of tweets). # The more training data, the more statistically reliable the classifier becomes. # The only way to really know if your classifier is working correctly # is to test it with testing data, see the documentation for Classifier.test(). classifier = KNN(baseline=None) # By default, baseline=MAJORITY for document in m: # (classify unknown documents with the most frequent type). classifier.train(document) # These are the adjectives the classifier has learned: print sorted(classifier.features) print # We can now ask it to classify documents containing these words. # Note that you may get different results than the ones below, # since you will be mining other (more recent) tweets. # Again, a robust classifier needs lots and lots of training data. # If None is returned, the word was not recognized, # and the classifier returned the default value (see above). print classifier.classify('sweet') # yields 'WIN' print classifier.classify('stupid') # yields 'FAIL'
# -*- coding: utf-8 -*- """ Module with methods used for retreiving tweets, containing certain search words, from Twitter. """ from pattern.web import Twitter from pattern.vector import KNN import urllib2 twitter = Twitter(license=None, throttle=0.5) knn = KNN() def getSearchWords(): """Function that returns list of search words""" # Test data, we will use synonyms for war (found in nltk word net) # words = ['war', 'conflict', 'jihad', ] words = [] conflict = ['feud', 'vendetta', 'class struggle', 'strife', 'countercurrent', 'discord', 'trench warfare', 'fight', 'hassle', 'beating', 'in fighting', 'single combat', 'gunfight',
def _learner(self):
    """Return a newly constructed KNN classifier built with default arguments."""
    learner = KNN()
    return learner
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 19:37:34 2019

@author: alternatif
"""
from pattern.web import Twitter
from pattern.en import tag
from pattern.vector import KNN, count

twitter, knn = Twitter(), KNN()

# Collect two pages of up to 100 tweets each and train on their adjectives.
for page in range(1, 3):
    for tweet in twitter.search('#win OR #fail', start=page, count=100):
        lowered = tweet.text.lower()
        # A tweet mentioning #win is a 'WIN' example, anything else a 'FAIL'.
        label = 'WIN' if '#win' in lowered else 'FAIL'
        # Keep adjectives only (JJ = adjective).
        adjectives = [word for word, pos in tag(lowered) if pos == 'JJ']
        vector = count(adjectives)  # {'sweet': 1}
        if not vector:
            continue
        knn.train(vector, type=label)

print(knn.classify('sweet potato burger'))
print(knn.classify('stupid autocorrect'))
from pattern.web import Twitter from pattern.text.en import tag from pattern.vector import KNN, count, NaiveBayes, SVM import os, random import file_io as fio corp_dir = 'essays/original' twitter, knn, nbayes, svm = Twitter(), KNN(), NaiveBayes(), SVM() from nltk.corpus import stopwords import lsa cachedStopWords = stopwords.words("english") testSet = [] def naive(): trainingSet = [] l = lsa.getMod() dirs = [x[0] for x in os.walk(os.path.abspath(corp_dir))] for dir in dirs: label = 0 if 'low' in dir: label = -1 elif 'high' in dir: label = 1 tfiles = [] tfiles = fio.getTopLevelFiles(dir, extension='txt') train_smpl = [] if len(tfiles) > 0: train_smpl = [ tfiles[i] for i in random.sample(xrange(len(tfiles)), 13) ] for file in tfiles:
# NOTE(review): excerpt from inside a tweet loop whose header is not visible here. It labels a tweet "WIN" when its lowercased text contains "#win" (else "FAIL"), appends its adjectives as a Document to m, then trains a KNN classifier on m and classifies two sample phrases.
s = tweet.text.lower() # tweet in lowercase p = "#win" in s and "WIN" or "FAIL" # document labels s = Sentence(parse(s)) # parse tree with part-of-speech tags s = search("JJ", s) # adjectives in the tweet s = [match[0].string for match in s] # adjectives as a list of strings s = " ".join(s) # adjectives as string if len(s) > 0: m.append(Document(s, type=p, stemmer=None)) # Train k-Nearest Neighbor on the model. # Note that this is only a simple example: to build a robust classifier # you would need a lot more training data (e.g., tens of thousands of tweets). # The more training data, the more statistically reliable the classifier becomes. # The only way to really know if your classifier is working correctly # is to test it with testing data, see the documentation for Classifier.test(). classifier = KNN(baseline=None) # By default, baseline=MAJORITY for document in m: # (classify unknown documents with the most frequent type). classifier.train(document) # These are the adjectives the classifier has learned: print sorted(classifier.features) print # We can now ask it to classify documents containing these words. # Note that you may get different results than the ones below, # since you will be mining other (more recent) tweets. # Again, a robust classifier needs lots and lots of training data. # If None is returned, the word was not recognized, # and the classifier returned the default value (see above). print classifier.classify("sweet potato burger") # yields 'WIN' print classifier.classify("stupid autocorrect") # yields 'FAIL'
# setup(): rebuilds the module-level globals `pages`, `urlalias`, `revurlalias` and `knn` from two databases.
#   1. Opens a MySQL connection and fills urlalias (alias -> source) and revurlalias (source -> alias) from table url_alias.
#   2. For each taxonomy_term_data row (tid, name, description, vid): stores name in pages under 'taxonomy/term/<tid>' and,
#      when an alias exists, under the alias as well. Rows with vid == 3 train knn on the description (HTML stripped with
#      BeautifulSoup, newline runs collapsed to a space, lowercased, PORTER stemmer) and on the lowercased name.
#   3. The joined practice-area query trains knn on each node body (cleaned the same way) and on its lowercased title.
#   4. Nodes with status=1 are added to pages under 'node/<nid>' and under their alias when one exists.
#   5. Using the module-level `conn` (not created here; presumably a second database connection — confirm), trains knn on
#      each website_queries query for every target obtained by splitting `target` on newlines, commas and semicolons.
# NOTE(review): the statements are flattened onto one line in this view, so which statements sit under
#   `if url in revurlalias:` and `if row[3] == 3:` cannot be confirmed here — check the original layout before editing.
# NOTE(review): the database host and credentials are hard-coded in the connect() call, and `db`/`pgcur` are not
#   closed if a query raises; BeautifulSoup is called without an explicit parser argument.
def setup(): global pages global urlalias global revurlalias global knn pages = dict() urlalias = dict() revurlalias = dict() knn = KNN() db = MySQLdb.connect(host="192.168.200.26", user="******", passwd="xxxsecretxxx", db="pla") cur = db.cursor() cur.execute("select source, alias from url_alias") for row in cur.fetchall(): urlalias[row[1]] = row[0] revurlalias[row[0]] = row[1] cur.execute("select tid, name, description, vid from taxonomy_term_data;") for row in cur.fetchall(): url = 'taxonomy/term/' + str(row[0]) pages[url] = row[1] if url in revurlalias: pages[revurlalias[url]] = row[1] url = revurlalias[url] if row[3] == 3: soup = bs4.BeautifulSoup(row[2]) the_text = re.sub(r'[\n\r]+', r' ', soup.get_text(' ')).lower() knn.train(Document(the_text, stemmer=PORTER), url) knn.train(Document(row[1].lower()), url) cur.execute( "select a.tid, c.body_value, d.title from taxonomy_term_data as a inner join field_data_field_practice_areas as b on (a.tid=b.field_practice_areas_tid and b.entity_type='node' and b.bundle != 'professionals' and b.deleted=0) inner join field_data_body as c on (b.entity_id=c.entity_id and b.entity_type=c.entity_type) inner join node as d on (c.entity_id=d.nid);" ) for row in cur.fetchall(): url = 'taxonomy/term/' + str(row[0]) if url in revurlalias: url = revurlalias[url] soup = bs4.BeautifulSoup(row[1]) the_text = re.sub(r'[\n\r]+', r' ', soup.get_text(' ')).lower() knn.train(Document(the_text, stemmer=PORTER), url) knn.train(Document(row[2].lower()), url) cur.execute("select nid, title from node where status=1;") for row in cur.fetchall(): url = 'node/' + str(row[0]) pages[url] = row[1] if url in revurlalias: pages[revurlalias[url]] = row[1] db.close() pgcur = conn.cursor() pgcur.execute( "select query, target from website_queries where target is not null group by query, target" ) for row in pgcur.fetchall(): words = re.split(r'[\n\r,;]+ *', row[1]) for word in words: print("training on " + row[0].lower() + " for " + word) 
knn.train(Document(row[0].lower()), word) conn.commit() pgcur.close()
print "number of documents:", len(corpus) print "number of words:", len(corpus.vector) print "number of words (average):", sum( len(d.terms) for d in corpus.documents) / float(len(corpus)) print # This may be too much words for some clustering algorithms (e.g., hierarchical). # We'll reduce the documents to vectors of 4 concepts. # First, let's test how the corpus would perform as a classifier. # The details of KNN are not that important right now, just observe the numbers. # Naturally, we want accuracy to stay the same after LSA reduction, # and hopefully decrease the time needed to run. t = time.time() print "accuracy:", KNN.test(corpus, folds=10)[-1] print "time:", time.time() - t print # Reduce the documents to vectors of 4 concepts (= 1/7 of 30 words). print "LSA reduction..." print corpus.reduce(4) t = time.time() print "accuracy:", KNN.test(corpus, folds=10)[-1] print "time:", time.time() - t print # Not bad, accuracy is about the same but performance is 3x faster, # because each document is now a "4-word summary" of the original review.
# NOTE(review): excerpt from inside a tweet loop whose header is not visible here. It labels a tweet "WIN" when its lowercased text contains "#win" (else "FAIL"), appends its adjectives as a Document to corpus, then trains a KNN classifier on corpus and classifies two sample words.
s = tweet.text.lower() # tweet in lowercase p = "#win" in s and "WIN" or "FAIL" # document labels s = Sentence(parse(s)) # parse tree with part-of-speech tags s = search("JJ", s) # adjectives in the tweet s = [match[0].string for match in s] # adjectives as a list of strings s = " ".join(s) # adjectives as string if len(s) > 0: corpus.append(Document(s, type=p, stemmer=None)) # Train k-nearest neighbor on the corpus. # Note that this is only a simple example: to build a robust classifier # you would need a lot more training data (e.g., tens of thousands of tweets). # The more training data, the more statistically reliable the classifier becomes. # The only way to really know if your classifier is working correctly # is to test it with testing data, see the documentation for Classifier.test(). classifier = KNN() for document in corpus: classifier.train(document) # These are the adjectives the classifier has learned: print sorted(classifier.terms) print # We can ask it to classify texts containing those words. # Note that you may get different results than the ones indicated below, # since you will be mining other (more recent) tweets. # Again, a robust classifier needs lots and lots of training data. print classifier.classify("sweet") # yields 'WIN' print classifier.classify("stupid") # yields 'FAIL' # "What can I do with it?"
# NOTE(review): excerpt from inside a tweet loop; the loop header and the assignment of `s` are not visible here. It labels a tweet 'WIN' when s contains '#win' (else 'FAIL'), appends its adjectives as a Document to m, then trains a KNN classifier on m and classifies a sample phrase.
p = '#win' in s and 'WIN' or 'FAIL' # document labels # parse tree with part-of-speech tags s = Sentence(parse(s)) s = search('JJ', s) # adjectives in the tweet s = [match[0].string for match in s] # adjectives as a list of strings s = " ".join(s) # adjectives as string if len(s) > 0: m.append(Document(s, type=p, stemmer=None)) # Train k-Nearest Neighbor on the model. # Note that this is only a simple example: to build a robust classifier # you would need a lot more training data (e.g., tens of thousands of tweets). # The more training data, the more statistically reliable the classifier becomes. # The only way to really know if your classifier is working correctly # is to test it with testing data, see the documentation for Classifier.test(). classifier = KNN(baseline=None) # By default, baseline=MAJORITY # (classify unknown documents with the most frequent type). for document in m: classifier.train(document) # These are the adjectives the classifier has learned: print(sorted(classifier.features)) print() # We can now ask it to classify documents containing these words. # Note that you may get different results than the ones below, # since you will be mining other (more recent) tweets. # Again, a robust classifier needs lots and lots of training data. # If None is returned, the word was not recognized, # and the classifier returned the default value (see above). print(classifier.classify('sweet potato burger')) # yields 'WIN'
): #searches 15*100=1500 tweets for these classes of hashtags p = '#win' in tweet.description.lower() and 'WIN' or 'FAIL' m = '#fail' in tweet.description.lower() and 'WIN' or 'FAIL' s = tweet.description.lower() s = Sentence( parse(s) ) #parse anlayzes & gives strings that are annotated with specified tags s = search('JJ', s) #searches for adjectives in tweets (JJ = adjectiive) s = [match[0].string for match in s] s = ' '.join(s) if len(s) > 0: corpus.append(Document(s, type=p)) corpus.append(Document(s, type=m)) classifier = KNN() #k-nearest neighbor classifier = K-NN objects = [] for document in corpus: #documents are an unordered bag of given sentences. classifier.train( document) #adjective vectors in corpus trains the classifier objects.append(classifier.classify('awesome')) #predicts awesome as win objects.append(classifier.classify('cool')) #predicts cool as win objects.append(classifier.classify('damn')) #predicts damn as fail objects.append(classifier.classify('sucks')) #predicts sucks as fail print objects wincounter = 0 failcounter = 0 for thing in objects:
# NOTE(review): excerpt from inside a tweet loop whose header is not visible here. It labels a tweet 'WIN' when its lowercased description contains '#win' (else 'FAIL'), appends its adjectives as a Document to corpus, then trains a KNN classifier on corpus and classifies two sample words.
p = '#win' in tweet.description.lower() and 'WIN' or 'FAIL' s = tweet.description.lower() # tweet in lowercase s = Sentence(parse(s)) # parse tree with part-of-speech tags s = search('JJ', s) # adjectives in the tweet s = [match[0].string for match in s] # adjectives as a list of strings s = " ".join(s) # adjectives as string if len(s) > 0: corpus.append(Document(s, type=p, stemmer=None)) # Train k-nearest neighbor on the corpus. # Note that this is only a simple example: to build a robust classifier # you would need a lot more training data (e.g., tens of thousands of tweets). # The more training data, the more statistically reliable the classifier becomes. # The only way to really know if your classifier is working correctly # is to test it with testing data, see the documentation for Classifier.test(). classifier = KNN() for document in corpus: classifier.train(document) # These are the adjectives the classifier has learned: print sorted(classifier.terms) print # We can ask it to classify texts containing those words. # Note that you may get different results than the ones indicated below, # since you will be mining other (more recent) tweets. # Again, a robust classifier needs lots and lots of training data. print classifier.classify('sweet') # yields 'WIN' print classifier.classify('stupid') # yields 'FAIL' # "What can I do with it?"
# We'll reduce the document vectors to 10 concepts. # Let's test how our model performs as a classifier. # A document can have a label (or type, or class). # For example, in the movie reviews corpus, # there are positive reviews (score > 0) and negative reviews (score < 0). # A classifier uses a model as "training" data # to predict the label (type/class) of unlabeled documents. # In this case, it can predict whether a new movie review is positive or negative. # The details are not that important right now, just observe the accuracy. # Naturally, we want accuracy to stay the same after LSA reduction, # and hopefully decrease the time needed to run. t = time.time() print "accuracy:", KNN.test(m, folds=10)[-1] print "time:", time.time() - t print # Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features). print "LSA reduction..." print m.reduce(10) t = time.time() print "accuracy:", KNN.test(m, folds=10)[-1] print "time:", time.time() - t print # Accuracy is about the same, but the performance is better: 2x-3x faster, # because each document is now a "10-word summary" of the original review.
corpus = Corpus(documents) print "number of documents:", len(corpus) print "number of words:", len(corpus.vector) print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus)) print # This may be too much words for some clustering algorithms (e.g., hierarchical). # We'll reduce the documents to vectors of 4 concepts. # First, let's test how the corpus would perform as a classifier. # The details of KNN are not that important right now, just observe the numbers. # Naturally, we want accuracy to stay the same after LSA reduction, # and hopefully decrease the time needed to run. t = time.time() print "accuracy:", KNN.test(corpus, folds=10)[-1] print "time:", time.time() - t print # Reduce the documents to vectors of 4 concepts (= 1/7 of 30 words). print "LSA reduction..." print corpus.reduce(4) t = time.time() print "accuracy:", KNN.test(corpus, folds=10)[-1] print "time:", time.time() - t print # Not bad, accuracy is about the same but performance is 3x faster, # because each document is now a "4-word summary" of the original review.
# The document vectors will be reduced to 10 concepts.
# First, measure how well the model works as a classifier.
# Documents may carry a label (type/class); in the movie reviews corpus a
# review is positive (score > 0) or negative (score < 0). A classifier uses
# the model as "training" data to predict the label of unlabeled documents,
# here: whether a new movie review is positive or negative.
# The details matter less than the numbers: accuracy should stay the same
# after LSA reduction while the running time should go down.
for with_lsa in (False, True):
    if with_lsa:
        # Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
        print("LSA reduction...")
        print()
        m.reduce(10)
    t = time.time()
    score = KNN.test(m, folds=10)[-1]
    print("accuracy:", score)
    print("time:", time.time() - t)
    print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
# NOTE(review): debugging excerpt; the construct that the leading `x += 1` / `print "ERROR"` belong to is not visible here. After printing corpus sizes it times KNN.test(corpus, folds=10), then times corpus.feature_selection(150, verbose=False) and replaces corpus with corpus.filter(f). `from time import time` appears twice.
x += 1 print "ERROR" print x / n print t1 print t2 #print xxx print len(corpus) print len(corpus.features) print len(corpus.documents[0].vector) from time import time t = time() print KNN.test(corpus, folds=10) print time()-t print "filter..." from time import time t = time() f = corpus.feature_selection(150, verbose=False) print f print time()-t corpus = corpus.filter(f) #corpus.reduce(300) #print len(corpus.lsa.vectors[corpus.documents[0].id]) #print corpus.lsa.vectors[corpus.documents[0].id] #print len(corpus)
# NOTE(review): excerpt from inside a tweet loop whose header is not visible here. It labels a tweet 'WIN' when its lowercased text contains '#win' (else 'FAIL'), appends its adjectives as a Document to m, then trains a KNN classifier on m and classifies two sample phrases.
s = tweet.text.lower() # tweet in lowercase p = '#win' in s and 'WIN' or 'FAIL' # document labels s = Sentence(parse(s)) # parse tree with part-of-speech tags s = search('JJ', s) # adjectives in the tweet s = [match[0].string for match in s] # adjectives as a list of strings s = " ".join(s) # adjectives as string if len(s) > 0: m.append(Document(s, type=p, stemmer=None)) # Train k-Nearest Neighbor on the model. # Note that this is only a simple example: to build a robust classifier # you would need a lot more training data (e.g., tens of thousands of tweets). # The more training data, the more statistically reliable the classifier becomes. # The only way to really know if your classifier is working correctly # is to test it with testing data, see the documentation for Classifier.test(). classifier = KNN(baseline=None) # By default, baseline=MAJORITY for document in m: # (classify unknown documents with the most frequent type). classifier.train(document) # These are the adjectives the classifier has learned: print sorted(classifier.features) print # We can now ask it to classify documents containing these words. # Note that you may get different results than the ones below, # since you will be mining other (more recent) tweets. # Again, a robust classifier needs lots and lots of training data. # If None is returned, the word was not recognized, # and the classifier returned the default value (see above). print classifier.classify('sweet potato burger') # yields 'WIN' print classifier.classify('stupid autocorrect') # yields 'FAIL'
print 'Number of Negative Tweets:',len(neg_lines) print 'Number of Positive Tweets:',len(pos_lines) documents = [] for line in neg_lines: document = Document(line,stopword=True,stemmer=PORTER,type='0') documents.append(document) for line in pos_lines: document = Document(line,stopword=True,stemmer=PORTER,type='1') documents.append(document) corpus = Corpus(documents,weight=TFIDF) print "number of documents:", len(corpus) print "number of words:", len(corpus.vector) print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus)) print #Filtering top 1000 features using Information Gain Criterion corpus=corpus.filter(features=(corpus.feature_selection(top=1000,method=IG))) # To test the accuracy of a classifier, Using 10-fold crossvalidation # This yields 4 scores: Accuracy, Precision, Recall and F-score. print 'classifying using KNN' print '-------------------------' print '(Accuracy, Precision,REcall,F-Measure)' print KNN.test(corpus,k=100,folds=10,distance=COSINE) f_neg.close() f_pos.close()