import pickle
import random

import numpy as np
from sklearn import metrics

# getWebpageText, getWebpageText_NoURLs, getrawDocs, getLabelsFromFile,
# NaiveBayesClassifier, and SVMClassifier are assumed to be defined
# elsewhere in this project.


def train_SaveClassifierRandom(posURLs, negURLs, classifierFileName):
    """Train a classifier on a randomly shuffled 70/30 split and pickle it."""
    posDocs = getWebpageText(posURLs)
    posDocs = [d['title'] + " " + d['text'] for d in posDocs if d]
    negDocs = getWebpageText(negURLs)
    negDocs = [d['title'] + " " + d['text'] for d in negDocs if d]

    posLen = len(posDocs)
    print(posLen)
    negLen = len(negDocs)
    print(negLen)

    posLabels = [1] * posLen
    negLabels = [0] * negLen
    dataSetDocs = posDocs + negDocs
    dataSetLabels = posLabels + negLabels

    # Shuffle documents and labels together, then split 70/30.
    dataDocLabels = list(zip(dataSetDocs, dataSetLabels))
    random.shuffle(dataDocLabels)
    sep = int(0.7 * len(dataDocLabels))
    trainingDocLabels = dataDocLabels[:sep]
    testDocLabels = dataDocLabels[sep:]
    trainingDocs = [d for d, _ in trainingDocLabels]
    trainingLabels = [l for _, l in trainingDocLabels]
    testDocs = [d for d, _ in testDocLabels]
    test_labels = [l for _, l in testDocLabels]

    classifier = NaiveBayesClassifier()
    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs, trainingLabelsArr)

    # Report performance on the training set, then on the held-out test set.
    print(classifier.score(trainingDocs, trainingLabelsArr))
    print(metrics.classification_report(trainingLabelsArr, classifier.predicted))
    test_labelsArr = np.array(test_labels)
    print(classifier.score(testDocs, test_labelsArr))
    print(metrics.classification_report(test_labelsArr, classifier.predicted))

    with open(classifierFileName, "wb") as classifierFile:
        pickle.dump(classifier, classifierFile)
    return classifier
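
# A minimal sketch of reloading a pickled classifier and evaluating it on new
# pages. It assumes the project helpers above (getWebpageText) and the wrapper's
# score()/predicted API; the function name and its arguments are hypothetical.
def loadAndEvaluateClassifier(classifierFileName, urls, labels):
    with open(classifierFileName, "rb") as classifierFile:
        classifier = pickle.load(classifierFile)
    pages = getWebpageText(urls)
    # Keep labels aligned with the pages that actually fetched.
    docLabels = [(p['title'] + " " + p['text'], l)
                 for p, l in zip(pages, labels) if p]
    docs = [d for d, _ in docLabels]
    labelsArr = np.array([l for _, l in docLabels])
    print(classifier.score(docs, labelsArr))
    print(metrics.classification_report(labelsArr, classifier.predicted))
    return classifier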

def train_SaveClassifierFolder(posURLs, negURLs, classifierFileName):
    """Train a classifier, splitting each negative source 70/30 separately.

    negURLs is a list of URL lists, one per source, so every source
    contributes documents to both the training and the test set.
    """
    posDocs = getWebpageText(posURLs)
    posDocs = [d['title'] + " " + d['text'] for d in posDocs if d]

    negDocsList = [getWebpageText(n) for n in negURLs]
    negTraining = []
    negTesting = []
    for nu in negDocsList:
        ns = int(len(nu) * 0.7)
        negTraining.extend(nu[:ns])
        negTesting.extend(nu[ns:])
    negTraining = [d['title'] + " " + d['text'] for d in negTraining if d]
    negTesting = [d['title'] + " " + d['text'] for d in negTesting if d]

    posLen = len(posDocs)
    posSep = int(0.7 * posLen)
    posTraining = posDocs[:posSep]
    posTest = posDocs[posSep:]

    trainingDocs = posTraining + negTraining
    trainingLabels = [1] * len(posTraining) + [0] * len(negTraining)
    testingDocs = posTest + negTesting
    testingLabels = [1] * len(posTest) + [0] * len(negTesting)

    classifier = NaiveBayesClassifier()
    #classifier = SVMClassifier()
    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs, trainingLabelsArr)

    print(classifier.score(trainingDocs, trainingLabelsArr))
    print(metrics.classification_report(trainingLabelsArr, classifier.predicted))
    test_labelsArr = np.array(testingLabels)
    print(classifier.score(testingDocs, test_labelsArr))
    print(metrics.classification_report(test_labelsArr, classifier.predicted))

    with open(classifierFileName, "wb") as classifierFile:
        pickle.dump(classifier, classifierFile)
    return classifier
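
# Hypothetical usage sketch: unlike train_SaveClassifierRandom, negURLs here is
# a list of URL lists grouped by source, so each source is split 70/30 and
# appears on both sides of the split. All URLs and filenames are placeholders.
#
#   posURLs = ["http://example.com/quake-story-1", "http://example.com/quake-story-2"]
#   negURLs = [["http://example.com/sports-1", "http://example.com/sports-2"],
#              ["http://example.com/finance-1", "http://example.com/finance-2"]]
#   train_SaveClassifierFolder(posURLs, negURLs, "classifier.p")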

def train_SaveClassifier(posURLs, negURLs, classifierFileName):
    """Train a classifier on sequential 70/30 splits of both classes and pickle it."""
    #posDocs = getWebpageText(posURLs)
    posDocs = getWebpageText_NoURLs(posURLs)
    posDocs = [d['text'] for d in posDocs if d]
    #negDocs = getWebpageText(negURLs)
    negDocs = getWebpageText_NoURLs(negURLs)
    negDocs = [d['text'] for d in negDocs if d]

    posLen = len(posDocs)
    posSep = int(0.7 * posLen)
    posTraining = posDocs[:posSep]
    posTest = posDocs[posSep:]
    negLen = len(negDocs)
    negSep = int(0.7 * negLen)
    negTraining = negDocs[:negSep]
    negTest = negDocs[negSep:]

    trainingDocs = posTraining + negTraining
    trainingLabels = [1] * len(posTraining) + [0] * len(negTraining)
    testingDocs = posTest + negTest
    testingLabels = [1] * len(posTest) + [0] * len(negTest)

    classifier = NaiveBayesClassifier()
    #classifier = SVMClassifier()
    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs, trainingLabelsArr)

    print(classifier.score(trainingDocs, trainingLabelsArr))
    print(metrics.classification_report(trainingLabelsArr, classifier.predicted))
    test_labelsArr = np.array(testingLabels)
    print(classifier.score(testingDocs, test_labelsArr))
    print(metrics.classification_report(test_labelsArr, classifier.predicted))
    #print(classifier.classifier.feature_log_prob_)
    #print(classifier.classifier.coef_)

    with open(classifierFileName, "wb") as classifierFile:
        pickle.dump(classifier, classifierFile)
    return classifier
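
# Hypothetical usage sketch for the flat-list variant: both URL lists are plain
# lists and are split sequentially 70/30, so the ordering of the input lists
# determines which pages land in the test set. Placeholder URLs and filename.
#
#   posURLs = ["http://example.com/on-topic-1", "http://example.com/on-topic-2"]
#   negURLs = ["http://example.com/off-topic-1", "http://example.com/off-topic-2"]
#   train_SaveClassifier(posURLs, negURLs, "nb_classifier.p")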

def main():
    # Earlier seed sets, kept for reference:
    #seedUrls = ["http://www.huffingtonpost.com/news/arab-spring/",
    #            "http://www.opendemocracy.net/david-hayes/arab-spring-protest-power-prospect",
    #            "http://www.washingtonpost.com/wp-srv/special/world/middle-east-protests/"]
    #seedUrls = ["http://www.ndtv.com/article/india/big-earthquake-in-sikkim-tremors-across-india-54-dead-over-100-injured-134537",
    #            "http://articles.timesofindia.indiatimes.com/2011-09-21/india/30184028_1_construction-site-teesta-urja-gangtok",
    #            "http://www.ndtv.com/article/india/quake-aftermath-many-villages-in-sikkim-still-cut-off-thousands-waiting-for-help-135132",
    #            "http://www.ndtv.com/article/india/12-dead-40-missing-at-sikkim-plant-hit-by-quake-135215"]
    #seedUrls = ["http://www.aljazeera.com/indepth/spotlight/anger-in-egypt/",
    #            "http://live.reuters.com/Event/Unrest_in_Egypt?Page=0",
    #            "http://www.guardian.co.uk/world/series/egypt-protests",
    #            "http://www.huffingtonpost.com/2012/06/24/egypt-uprising-election-timeline_n_1622773.html",
    #            "http://www.washingtonpost.com/wp-srv/world/special/egypt-transition-timeline/index.html",
    #            "http://botw.org/top/Regional/Africa/Egypt/Society_and_Culture/Politics/Protests_2011/"]
    # Seed URLs for the Sikkim-earthquake topic (currently unused below).
    seedUrls = [
        "http://www.ndtv.com/topic/sikkim-earthquake",
        "http://zeenews.india.com/tags/Sikkim_earthquake.html",
        "http://earthquake-report.com/2011/09/18/very-strong-earthquake-in-sikkim-india/",
        "http://articles.timesofindia.indiatimes.com/2011-09-21/india/30184028_1_construction-site-teesta-urja-gangtok"
    ]
    #topicKeywords = ['demonstrations','protest','elections','egypt','revolution','uprising','arab','spring','tunisia','libya','military']
    #topicKeywords = getTopicKeywords("manual-sikkim-earthquake-wikipedia.txt")

    # Load raw documents and their precomputed labels.
    urls_tokens = []
    title_tokens = []
    docs = getrawDocs("html_files2-balanced.txt", urls_tokens, title_tokens)
    print("raw docs extracted")
    docs_len = len(docs)
    labels = getLabelsFromFile("labels2-balanced.txt")
    print(sum(labels))

    # Sequential 90/10 train/test split.
    sep = int(docs_len * 0.9)
    trainingDocs = docs[:sep]
    trainingLabels = labels[:sep]
    testDocs = docs[sep:]
    test_labels = labels[sep:]

    classifier = NaiveBayesClassifier()
    #classifier = SVMClassifier()
    # Train only on the training split, so the test set stays unseen.
    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs, trainingLabelsArr)

    # Evaluate on the held-out 10%.
    test_labelsArr = np.array(test_labels)
    print(classifier.score(testDocs, test_labelsArr))
    print(metrics.classification_report(test_labelsArr, classifier.predicted))
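
# Standard entry-point guard so the module can be imported without running main().
if __name__ == "__main__":
    main()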