Example #1
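These snippets are fragments of a larger focused-crawler project and are shown without their module context. A hedged sketch of the imports they appear to rely on is given below; the standard-library and scikit-learn imports follow from usage, while the project-specific helpers (getWebpageText, getrawDocs, NaiveBayesClassifier, and so on) live in the project's own modules, whose names are not visible in these examples.

# Imports these snippets appear to rely on. The commented lines are
# placeholders: the project-specific helpers come from the crawler's own
# modules, whose exact names are not shown in these examples.
import random
import pickle

import numpy as np
from sklearn import metrics

# from <project scraper module> import getWebpageText, getWebpageText_NoURLs, getrawDocs
# from <project classifier module> import NaiveBayesClassifier, SVMClassifier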
def train_SaveClassifierRandom(posURLs,negURLs,classifierFileName):
        
    # Fetch page text for the positive and negative URL lists and keep
    # "title text" for each entry that came back non-empty.
    posDocs = getWebpageText(posURLs)
    posDocs = [d['title'] + " " + d['text'] for d in posDocs if d]
    
    negDocs = getWebpageText(negURLs)
    negDocs = [d['title'] + " " + d['text'] for d in negDocs if d]
    
    posLen = len(posDocs)
    print posLen
    negLen = len(negDocs)
    print negLen
    posLabels = [1]* posLen
    negLabels = [0]*negLen 
    
    
    
    # Pool the documents, shuffle them, and take a random 70/30 train/test split.
    dataSetDocs = posDocs + negDocs
    dataSetLabels = posLabels + negLabels
    
    dataDocLabels = zip(dataSetDocs, dataSetLabels)
    random.shuffle(dataDocLabels)
    
    sep = int(0.7*len(dataDocLabels))
    trainingDocLabels = dataDocLabels[:sep]
    testDocLabels = dataDocLabels[sep:]
    
    trainingLabels = [v for _,v in trainingDocLabels]
    trainingDocs = [k for k,_ in trainingDocLabels]
    
    testDocs = [d for d,_ in testDocLabels]
    test_labels=[l for _,l in testDocLabels]
    
    classifier = NaiveBayesClassifier()
    
    # Train on the 70% split, then print scores and classification reports
    # for both splits below.
    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs, trainingLabelsArr)
    
    print classifier.score(trainingDocs, trainingLabelsArr)
    print metrics.classification_report(trainingLabelsArr, classifier.predicted)
       
    test_labelsArr = np.array(test_labels)
    print classifier.score(testDocs, test_labelsArr)
    
    
    print metrics.classification_report(test_labelsArr, classifier.predicted)
    # Persist the trained classifier to disk.
    classifierFile = open(classifierFileName, "wb")
    pickle.dump(classifier, classifierFile)
    classifierFile.close()
    return classifier
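A possible way to call this function and later restore the pickled model; the URL lists and file name below are placeholders, and pickle.load simply mirrors the pickle.dump above.

# Hypothetical usage sketch for Example #1.
posURLs = ["http://example.com/relevant-article"]
negURLs = ["http://example.com/unrelated-article"]
clf = train_SaveClassifierRandom(posURLs, negURLs, "nb_classifier.pkl")

# Later, restore the trained classifier from disk.
classifierFile = open("nb_classifier.pkl", "rb")
clf = pickle.load(classifierFile)
classifierFile.close()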
Example #2
def train_SaveClassifier(posURLs,negURLs,classifierFileName):
        
    #posDocs = getWebpageText(posURLs)
    posDocs = getWebpageText_NoURLs(posURLs)
    posDocs = [d['text'] for d in posDocs if d]
    
    #negDocs = getWebpageText(negURLs)
    negDocs = getWebpageText_NoURLs(negURLs)
    negDocs = [d['text'] for d in negDocs if d]
    
    #negTraining = [d['title'] + " " + d['text'] for d in negTraining if d]
    #negTesting = [d['title'] + " " + d['text'] for d in negTesting if d]
    
    # Split each class 70/30 (in order, without shuffling) into training and test docs.
    posLen = len(posDocs)
    posSep = int(0.7*posLen)
    posTraining = posDocs[:posSep]
    posTest = posDocs[posSep:]
    
    negLen = len(negDocs)
    negSep = int(0.7*negLen)
    negTraining = negDocs[:negSep]
    negTest = negDocs[negSep:]
    
    trainingDocs = posTraining + negTraining
    trainingLabels = [1]* len(posTraining) + [0]*len(negTraining)
    
    testingDocs = posTest + negTest
    testingLabels = [1]*len(posTest) + [0]*len(negTest)
        
    classifier = NaiveBayesClassifier()
    #classifier = SVMClassifier()
    
    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs,trainingLabelsArr)
    
    print classifier.score(trainingDocs, trainingLabelsArr)
    print metrics.classification_report(trainingLabelsArr, classifier.predicted)
       
    test_labelsArr = np.array(testingLabels)
    print classifier.score(testingDocs, test_labelsArr)
    print metrics.classification_report(test_labelsArr, classifier.predicted)
    
    #print classifier.classifier.feature_log_prob_
    #print classifier.classifier.coef_
    
    classifierFile = open(classifierFileName,"wb")
    pickle.dump(classifier,classifierFile)
    classifierFile.close()
    return classifier
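The per-class 70/30 split above keeps documents in their original order. With a recent scikit-learn, a shuffled but still class-balanced split could be written as follows (a sketch, assuming the posDocs and negDocs lists built above).

# Sketch of an equivalent (but shuffled) stratified 70/30 split.
from sklearn.model_selection import train_test_split

allDocs = posDocs + negDocs
allLabels = [1]*len(posDocs) + [0]*len(negDocs)
trainingDocs, testingDocs, trainingLabels, testingLabels = train_test_split(
    allDocs, allLabels, test_size=0.3, stratify=allLabels, random_state=0)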
Example #3
def train_SaveClassifierFolder(posURLs,negURLs,classifierFileName):
        
    posDocs = getWebpageText(posURLs)
    posDocs = [d['title'] + " " + d['text'] for d in posDocs if d]
    
    # negURLs is a list of URL groups; fetch each group and split it 70/30
    # into training and test documents.
    negDocsList = []
    for n in negURLs:
        negDocsList.append(getWebpageText(n))
    
    negTraining = []
    negTesting = []
    for nu in negDocsList:
        ns = int(len(nu)*0.7)
        negTraining.extend(nu[:ns])
        negTesting.extend(nu[ns:])
    
    negTraining = [d['title'] + " " + d['text'] for d in negTraining if d]
    negTesting = [d['title'] + " " + d['text'] for d in negTesting if d]
    
    
    posLen = len(posDocs)
    posSep = int(0.7*posLen)
    posTraining = posDocs[:posSep]
    posTest = posDocs[posSep:]
    
    trainingDocs = posTraining + negTraining
    trainingLabels = [1]* len(posTraining) + [0]*len(negTraining)
    
    testingDocs = posTest + negTesting
    testingLabels = [1]*len(posTest) + [0]*len(negTesting)
        
    classifier = NaiveBayesClassifier()
    #classifier = SVMClassifier()
    
    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs,trainingLabelsArr)
    
    print classifier.score(trainingDocs, trainingLabelsArr)
    print metrics.classification_report(trainingLabelsArr, classifier.predicted)
       
    test_labelsArr = np.array(testingLabels)
    print classifier.score(testingDocs, test_labelsArr)
    
    
    print metrics.classification_report(test_labelsArr, classifier.predicted)
    classifierFile = open(classifierFileName,"wb")
    pickle.dump(classifier,classifierFile)
    classifierFile.close()
    return classifier
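Unlike the previous two examples, negURLs here is expected to be a list of URL groups, each of which is fetched and split 70/30 separately. A hypothetical call (all URLs are placeholders):

# Hypothetical usage sketch for Example #3: one inner list per negative "folder".
posURLs = ["http://example.com/quake-report-1", "http://example.com/quake-report-2"]
negURLs = [
    ["http://example.com/sports-1", "http://example.com/sports-2"],
    ["http://example.com/finance-1", "http://example.com/finance-2"],
]
clf = train_SaveClassifierFolder(posURLs, negURLs, "nb_folder_classifier.pkl")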
Example #4
def main():

    #seedUrls = ["http://www.huffingtonpost.com/news/arab-spring/","http://www.opendemocracy.net/david-hayes/arab-spring-protest-power-prospect","http://www.washingtonpost.com/wp-srv/special/world/middle-east-protests/"]
    #seedUrls = ["http://www.ndtv.com/article/india/big-earthquake-in-sikkim-tremors-across-india-54-dead-over-100-injured-134537",
    #           "http://articles.timesofindia.indiatimes.com/2011-09-21/india/30184028_1_construction-site-teesta-urja-gangtok",
    #            "http://www.ndtv.com/article/india/quake-aftermath-many-villages-in-sikkim-still-cut-off-thousands-waiting-for-help-135132",
    #            "http://www.ndtv.com/article/india/12-dead-40-missing-at-sikkim-plant-hit-by-quake-135215"
    #            ]
    seedUrls = [
        "http://www.ndtv.com/topic/sikkim-earthquake",
        "http://zeenews.india.com/tags/Sikkim_earthquake.html",
        "http://earthquake-report.com/2011/09/18/very-strong-earthquake-in-sikkim-india/",
        "http://articles.timesofindia.indiatimes.com/2011-09-21/india/30184028_1_construction-site-teesta-urja-gangtok"
    ]
    '''
    seedUrls = ["http://www.aljazeera.com/indepth/spotlight/anger-in-egypt/",
                "http://live.reuters.com/Event/Unrest_in_Egypt?Page=0",
                "http://www.guardian.co.uk/world/series/egypt-protests",
                "http://www.huffingtonpost.com/2012/06/24/egypt-uprising-election-timeline_n_1622773.html",
                "http://www.washingtonpost.com/wp-srv/world/special/egypt-transition-timeline/index.html",
                "http://botw.org/top/Regional/Africa/Egypt/Society_and_Culture/Politics/Protests_2011/"
                ]
    '''
    #topicKeywords = ['demonstrations','protest','elections','egypt','revolution','uprising','arab','spring','tunisia','libya','military']
    ##topicKeywords = getTopicKeywords("manual-sikkim-earthquake-wikipedia.txt")
    urls_tokens = []
    title_tokens = []
    docs = getrawDocs("html_files2-balanced.txt", urls_tokens, title_tokens)
    #writeToFile(docs,"rawData.txt")
    print("raw docs extracted")
    docs_len = len(docs)
    #docs_tokens = getTokenizedDocs(docs)
    #print(" docs tokens extracted")
    #labels = getLabels(docs_tokens, topicKeywords)
    #writeToFile(labels,"labels.txt")
    labels = getLabelsFromFile("labels2-balanced.txt")
    print sum(labels)

    ##print("docs labels calculated")

    sep = int(docs_len * 0.9)

    trainingDocs = docs[:sep]

    trainingLabels = labels[:sep]

    testDocs = docs[sep:]
    test_labels = labels[sep:]

    classifier = NaiveBayesClassifier()

    #classifier = SVMClassifier()

    # Fit on the full corpus first; note that the classifier is re-trained on
    # the 90% training split immediately below.
    trainingLabelsArr = np.array(labels)
    classifier.trainClassifier(docs, trainingLabelsArr)

    #print classifier.classifier.coef_
    #print classifier.ch2.get_support()

    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs, trainingLabelsArr)

    #print len(trainingDocs)
    #print len (trainingLabelsArr)
    #classifier.trainClassifier(trainingDocs,trainingLabels)

    #print("classifier trained")
    #print (classifier.classifier)
    #print sum(test_labels)

    test_labelsArr = np.array(test_labels)
    print classifier.score(testDocs, test_labelsArr)

    #print sum(classifier.predicted)
    #print classifier.score(testDocs, test_labels)

    print metrics.classification_report(test_labelsArr, classifier.predicted)
    '''
def main():
    conf = FCConfig("config.ini")

    seedUrls = linesFromFile(conf["seedFile"])
    repositoryDocNames = linesFromFile(conf["docsFile"])

    if conf["labelFile"]:
        print "Using labels"
        labels = intLinesFromFile(conf["labelFile"])
        relevantDocs = [doc for doc,lab in zip(repositoryDocNames, labels) if lab==1]
        irrelevantDocs = [doc for doc,lab in zip(repositoryDocNames, labels) if lab==0]     
    else:
        # use VSM model to label training docs
        vsmModel = None
        if conf["VSMFilterModel"].lower() == "tf-idf":
            vsmModel = TfidfScorer(getUrlTexts(seedUrls))
        elif conf["VSMFilterModel"].lower() == "lsi":
            vsmModel = LSIScorer(getUrlTexts(seedUrls))
        print "constructed vsm model"
    
        relevantDocs , irrelevantDocs = vsmModel.labelDocs(
            repositoryDocNames, conf["minRepositoryDocNum"],
            conf["filterIrrelevantThreshold"],
            conf["filterRelevantThreshold"])
        
    print len(relevantDocs), len(irrelevantDocs)
    
    
    
    # Train classifier
    classifier = None
    testSize = min(len(relevantDocs), len(irrelevantDocs))
    trainSize = conf["trainDocNum"]
    if (trainSize > testSize):
        raise Exception("Training size is larger than test size")
    trainDocs = relevantDocs[:trainSize] + irrelevantDocs[:trainSize]
    trainLabels = [1]*trainSize + [0]*trainSize
    if conf["classifier"].upper() == "NB":
        classifier = NaiveBayesClassifier()
    elif conf["classifier"].upper() == "SVM":
        classifier = SVMClassifier()
    classifier.trainClassifierFromNames(trainDocs, trainLabels)

    print "Training complete"
    
    # Test classifier
    testSize = min(len(relevantDocs), len(irrelevantDocs))
    testDocs = relevantDocs[:testSize] + irrelevantDocs[:testSize]
    testLabels = [1]*testSize + [0]*testSize
    predictedLabels = list(classifier.predictFromNames(testDocs))

    # Statistical analysis (recall and precision)
    allRelevant = testSize
    allIrrelevant = testSize
    predictedRelevant = predictedLabels.count(1)
    predictedIrrelevant = predictedLabels.count(0)
    correctlyRelevant = 0
    for i in range(0, testSize):
        if predictedLabels[i] == 1:
            correctlyRelevant += 1
    correctlyIrrelevant = 0
    for i in range(testSize, 2*testSize):
        if predictedLabels[i] == 0:
            correctlyIrrelevant += 1
    relevantRecall = float(correctlyRelevant) / allRelevant
    relevantPrecision = float(correctlyRelevant) / (predictedRelevant)
    irrelevantRecall = float(correctlyIrrelevant) / allIrrelevant
    irrelevantPrecision = float(correctlyIrrelevant) / (predictedIrrelevant)
    print relevantRecall, relevantPrecision


    # Seed the frontier: pair each seed URL with an initial priority of -1.
    t = [(-1,p) for p in seedUrls]
    priorityQueue = PriorityQueue(t)
    crawler = Crawler(priorityQueue,classifier,10)
    crawler.crawl()
    print crawler.relevantPagesCount

    print crawler.pagesCount
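The hand-rolled recall/precision bookkeeping at the end of this example could equivalently be printed with sklearn's metrics, as the earlier examples already do (a sketch, assuming the testLabels and predictedLabels lists defined above).

# Sketch: the same relevant-class precision/recall via sklearn.metrics.
from sklearn import metrics

print metrics.classification_report(testLabels, predictedLabels)
print metrics.recall_score(testLabels, predictedLabels)      # relevant (label 1) recall
print metrics.precision_score(testLabels, predictedLabels)   # relevant (label 1) precision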
Example #6
def main():
    
    #seedUrls = ["http://www.huffingtonpost.com/news/arab-spring/","http://www.opendemocracy.net/david-hayes/arab-spring-protest-power-prospect","http://www.washingtonpost.com/wp-srv/special/world/middle-east-protests/"]
    #seedUrls = ["http://www.ndtv.com/article/india/big-earthquake-in-sikkim-tremors-across-india-54-dead-over-100-injured-134537",
    #           "http://articles.timesofindia.indiatimes.com/2011-09-21/india/30184028_1_construction-site-teesta-urja-gangtok",
    #            "http://www.ndtv.com/article/india/quake-aftermath-many-villages-in-sikkim-still-cut-off-thousands-waiting-for-help-135132",
    #            "http://www.ndtv.com/article/india/12-dead-40-missing-at-sikkim-plant-hit-by-quake-135215"
    #            ]
    seedUrls = ["http://www.ndtv.com/topic/sikkim-earthquake",
                "http://zeenews.india.com/tags/Sikkim_earthquake.html",
                "http://earthquake-report.com/2011/09/18/very-strong-earthquake-in-sikkim-india/",
                "http://articles.timesofindia.indiatimes.com/2011-09-21/india/30184028_1_construction-site-teesta-urja-gangtok"
                ]
    '''
    seedUrls = ["http://www.aljazeera.com/indepth/spotlight/anger-in-egypt/",
                "http://live.reuters.com/Event/Unrest_in_Egypt?Page=0",
                "http://www.guardian.co.uk/world/series/egypt-protests",
                "http://www.huffingtonpost.com/2012/06/24/egypt-uprising-election-timeline_n_1622773.html",
                "http://www.washingtonpost.com/wp-srv/world/special/egypt-transition-timeline/index.html",
                "http://botw.org/top/Regional/Africa/Egypt/Society_and_Culture/Politics/Protests_2011/"
                ]
    '''
    #topicKeywords = ['demonstrations','protest','elections','egypt','revolution','uprising','arab','spring','tunisia','libya','military']
    ##topicKeywords = getTopicKeywords("manual-sikkim-earthquake-wikipedia.txt")
    urls_tokens = []
    title_tokens = []
    docs = getrawDocs("html_files2-balanced.txt",urls_tokens, title_tokens)
    #writeToFile(docs,"rawData.txt")
    print("raw docs extracted")
    docs_len = len(docs)
    #docs_tokens = getTokenizedDocs(docs)
    #print(" docs tokens extracted")
    #labels = getLabels(docs_tokens, topicKeywords)
    #writeToFile(labels,"labels.txt")
    labels = getLabelsFromFile("labels2-balanced.txt")
    print sum(labels)
    
    ##print("docs labels calculated")
    
    sep = int(docs_len*0.9)
    
    trainingDocs = docs[:sep]
    
    trainingLabels = labels[:sep]
    
    testDocs = docs[sep:]
    test_labels=labels[sep:]
    
    classifier = NaiveBayesClassifier()
    
    #classifier = SVMClassifier()
    
    # Fit on the full corpus first; note that the classifier is re-trained on
    # the 90% training split immediately below.
    trainingLabelsArr = np.array(labels)
    classifier.trainClassifier(docs,trainingLabelsArr)
    
    #print classifier.classifier.coef_
    #print classifier.ch2.get_support()
    
    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs,trainingLabelsArr)
    
    #print len(trainingDocs) 
    #print len (trainingLabelsArr)
    #classifier.trainClassifier(trainingDocs,trainingLabels)
    
    #print("classifier trained")
    #print (classifier.classifier)
    #print sum(test_labels)
    
    test_labelsArr = np.array(test_labels)
    print classifier.score(testDocs, test_labelsArr)
    
    #print sum(classifier.predicted)
    #print classifier.score(testDocs, test_labels)
    
    print metrics.classification_report(test_labelsArr, classifier.predicted)
    
    '''
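This main() reads its 0/1 labels from labels2-balanced.txt via getLabelsFromFile; the helper itself is not shown in these examples. A minimal stand-in, assuming one integer label per line (the real helper may differ):

# Minimal stand-in for getLabelsFromFile (assumed format: one 0/1 label per line).
def getLabelsFromFile_sketch(fileName):
    labelFile = open(fileName)
    labels = [int(line.strip()) for line in labelFile if line.strip()]
    labelFile.close()
    return labels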
Example #7
def main():
    conf = FCConfig("config.ini")

    seedUrls = linesFromFile(conf["seedFile"])
    repositoryDocNames = linesFromFile(conf["docsFile"])

    if conf["labelFile"]:
        print "Using labels"
        labels = intLinesFromFile(conf["labelFile"])
        relevantDocs = [
            doc for doc, lab in zip(repositoryDocNames, labels) if lab == 1
        ]
        irrelevantDocs = [
            doc for doc, lab in zip(repositoryDocNames, labels) if lab == 0
        ]
    else:
        # use VSM model to label training docs
        vsmModel = None
        if conf["VSMFilterModel"].lower() == "tf-idf":
            vsmModel = TfidfScorer(getUrlTexts(seedUrls))
        elif conf["VSMFilterModel"].lower() == "lsi":
            vsmModel = LSIScorer(getUrlTexts(seedUrls))
        print "constructed vsm model"

        relevantDocs, irrelevantDocs = vsmModel.labelDocs(
            repositoryDocNames, conf["minRepositoryDocNum"],
            conf["filterIrrelevantThreshold"], conf["filterRelevantThreshold"])

    print len(relevantDocs), len(irrelevantDocs)

    # Train classifier
    classifier = None
    testSize = min(len(relevantDocs), len(irrelevantDocs))
    trainSize = conf["trainDocNum"]
    if (trainSize > testSize):
        raise Exception("Training size is larger than test size")
    trainDocs = relevantDocs[:trainSize] + irrelevantDocs[:trainSize]
    trainLabels = [1] * trainSize + [0] * trainSize
    if conf["classifier"].upper() == "NB":
        classifier = NaiveBayesClassifier()
    elif conf["classifier"].upper() == "SVM":
        classifier = SVMClassifier()
    classifier.trainClassifierFromNames(trainDocs, trainLabels)

    print "Training complete"

    # Test classifier
    testSize = min(len(relevantDocs), len(irrelevantDocs))
    testDocs = relevantDocs[:testSize] + irrelevantDocs[:testSize]
    testLabels = [1] * testSize + [0] * testSize
    predictedLabels = list(classifier.predictFromNames(testDocs))

    # Statistical analysis (recall and precision)
    allRelevant = testSize
    allIrrelevant = testSize
    predictedRelevant = predictedLabels.count(1)
    predictedIrrelevant = predictedLabels.count(0)
    correctlyRelevant = 0
    for i in range(0, testSize):
        if predictedLabels[i] == 1:
            correctlyRelevant += 1
    correctlyIrrelevant = 0
    for i in range(testSize, 2 * testSize):
        if predictedLabels[i] == 0:
            correctlyIrrelevant += 1
    relevantRecall = float(correctlyRelevant) / allRelevant
    relevantPrecision = float(correctlyRelevant) / (predictedRelevant)
    irrelevantRecall = float(correctlyIrrelevant) / allIrrelevant
    irrelevantPrecision = float(correctlyIrrelevant) / (predictedIrrelevant)
    print relevantRecall, relevantPrecision

    # Seed the frontier: pair each seed URL with an initial priority of -1.
    t = [(-1, p) for p in seedUrls]
    priorityQueue = PriorityQueue(t)
    crawler = Crawler(priorityQueue, classifier, 10)
    crawler.crawl()
    print crawler.relevantPagesCount

    print crawler.pagesCount
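For reference, these are the config.ini keys this main() reads through FCConfig. The dict below is only a hypothetical stand-in with placeholder values; the real file format and values are not shown in these examples.

# Hypothetical stand-in for FCConfig("config.ini"), listing the keys used above.
conf = {
    "seedFile": "seeds.txt",                 # one seed URL per line
    "docsFile": "repository_docs.txt",       # one repository document name per line
    "labelFile": "",                         # empty -> fall back to the VSM labelling branch
    "VSMFilterModel": "tf-idf",              # or "lsi"
    "minRepositoryDocNum": 100,
    "filterIrrelevantThreshold": 0.2,
    "filterRelevantThreshold": 0.8,
    "trainDocNum": 50,
    "classifier": "NB",                      # or "SVM"
}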