Ejemplo n.º 1
0
def parse(stat, path='mirror/',  n_news=10000):
    """Accumulate term statistics from numbered news files into ``stat``.

    Reads ``path + '1.txt'`` through ``path + '<n_news>.txt'``.  Three lines
    are read from each file: title, body and category.  A document whose
    category is not a key of ``stat.cats`` is skipped.  Otherwise every body
    token (a maximal run of ASCII letters, lower-cased) that has at least two
    characters and is not a stop word updates these attributes of ``stat``:

    * ``terms`` / ``termToInt`` -- vocabulary list and its term -> index map
    * ``totalTerm``             -- number of counted tokens
    * ``termAmount[i]``         -- occurrences of term ``i``
    * ``termInDoc[i]``          -- number of documents containing term ``i``
    * ``catTermAmount[c]``      -- counted tokens in category ``c``
    * ``termInCat[c][i]``       -- occurrences of term ``i`` in category ``c``

    ``stat`` is modified in place and nothing is returned.  The stop-word
    collection and each document number/title are printed as progress
    output.  A missing file makes ``open`` raise.

    NOTE(review): ``termInCat[c]`` is indexed with newly assigned term
    numbers but is never grown here -- presumably it is pre-sized or a
    mapping; confirm against the ``stat`` class.
    """
    stopWord = StopWord.getStopWord()
    print(str(stopWord))
    # lastDoc[i] holds the number of the last document in which term i was
    # seen (-1 = not seen in this run).  It is pre-sized for terms that are
    # already in ``stat`` so that parsing into a populated ``stat`` does not
    # raise IndexError.
    lastDoc = [-1] * len(stat.terms)
    for number in range(1, n_news + 1):
        filename = path + str(number) + '.txt'
        with open(filename, 'rb') as fin:
            # The file yields bytes; decode explicitly because a str regex
            # cannot be applied to bytes on Python 3.
            title = fin.readline().decode('utf-8', 'replace')
            print(number, title)
            body = fin.readline().decode('utf-8', 'replace')
            # Strip the line terminator so the category can match a key of
            # stat.cats even when the file ends with a newline.
            category = fin.readline().decode('utf-8', 'replace').strip()
        if category not in stat.cats:
            continue
        catNo = stat.cats[category]
        for item in re.split('[^a-zA-Z]+', body):
            item = item.lower()
            # len < 2 also drops the empty strings re.split yields when the
            # body starts or ends with a non-letter.
            if len(item) < 2 or item in stopWord:
                continue
            stat.catTermAmount[catNo] += 1
            # termToInt mirrors terms (both are extended together below), so
            # the dict gives the same answer as scanning the list, in O(1).
            if item not in stat.termToInt:
                stat.termToInt[item] = len(stat.terms)
                stat.terms.append(item)
                stat.termInDoc.append(0)
                stat.termAmount.append(0)
                lastDoc.append(-1)
            stat.totalTerm += 1
            no = stat.termToInt[item]
            # Count each document at most once per term.
            if lastDoc[no] != number:
                lastDoc[no] = number
                stat.termInDoc[no] += 1
            stat.termAmount[no] += 1
            stat.termInCat[catNo][no] += 1
Ejemplo n.º 2
0
def parse(stat, path='mirror/', n_news=10000):
    """Accumulate term statistics from numbered news files into ``stat``.

    Reads ``path + '1.txt'`` through ``path + '<n_news>.txt'``.  Three lines
    are read from each file: title, body and category.  A document whose
    category is not a key of ``stat.cats`` is skipped.  Otherwise every body
    token (a maximal run of ASCII letters, lower-cased) that has at least two
    characters and is not a stop word updates these attributes of ``stat``:

    * ``terms`` / ``termToInt`` -- vocabulary list and its term -> index map
    * ``totalTerm``             -- number of counted tokens
    * ``termAmount[i]``         -- occurrences of term ``i``
    * ``termInDoc[i]``          -- number of documents containing term ``i``
    * ``catTermAmount[c]``      -- counted tokens in category ``c``
    * ``termInCat[c][i]``       -- occurrences of term ``i`` in category ``c``

    ``stat`` is modified in place and nothing is returned.  The stop-word
    collection and each document number/title are printed as progress
    output.  A missing file makes ``open`` raise.

    NOTE(review): ``termInCat[c]`` is indexed with newly assigned term
    numbers but is never grown here -- presumably it is pre-sized or a
    mapping; confirm against the ``stat`` class.
    """
    stopWord = StopWord.getStopWord()
    print(str(stopWord))
    # lastDoc[i] holds the number of the last document in which term i was
    # seen (-1 = not seen in this run).  It is pre-sized for terms that are
    # already in ``stat`` so that parsing into a populated ``stat`` does not
    # raise IndexError.
    lastDoc = [-1] * len(stat.terms)
    for number in range(1, n_news + 1):
        filename = path + str(number) + '.txt'
        with open(filename, 'rb') as fin:
            # The file yields bytes; decode explicitly because a str regex
            # cannot be applied to bytes on Python 3.
            title = fin.readline().decode('utf-8', 'replace')
            print(number, title)
            body = fin.readline().decode('utf-8', 'replace')
            # Strip the line terminator so the category can match a key of
            # stat.cats even when the file ends with a newline.
            category = fin.readline().decode('utf-8', 'replace').strip()
        if category not in stat.cats:
            continue
        catNo = stat.cats[category]
        for item in re.split('[^a-zA-Z]+', body):
            item = item.lower()
            # len < 2 also drops the empty strings re.split yields when the
            # body starts or ends with a non-letter.
            if len(item) < 2 or item in stopWord:
                continue
            stat.catTermAmount[catNo] += 1
            # termToInt mirrors terms (both are extended together below), so
            # the dict gives the same answer as scanning the list, in O(1).
            if item not in stat.termToInt:
                stat.termToInt[item] = len(stat.terms)
                stat.terms.append(item)
                stat.termInDoc.append(0)
                stat.termAmount.append(0)
                lastDoc.append(-1)
            stat.totalTerm += 1
            no = stat.termToInt[item]
            # Count each document at most once per term.
            if lastDoc[no] != number:
                lastDoc[no] = number
                stat.termInDoc[no] += 1
            stat.termAmount[no] += 1
            stat.termInCat[catNo][no] += 1
Ejemplo n.º 3
0
def test(stat, path='', n_test=10):
    """Classify numbered test files using ``stat`` and print the results.

    Reads ``path + '1.txt'`` through ``path + '<n_test>.txt'``; three lines
    are read from each file: title, body and true category.  For every
    category in ``stat.cats`` the score is::

        sum over known body terms of
            log((termInCat[c][t] + 1) / (catTermAmount[c] + len(terms)))
        + log((catTermAmount[c] + 0.01) / totalTerm)

    and the highest-scoring category is the prediction.  Body terms are
    lower-cased runs of ASCII letters with at least two characters that are
    not stop words and are present in ``stat.termToInt``.

    Only documents whose true category is a key of ``stat.cats`` are
    tallied.  One line is printed per tallied document, then the overall
    correct/total count and per-category precision, recall and F-measure
    (percentages; -1 marks a precision or recall whose denominator is 0).
    Nothing is returned.

    NOTE(review): the tallies are keyed by five hard-coded category names;
    a key of ``stat.cats`` outside that set raises KeyError when it is
    predicted or encountered.
    """
    allCat = {'Crime and law': 0, 'Culture and entertainment': 0, 'Disasters and accidents': 0,
              'Science and technology': 0, 'Health': 0}
    callBack = dict(allCat)   # correctly predicted documents per category
    callAll = dict(allCat)    # documents predicted as each category
    stopWord = StopWord.getStopWord()
    termSum = len(stat.terms)
    correct = 0
    wrong = 0
    for n in range(1, n_test+1):
        filename = path + str(n) + '.txt'

        with open(filename, 'rb') as fin:
            # The file yields bytes; decode explicitly because bytes cannot
            # be split with a str regex or concatenated with str on Python 3.
            title = fin.readline().decode('utf-8', 'replace').strip()
            body = fin.readline().decode('utf-8', 'replace')
            cat = fin.readline().decode('utf-8', 'replace').strip()

        # Resolve the body to vocabulary indices once instead of repeating
        # the tokenising and filtering for every category.
        termNos = []
        for t in re.split('[^a-zA-Z]+', body):
            t = t.lower()
            # len < 2 also drops the empty strings produced by re.split.
            if len(t) < 2 or t in stopWord:
                continue
            # Dict lookup replaces the O(n) scan of stat.terms.
            if t in stat.termToInt:
                termNos.append(stat.termToInt[t])

        maxi = 0
        toCat = ''
        for candidate in stat.cats:
            noC = stat.cats[candidate]
            denominator = stat.catTermAmount[noC] + termSum
            p = 0.0
            for noT in termNos:
                p += math.log(1.0 * (stat.termInCat[noC][noT] + 1) / denominator)
            p += math.log(1.0 * (stat.catTermAmount[noC] + 0.01) / stat.totalTerm)
            # Scores are logs (usually negative), so the first candidate is
            # always accepted via the toCat == '' test.
            if p > maxi or toCat == '':
                maxi = p
                toCat = candidate

        if cat in stat.cats:
            allCat[cat] += 1
            callAll[toCat] += 1
            if toCat == cat:
                callBack[cat] += 1
                correct += 1
                print(title + '  :  ' + cat + '   toCat: ' + toCat + '  Yes')
            else:
                wrong += 1
                print(title + '  :  ' + cat + '   toCat: ' + toCat + '  No')

    print('\nTotal Precision:  correct / total = %d / %d' % (correct, correct + wrong))
    for cat in allCat:
        print('[' + cat + ']')
        if callAll[cat] > 0:
            p = callBack[cat] * 100.0 / callAll[cat]
        else:
            p = -1
        if allCat[cat] > 0:
            r = callBack[cat] * 100.0 / allCat[cat]
        else:
            r = -1
        # Precision and recall are both 0 when a category has documents and
        # predictions but no correct ones; report F = 0 instead of dividing
        # by zero.
        if p + r != 0:
            f = 2.0 * p * r / (p + r)
        else:
            f = 0.0
        print('Precision : %d / %d = %.3f%%' % (callBack[cat], callAll[cat], p))
        print('Recall : %d / %d = %.3f%%' % (callBack[cat], allCat[cat], r))
        print('F = %.3f%%' % f)
Ejemplo n.º 4
0
def test(stat, path='', n_test=10):
    """Classify numbered test files using ``stat`` and print the results.

    Reads ``path + '1.txt'`` through ``path + '<n_test>.txt'``; three lines
    are read from each file: title, body and true category.  For every
    category in ``stat.cats`` the score is::

        sum over known body terms of
            log((termInCat[c][t] + 1) / (catTermAmount[c] + len(terms)))
        + log((catTermAmount[c] + 0.01) / totalTerm)

    and the highest-scoring category is the prediction.  Body terms are
    lower-cased runs of ASCII letters with at least two characters that are
    not stop words and are present in ``stat.termToInt``.

    Only documents whose true category is a key of ``stat.cats`` are
    tallied.  One line is printed per tallied document, then the overall
    correct/total count and per-category precision, recall and F-measure
    (percentages; -1 marks a precision or recall whose denominator is 0).
    Nothing is returned.

    NOTE(review): the tallies are keyed by five hard-coded category names;
    a key of ``stat.cats`` outside that set raises KeyError when it is
    predicted or encountered.
    """
    allCat = {
        'Crime and law': 0,
        'Culture and entertainment': 0,
        'Disasters and accidents': 0,
        'Science and technology': 0,
        'Health': 0
    }
    callBack = dict(allCat)  # correctly predicted documents per category
    callAll = dict(allCat)  # documents predicted as each category
    stopWord = StopWord.getStopWord()
    termSum = len(stat.terms)
    correct = 0
    wrong = 0
    for n in range(1, n_test + 1):
        filename = path + str(n) + '.txt'

        with open(filename, 'rb') as fin:
            # The file yields bytes; decode explicitly because bytes cannot
            # be split with a str regex or concatenated with str on Python 3.
            title = fin.readline().decode('utf-8', 'replace').strip()
            body = fin.readline().decode('utf-8', 'replace')
            cat = fin.readline().decode('utf-8', 'replace').strip()

        # Resolve the body to vocabulary indices once instead of repeating
        # the tokenising and filtering for every category.
        termNos = []
        for t in re.split('[^a-zA-Z]+', body):
            t = t.lower()
            # len < 2 also drops the empty strings produced by re.split.
            if len(t) < 2 or t in stopWord:
                continue
            # Dict lookup replaces the O(n) scan of stat.terms.
            if t in stat.termToInt:
                termNos.append(stat.termToInt[t])

        maxi = 0
        toCat = ''
        for candidate in stat.cats:
            noC = stat.cats[candidate]
            denominator = stat.catTermAmount[noC] + termSum
            p = 0.0
            for noT in termNos:
                p += math.log(1.0 * (stat.termInCat[noC][noT] + 1) /
                              denominator)
            p += math.log(1.0 * (stat.catTermAmount[noC] + 0.01) /
                          stat.totalTerm)
            # Scores are logs (usually negative), so the first candidate is
            # always accepted via the toCat == '' test.
            if p > maxi or toCat == '':
                maxi = p
                toCat = candidate

        if cat in stat.cats:
            allCat[cat] += 1
            callAll[toCat] += 1
            if toCat == cat:
                callBack[cat] += 1
                correct += 1
                print(title + '  :  ' + cat + '   toCat: ' + toCat +
                      '  Yes')
            else:
                wrong += 1
                print(title + '  :  ' + cat + '   toCat: ' + toCat +
                      '  No')

    print('\nTotal Precision:  correct / total = %d / %d' %
          (correct, correct + wrong))
    for cat in allCat:
        print('[' + cat + ']')
        if callAll[cat] > 0:
            p = callBack[cat] * 100.0 / callAll[cat]
        else:
            p = -1
        if allCat[cat] > 0:
            r = callBack[cat] * 100.0 / allCat[cat]
        else:
            r = -1
        # Precision and recall are both 0 when a category has documents and
        # predictions but no correct ones; report F = 0 instead of dividing
        # by zero.
        if p + r != 0:
            f = 2.0 * p * r / (p + r)
        else:
            f = 0.0
        print('Precision : %d / %d = %.3f%%' %
              (callBack[cat], callAll[cat], p))
        print('Recall : %d / %d = %.3f%%' % (callBack[cat], allCat[cat], r))
        print('F = %.3f%%' % f)