Example #1
import re  # needed for re.split below; StopWord is a project-local module

def parse(stat, path='mirror/', n_news=10000):
    stopWord = StopWord.getStopWord()
    print(stopWord)
    lastDoc = []  # last document number in which each term was seen
    for number in range(1, n_news + 1):
        filename = path + str(number) + '.txt'
        with open(filename, 'r') as fin:
            s = fin.readline()            # title
            print(number, s)
            s = fin.readline()            # body
            termList = re.split('[^a-zA-Z]+', s)
            s = fin.readline().strip()    # category (strip the newline so the dict lookup matches)
            if s in stat.cats:
                for item in termList:
                    item = item.lower()
                    if not (item in stopWord or len(item) == 1):
                        stat.catTermAmount[stat.cats[s]] += 1
                        if item not in stat.termToInt:
                            # first time this term is seen anywhere: register it
                            stat.termToInt[item] = len(stat.terms)
                            stat.terms.append(item)
                            stat.termInDoc.append(0)
                            stat.termAmount.append(0)
                            lastDoc.append(-1)
                        stat.totalTerm += 1
                        no = stat.termToInt[item]
                        if lastDoc[no] != number:
                            # first occurrence in this document: bump document frequency
                            lastDoc[no] = number
                            stat.termInDoc[no] += 1
                        stat.termAmount[no] += 1                # collection frequency
                        stat.termInCat[stat.cats[s]][no] += 1   # per-category count
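The `stat` argument is not defined in the snippet. Below is a minimal sketch of a container with the fields `parse` reads and writes; the class name and the `defaultdict` choice are assumptions inferred from the code above, not taken from the original project:

from collections import defaultdict

class Stat:
    def __init__(self, categories):
        self.cats = {c: i for i, c in enumerate(categories)}  # category name -> index
        self.terms = []       # term index -> term string
        self.termToInt = {}   # term string -> term index
        self.termInDoc = []   # term index -> document frequency
        self.termAmount = []  # term index -> collection frequency
        self.totalTerm = 0    # total token count across all categories
        self.catTermAmount = [0] * len(categories)  # token count per category
        # per-category term counts, indexed as termInCat[cat_index][term_index]
        self.termInCat = [defaultdict(int) for _ in categories]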
Example #3
    def __init__(self, datarecord, colnames, delimiter):
        self._flogger()  # set up the instance logger
        self.datarecord = datarecord
        # read the record file into a DataFrame with the given column names
        self.df = pd.read_csv(self.datarecord,
                              names=colnames,
                              delimiter=delimiter)
        self.stopwords = StopWord.EnglishStopWord().stopwords()
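A usage sketch; the class name `DataReader`, the file name, column names, and delimiter are all made up for illustration, since only the `__init__` body above comes from the source:

reader = DataReader('reviews.csv',            # hypothetical file
                    colnames=['id', 'text'],  # hypothetical columns
                    delimiter=',')
print(reader.df.head())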
Example #4
import math
import re

def test(stat, path='', n_test=10):
    allCat = {'Crime and law': 0, 'Culture and entertainment': 0, 'Disasters and accidents': 0,
              'Science and technology': 0, 'Health': 0}
    callBack = dict(allCat)    # correctly predicted documents per category
    callAll = dict(allCat)     # predictions made per category
    stopWord = StopWord.getStopWord()
    termSum = len(stat.terms)  # vocabulary size |V| for Laplace smoothing
    correct = 0
    wrong = 0
    for n in range(1, n_test + 1):
        filename = path + str(n) + '.txt'

        with open(filename, 'r') as fin:
            title = fin.readline().strip()
            termList = re.split('[^a-zA-Z]+', fin.readline())
            maxi = float('-inf')
            toCat = ''

            for cat in stat.cats:
                noC = stat.cats[cat]
                p = 0.0
                for t in termList:
                    t = t.lower()
                    if not (t in stopWord or len(t) == 1):
                        if t in stat.termToInt:
                            noT = stat.termToInt[t]
                            # Laplace-smoothed log P(term | category)
                            p += math.log(1.0 * (stat.termInCat[noC][noT] + 1) / (stat.catTermAmount[noC] + termSum))
                # smoothed log prior P(category); the 0.01 avoids log(0)
                p += math.log(1.0 * (stat.catTermAmount[noC] + 0.01) / stat.totalTerm)
                if p > maxi:
                    maxi = p
                    toCat = cat

            cat = fin.readline().strip()
            if cat in stat.cats:
                allCat[cat] += 1     # actual documents per category
                callAll[toCat] += 1
                if toCat == cat:
                    callBack[cat] += 1
                    correct += 1
                    print(title + '  :  ' + cat + '   toCat: ' + toCat + '  Yes')
                else:
                    wrong += 1
                    print(title + '  :  ' + cat + '   toCat: ' + toCat + '  No')

    print('\nTotal accuracy:  correct / total = %d / %d' % (correct, correct + wrong))
    for cat in allCat:
        print('[' + cat + ']')
        if callAll[cat] > 0:
            p = callBack[cat] * 100.0 / callAll[cat]
        else:
            p = -1
        if allCat[cat] > 0:
            r = callBack[cat] * 100.0 / allCat[cat]
        else:
            r = -1
        print('Precision : %d / %d = %.3f%%' % (callBack[cat], callAll[cat], p))
        print('Recall : %d / %d = %.3f%%' % (callBack[cat], allCat[cat], r))
        if p > 0 and r > 0:
            print('F = %.3f%%' % (2.0 * p * r / (p + r)))
        else:
            print('F undefined')
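The inner score is the add-one (Laplace) smoothed log-likelihood of a multinomial naive Bayes classifier: for term t and category c it adds log((count(t, c) + 1) / (tokens(c) + |V|)), with |V| = termSum the vocabulary size, so terms unseen in a category contribute a finite penalty instead of log(0). Factored out on its own (the names below are illustrative):

def smoothed_log_prob(term_count_in_cat, tokens_in_cat, vocab_size):
    # add-one smoothing keeps unseen terms from zeroing out the whole product
    return math.log((term_count_in_cat + 1.0) / (tokens_in_cat + vocab_size))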
Example #5
    def __init__(self, textcolname):

        self._flogger()  # set up the instance logger
        # load the serialized gensim artifacts built for this text column
        self.corpus = MmCorpus(PConstant.CORPUS_DIR_PATH.value + textcolname +
                               '_corpus.mm')
        self.dictionary = Dictionary.load(PConstant.DICTIONARY_DIR_PATH.value +
                                          textcolname + '_dictionary.dict')
        self.lda = models.LdaModel.load(PConstant.LDA_DIR_PATH.value +
                                        textcolname + '_lda.model')
        self.stopwords = StopWord.EnglishStopWord().stopwords()
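Once loaded, the gensim objects can be queried directly. A minimal sketch, assuming the enclosing class is called `TopicModel` and a text column named 'review_text' exists (both names are made up):

tm = TopicModel('review_text')       # hypothetical class and column name
bow = tm.corpus[0]                   # first document as a bag-of-words vector
for topic_id, prob in tm.lda[bow]:   # gensim yields (topic id, probability) pairs
    print(topic_id, prob)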
Example #6
def calc_freq_word():
    # count how often each non-stop-word token occurs
    stopwords = StopWord.stop_word()
    word_tokens = Word_Tokenization.word_tokenization()
    word_frequencies = {}
    for word in word_tokens:
        if word not in stopwords:
            if word not in word_frequencies:
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    return word_frequencies
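The counting loop is the hand-rolled equivalent of `collections.Counter`; the same result in three lines:

from collections import Counter

def calc_freq_word_counter():
    stopwords = StopWord.stop_word()
    word_tokens = Word_Tokenization.word_tokenization()
    return Counter(w for w in word_tokens if w not in stopwords)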
Example #7
def seg_line(line):
    '''
    Tokenize the text of a single line.
    '''
    line = line.rstrip()  # strip the trailing newline
    words = segmentor.segment(line)  # word segmentation
    postags = postagger.postag(words)  # part-of-speech tagging
    pos_filter = PosFilter(words, postags)  # filter out words with the specified POS tags
    words_filter = pos_filter.filter_words()
    rm_stop_word = StopWord.filter_words(words_filter)  # remove stop words
    join_word = [ele for ele in rm_stop_word if not ele.isascii()]  # drop English (ASCII) tokens
    join_word = ' '.join(join_word)  # join the tokens with spaces
    return join_word
Example #8
def seg_line(line, segmentor, postagger):
    '''
    Tokenize the text of a single sentence.
    '''
    line = line.rstrip()  # strip the trailing newline
    words = segmentor.segment(line)  # word segmentation
    postags = postagger.postag(words)  # part-of-speech tagging
    pos_filter = PosFilter(words, postags)  # filter out words with the specified POS tags
    words_filter = pos_filter.filter_words()
    rm_stop_word = StopWord.filter_words(words_filter)  # remove stop words
    join_word = list(rm_stop_word)
    #join_word = [ele for ele in join_word if len(ele) > 3]  # optionally drop short tokens
    return join_word
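The `segment`/`postag` calls match the pyltp (LTP) toolkit's API; a driver sketch under that assumption, with placeholder model and input paths:

from pyltp import Segmentor, Postagger

segmentor = Segmentor()
segmentor.load('ltp_data/cws.model')  # placeholder segmenter model path
postagger = Postagger()
postagger.load('ltp_data/pos.model')  # placeholder POS model path

with open('corpus.txt', encoding='utf-8') as f:  # hypothetical input file
    tokenized = [seg_line(line, segmentor, postagger) for line in f]

segmentor.release()  # free the underlying models
postagger.release()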
Example #9
# If the query contains a boolean operator and boolean search is enabled,
# run the boolean search instead of blind relevance feedback.
# TODO: revise this if statement
#print("query we are looking at: " + str(query))
for compword in compwords:
    if compword in query and parameters.use_booleanSearch:
        booleanSearch.constructList(collection, query)
        parameters.use_blindRelevance = False
        ranBooleanResults = True
        break

# create accumulators and other data structures
accum = {}        # docID -> accumulated score
filenames = []
tfidfterms = {}
p = porter.PorterStemmer()
sw = StopWord.StopWord()
t = thesaurus.Thesaurus()
tfidf = tf_idf.tfidf()

# get N, the number of documents in the collection
with open(collection + "_index_N", "r") as f:
    N = int(f.read())

# get document lengths/titles
titles = {}
with open(collection + "_index_len", "r") as f:
    lengths = f.readlines()  # one line per file: title and length

titleScore = 0
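The ranking loop that fills `accum` is not part of the snippet; a typical tf-idf accumulator update, assuming a hypothetical `postings` dict mapping each term to {docID: term frequency}:

import math

for term in query.lower().split():
    if term in postings:                              # postings is hypothetical
        idf = math.log(float(N) / len(postings[term]))  # inverse document frequency
        for doc, tf in postings[term].items():
            accum[doc] = accum.get(doc, 0.0) + tf * idf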
Example #11
def main(filename, stopWord, vocabSize, useWord, reducedDim, opt, sampleSize,
         n_class, kernel, vecName, gs):

    Morphs_flag = False
    VerbExtraction_flag = False
    StopWord_flag = False
    CreateVector_flag = False
    SVM_flag = False
    SVM_w2v_flag = True
    SVM_test_flag = False

    if Morphs_flag:
        # filename.tsv -> planeText.txt
        Morphs.PlaneText(filename)
        print "PlaneText has done."

        # planeText.txt -> filename.cabocha
        Morphs.Dependency(filename)
        print "Dependency has done."

    if VerbExtraction_flag:
        # filename.cabocha -> trainingData.csv
        VerbExtraction.extractVerb(filename)
        print "extractVerb has done."

        # trainingData.csv -> verbDict.json
        VerbExtraction.createVerbDict(filename, verbDictSize=100)
        print "createVerbDict has done."

    if StopWord_flag:
        # # dependency.cabocha (trainingData.csv) -> allWordDict.json
        # StopWord.extractAllWord(filename)
        # print "extractAllWord has done."

        # allWordDict.json -> stopList.txt
        StopWord.dictToStopwordList(filename, k=stopWord)
        print "dictToStopwordList has done."

        # allWordDict.json -> vocabDict.json
        StopWord.createVocabDict(filename, k=stopWord, vocabSize=vocabSize)
        print "createVocabDict has done."

        # StopWord.showDict(filename, dictName="verbDict", k=100, showAll=False)

    if CreateVector_flag:
        start = time.time()
        # trainingData.csv(+vocabDict.json +verbDict.json) -> vectorData.csv
        if opt == "equal":
            CreateVector.CreateVector(filename,
                                      useWord,
                                      verbDictSize=100,
                                      removeVerb=10)
        elif opt == "hist":
            CreateVector.CreateVector_Hist(filename,
                                           useWord,
                                           verbDictSize=100,
                                           removeVerb=10)
        elif opt == "hist_all":
            CreateVector.CreateVector_Hist_fullword(filename,
                                                    useWord,
                                                    verbDictSize=100,
                                                    removeVerb=10)
        else:
            print "what is option?"
        elapsed_time = time.time() - start
        print "CreateVector has done : %s [min]" % (elapsed_time / 60)

    if SVM_flag:
        start = time.time()
        # vectorData.csv -> classify by SVM
        SVM.main(filename,
                 n_folds=5,
                 useWord=useWord,
                 sampleSize=sampleSize,
                 reducedDim=reducedDim,
                 opt=opt,
                 n_class=n_class,
                 kernel=kernel,
                 vecName=vecName,
                 gridsearch=gs)
        elapsed_time = time.time() - start
        print "SVM has done : %s [min]" % (elapsed_time / 60)

    if SVM_w2v_flag:
        start = time.time()
        # vectorData.csv -> classify by SVM
        SVM.w2vSVM(filename,
                   n_folds=5,
                   useWord=useWord,
                   sampleSize=sampleSize,
                   opt=opt,
                   n_class=n_class,
                   kernel=kernel)
        elapsed_time = time.time() - start
        print "SVM has done : %s [min]" % (elapsed_time / 60)

    if SVM_test_flag:
        start = time.time()
        # vectorData.csv -> classify by SVM
        SVM.test(filename, useWord, sampleSize, reducedDim, opt, n_class,
                 kernel)
        elapsed_time = time.time() - start
        print "SVM has done : %s [min]" % (elapsed_time / 60)