def parse(stat, path='mirror/', n_news=10000):
    """Build naive-Bayes term statistics from news files ``1.txt``..``n_news.txt``.

    Each file has three lines: title, body, category.  Terms are the
    alphabetic tokens of the body, lower-cased, with stop words and
    one-character tokens removed.  Counts are accumulated into ``stat``
    (terms, termToInt, termInDoc, termAmount, termInCat, catTermAmount,
    totalTerm), which is mutated in place.

    :param stat: statistics accumulator object (mutated in place)
    :param path: directory prefix for the numbered article files
    :param n_news: number of article files to read
    """
    stopWord = StopWord.getStopWord()
    print(str(stopWord))
    lastDoc = []  # per-term: last document number it was seen in (document frequency)
    for number in range(1, n_news + 1):
        filename = path + str(number) + '.txt'
        with open(filename, 'rb') as fin:
            title = fin.readline()  # line 1: title, echoed only as progress output
            print(number, title)
            # BUG FIX: the file is opened in binary mode, so readline() returns
            # bytes; decode before handing the text to re.split (str pattern).
            body = fin.readline().decode('utf-8', 'ignore')  # line 2: body
            termList = re.split('[^a-zA-Z]+', body)
            # BUG FIX: strip the trailing newline from the category line —
            # test() strips it before looking it up in the same stat.cats dict,
            # so an unstripped key here could never match.
            cat = fin.readline().decode('utf-8', 'ignore').strip()  # line 3: category
            if cat in stat.cats:
                noC = stat.cats[cat]
                for item in termList:
                    item = item.lower()
                    # BUG FIX: re.split yields '' at the body's edges; filter
                    # len <= 1 so empty tokens are not counted as terms.
                    if not (item in stopWord or len(item) <= 1):
                        stat.catTermAmount[noC] += 1
                        # O(1) dict lookup instead of scanning the stat.terms list;
                        # termToInt and terms are maintained in lockstep.
                        if item not in stat.termToInt:
                            stat.termToInt[item] = len(stat.terms)
                            stat.terms.append(item)
                            stat.termInDoc.append(0)
                            stat.termAmount.append(0)
                            lastDoc.append(-1)
                        # NOTE(review): totalTerm counts every kept token (not just
                        # new vocabulary) — consistent with its use as the prior
                        # denominator in test(); confirm against original intent.
                        stat.totalTerm += 1
                        no = stat.termToInt[item]
                        if lastDoc[no] != number:
                            # first occurrence in this document -> bump doc frequency
                            lastDoc[no] = number
                            stat.termInDoc[no] += 1
                        stat.termAmount[no] += 1
                        stat.termInCat[noC][no] += 1
def parse(stat, path='mirror/', n_news=10000):
    """Accumulate per-term and per-category counts from numbered news files.

    Files ``path + '<k>.txt'`` for k in 1..n_news are read; line 1 is the
    title, line 2 the body, line 3 the category.  Body tokens (alphabetic
    runs, lower-cased, stop words and trivial tokens dropped) update the
    counters on ``stat`` in place.

    :param stat: mutable statistics holder (terms, termToInt, termInDoc,
                 termAmount, termInCat, catTermAmount, totalTerm, cats)
    :param path: directory prefix of the article files
    :param n_news: how many articles to read
    """
    stopWord = StopWord.getStopWord()
    print(str(stopWord))
    lastDoc = []  # for each term index: the last doc number that counted it
    for number in range(1, n_news + 1):
        filename = path + str(number) + '.txt'
        with open(filename, 'rb') as fin:
            title = fin.readline()  # title line, printed for progress only
            print(number, title)
            # BUG FIX: binary mode gives bytes — decode before the str-pattern
            # regex split.
            body = fin.readline().decode('utf-8', 'ignore')
            termList = re.split('[^a-zA-Z]+', body)
            # BUG FIX: strip the newline so the category matches the stripped
            # keys that test() compares against in the same stat.cats dict.
            cat = fin.readline().decode('utf-8', 'ignore').strip()
            if cat in stat.cats:
                noC = stat.cats[cat]
                for item in termList:
                    item = item.lower()
                    # BUG FIX: also reject the empty tokens that re.split
                    # produces at the edges of the body (len <= 1, not == 1).
                    if not (item in stopWord or len(item) <= 1):
                        stat.catTermAmount[noC] += 1
                        # dict membership is O(1); the old `item in stat.terms`
                        # scanned the whole list on every token.
                        if item not in stat.termToInt:
                            stat.termToInt[item] = len(stat.terms)
                            stat.terms.append(item)
                            stat.termInDoc.append(0)
                            stat.termAmount.append(0)
                            lastDoc.append(-1)
                        # NOTE(review): counted once per kept token, matching
                        # its use as the prior denominator in test().
                        stat.totalTerm += 1
                        no = stat.termToInt[item]
                        if lastDoc[no] != number:
                            lastDoc[no] = number
                            stat.termInDoc[no] += 1  # doc frequency
                        stat.termAmount[no] += 1
                        stat.termInCat[noC][no] += 1
def __init__(self, datarecord, colnnames, delimtr):
    """Load a delimited data file into a DataFrame and cache stop words.

    :param datarecord: path of the delimited data file
    :param colnnames: column names for the DataFrame
    :param delimtr: field delimiter used in the file
    """
    # Logging is wired up before anything that might fail.
    self._flogger()
    # English stop-word list used by downstream text processing.
    self.stopwords = StopWord.EnglishStopWord().stopwords()
    # Keep the source path around, then read it into a DataFrame.
    self.datarecord = datarecord
    self.df = pd.read_csv(datarecord, names=colnnames, delimiter=delimtr)
def test(stat, path='', n_test=10):
    """Classify ``n_test`` held-out articles with the naive-Bayes counts in
    ``stat`` and print per-category precision, recall, and F-measure.

    Each test file has three lines: title, body, true category.  For every
    article the log-probability of each category is computed with add-one
    smoothing and the argmax is taken as the prediction.

    :param stat: statistics object produced by parse()
    :param path: directory prefix of the test files
    :param n_test: number of test articles
    """
    allCat = {'Crime and law': 0, 'Culture and entertainment': 0,
              'Disasters and accidents': 0, 'Science and technology': 0,
              'Health': 0}
    callBack = dict(allCat)  # per category: correctly predicted count
    callAll = dict(allCat)   # per category: predicted count (precision denominator)
    stopWord = StopWord.getStopWord()
    termSum = len(stat.terms)  # vocabulary size, used for add-one smoothing
    correct = 0
    wrong = 0
    for n in range(1, n_test + 1):
        filename = path + str(n) + '.txt'
        with open(filename, 'rb') as fin:
            # BUG FIX: binary mode yields bytes — decode so the regex split
            # and the later str concatenations/comparisons work.
            title = fin.readline().decode('utf-8', 'ignore').strip()
            body = fin.readline().decode('utf-8', 'ignore')
            termList = re.split('[^a-zA-Z]+', body)
            maxi = 0
            toCat = ''
            for cat in stat.cats:
                # BUG FIX: this assignment was commented out, leaving noC
                # undefined (NameError) on the first use below.
                noC = stat.cats[cat]
                p = 0.0
                for t in termList:
                    t = t.lower()
                    if not (t in stopWord or len(t) == 1):
                        # O(1) dict membership instead of scanning stat.terms.
                        if t in stat.termToInt:
                            noT = stat.termToInt[t]
                            # add-one smoothed log-likelihood of term t in cat
                            p += math.log(1.0 * (stat.termInCat[noC][noT] + 1)
                                          / (stat.catTermAmount[noC] + termSum))
                # category log-prior (0.01 avoids log(0) for empty categories)
                p += math.log(1.0 * (stat.catTermAmount[noC] + 0.01) / stat.totalTerm)
                if p > maxi or toCat == '':
                    maxi = p
                    toCat = cat
            cat = fin.readline().decode('utf-8', 'ignore').strip()
            if cat in stat.cats:
                allCat[cat] += 1
                callAll[toCat] += 1
                if toCat == cat:
                    callBack[cat] += 1
                    correct += 1
                    print(title + ' : ' + cat + ' toCat: ' + toCat + ' Yes')
                else:
                    wrong += 1
                    print(title + ' : ' + cat + ' toCat: ' + toCat + ' No')
    print('\nTotal Precision: correct / total = %d / %d' % (correct, correct + wrong))
    for cat in allCat:
        print('[' + cat + ']')
        # -1 marks an undefined metric (empty denominator)
        p = callBack[cat] * 100.0 / callAll[cat] if callAll[cat] > 0 else -1
        r = callBack[cat] * 100.0 / allCat[cat] if allCat[cat] > 0 else -1
        print('Precision : %d / %d = %.3f%%' % (callBack[cat], callAll[cat], p))
        print('Recall : %d / %d = %.3f%%' % (callBack[cat], allCat[cat], r))
        # BUG FIX: guard the harmonic mean against division by zero
        # (e.g. p == r == 0 for a category never predicted correctly).
        f_measure = (2.0 * p * r / (p + r)) if (p + r) != 0 else 0.0
        print('F = %.3f%%' % f_measure)
def __init__(self, textcolname):
    """Load the corpus, dictionary, and LDA model saved for ``textcolname``.

    All three artifacts are looked up under the directories configured in
    ``PConstant``, using ``textcolname`` as the file-name stem.

    :param textcolname: name of the text column the model was trained on
    """
    self._flogger()
    # Build the three artifact paths from the configured directories.
    corpus_path = PConstant.CORPUS_DIR_PATH.value + textcolname + '_corpus.mm'
    dictionary_path = PConstant.DICTIONARY_DIR_PATH.value + textcolname + '_dictionary.dict'
    model_path = PConstant.LDA_DIR_PATH.value + textcolname + '_lda.model'
    # Load the persisted gensim objects.
    self.corpus = MmCorpus(corpus_path)
    self.dictionary = Dictionary.load(dictionary_path)
    self.lda = models.LdaModel.load(model_path)
    # Stop-word list shared with the preprocessing steps.
    self.stopwords = StopWord.EnglishStopWord().stopwords()
def calc_freq_word():
    """Return a dict mapping each non-stop-word token to its occurrence count.

    Tokens come from ``Word_Tokenization.word_tokenization()``; any token in
    ``StopWord.stop_word()`` is excluded from the counts.

    :return: dict of ``word -> frequency``
    """
    from collections import Counter  # stdlib counter: C-speed counting loop
    # A set makes the per-token membership test O(1) instead of O(len(stopwords)).
    stopwords = set(StopWord.stop_word())
    word_tokens = Word_Tokenization.word_tokenization()
    counts = Counter(word for word in word_tokens if word not in stopwords)
    # Convert back to a plain dict so the return type matches the original.
    return dict(counts)
def seg_line(line):
    """Segment one line of text and return the kept tokens joined by spaces.

    Pipeline: strip the trailing newline, segment with the module-level
    ``segmentor``, POS-tag with ``postagger``, keep only the POS classes the
    ``PosFilter`` allows, drop stop words, drop purely-alphabetic tokens,
    and join the remainder with single spaces.
    """
    stripped = line.rstrip()                 # remove the trailing newline
    tokens = segmentor.segment(stripped)     # word segmentation
    tags = postagger.postag(tokens)          # part-of-speech tagging
    pos_filter = PosFilter(tokens, tags)     # filter by the configured POS classes
    kept = pos_filter.filter_words()
    without_stops = StopWord.filter_words(kept)  # remove stop words
    # Purely-alphabetic tokens (e.g. embedded English words) are discarded.
    result = [token for token in without_stops if not token.isalpha()]
    return ' '.join(result)
def seg_line(line, segmentor, postagger):
    """Segment and filter one line of text, returning the kept tokens.

    Pipeline: strip the trailing newline, segment with ``segmentor``,
    POS-tag with ``postagger``, keep only the POS classes allowed by
    ``PosFilter``, then drop stop words.

    :param line: one raw input line
    :param segmentor: word segmenter (e.g. pyltp Segmentor)
    :param postagger: POS tagger matching the segmenter
    :return: list of surviving tokens
    """
    line = line.rstrip()                      # remove the trailing newline
    words = segmentor.segment(line)           # word segmentation
    postags = postagger.postag(words)         # part-of-speech tagging
    pos_filter = PosFilter(words, postags)    # filter by the configured POS classes
    words_filter = pos_filter.filter_words()
    rm_stop_word = StopWord.filter_words(words_filter)  # remove stop words
    # The original copied element-by-element via a no-op comprehension (its
    # comment claimed English words were removed, but nothing was filtered);
    # a length filter `len(ele) > 3` was left commented out here.
    return list(rm_stop_word)
# if we find an and and are using boolean feature #todo modified this if statement #print("query we are looking at: " + str(query)) for compword in compwords: if compword in query and parameters.use_booleanSearch: booleanSearch.constructList(collection, query) parameters.use_blindRelevance = False ranBooleanResults = True # create accumulators and other data structures accum = {} filenames = [] tfidfterms = {} p = porter.PorterStemmer() sw = StopWord.StopWord() t = thesaurus.Thesaurus() tfidf = tf_idf.tfidf() # get N f = open(collection + "_index_N", "r") N = eval(f.read()) f.close() # get document lengths/titles titles = {} f = open(collection + "_index_len", "r") lengths = f.readlines() #an array of all the file titles and their lengths f.close() titleScore = 0
def test(stat, path='', n_test=10):
    """Evaluate the naive-Bayes classifier in ``stat`` on ``n_test`` articles.

    Reads ``path + '<k>.txt'`` (title / body / true category per line),
    predicts the category with the highest smoothed log-probability, and
    prints overall accuracy plus per-category precision, recall and F.

    :param stat: statistics object produced by parse()
    :param path: directory prefix of the test files
    :param n_test: number of test articles to score
    """
    allCat = {
        'Crime and law': 0,
        'Culture and entertainment': 0,
        'Disasters and accidents': 0,
        'Science and technology': 0,
        'Health': 0
    }
    callBack = dict(allCat)  # correct predictions per category
    callAll = dict(allCat)   # total predictions per category
    stopWord = StopWord.getStopWord()
    termSum = len(stat.terms)  # vocabulary size for add-one smoothing
    correct = 0
    wrong = 0
    for n in range(1, n_test + 1):
        filename = path + str(n) + '.txt'
        with open(filename, 'rb') as fin:
            # BUG FIX: readline() returns bytes in 'rb' mode — decode before
            # splitting with a str regex and concatenating with str below.
            title = fin.readline().decode('utf-8', 'ignore').strip()
            termList = re.split('[^a-zA-Z]+', fin.readline().decode('utf-8', 'ignore'))
            maxi = 0
            toCat = ''
            for cat in stat.cats:
                # BUG FIX: restore this assignment (it was commented out,
                # so every later use of noC raised NameError).
                noC = stat.cats[cat]
                p = 0.0
                for t in termList:
                    t = t.lower()
                    if not (t in stopWord or len(t) == 1):
                        # dict lookup is O(1); `t in stat.terms` scanned a list.
                        if t in stat.termToInt:
                            noT = stat.termToInt[t]
                            p += math.log(1.0 * (stat.termInCat[noC][noT] + 1)
                                          / (stat.catTermAmount[noC] + termSum))
                # log-prior; 0.01 keeps the argument of log() positive
                p += math.log(1.0 * (stat.catTermAmount[noC] + 0.01) / stat.totalTerm)
                if p > maxi or toCat == '':
                    maxi = p
                    toCat = cat
            cat = fin.readline().decode('utf-8', 'ignore').strip()
            if cat in stat.cats:
                allCat[cat] += 1
                callAll[toCat] += 1
                if toCat == cat:
                    callBack[cat] += 1
                    correct += 1
                    print(title + ' : ' + cat + ' toCat: ' + toCat + ' Yes')
                else:
                    wrong += 1
                    print(title + ' : ' + cat + ' toCat: ' + toCat + ' No')
    print('\nTotal Precision: correct / total = %d / %d' % (correct, correct + wrong))
    for cat in allCat:
        print('[' + cat + ']')
        # -1 flags an undefined metric (no predictions / no true members)
        p = callBack[cat] * 100.0 / callAll[cat] if callAll[cat] > 0 else -1
        r = callBack[cat] * 100.0 / allCat[cat] if allCat[cat] > 0 else -1
        print('Precision : %d / %d = %.3f%%' % (callBack[cat], callAll[cat], p))
        print('Recall : %d / %d = %.3f%%' % (callBack[cat], allCat[cat], r))
        # BUG FIX: avoid ZeroDivisionError when p + r == 0
        f_val = (2.0 * p * r / (p + r)) if (p + r) != 0 else 0.0
        print('F = %.3f%%' % f_val)
def main(filename, stopWord, vocabSize, useWord, reducedDim, opt, sampleSize, n_class, kernel, vecName, gs):
    # Pipeline driver (Python 2): each *_flag below toggles one stage of the
    # morphological-analysis -> feature-vector -> SVM pipeline.  As committed,
    # only the word2vec SVM stage (SVM_w2v_flag) runs; the others are off.
    Morphs_flag = False
    VerbExtraction_flag = False
    StopWord_flag = False
    CreateVector_flag = False
    SVM_flag = False
    SVM_w2v_flag = True
    SVM_test_flag = False
    if Morphs_flag:
        # filename.tsv -> planeText.txt
        Morphs.PlaneText(filename)
        print "PlaneText has done."
        # planeText.txt -> filename.cabocha  (dependency parse via CaboCha)
        Morphs.Dependency(filename)
        print "Dependency has done."
    if VerbExtraction_flag:
        # filename.cabocha -> trainingData.csv
        VerbExtraction.extractVerb(filename)
        print "extractVerb has done."
        # trainingData.csv -> verbDict.json  (top-100 verb dictionary)
        VerbExtraction.createVerbDict(filename, verbDictSize=100)
        print "createVerbDict has done."
    if StopWord_flag:
        # # dependency.cabocha (trainingData.csv) -> allWordDict.json
        # StopWord.extractAllWord(filename)
        # print "extractAllWord has done."
        # allWordDict.json -> stopList.txt
        # (the stopWord argument is the cut-off rank k for stop-word selection)
        StopWord.dictToStopwordList(filename, k=stopWord)
        print "dictToStopwordList has done."
        # allWordDict.json -> vocabDict.json  (vocabulary capped at vocabSize)
        StopWord.createVocabDict(filename, k=stopWord, vocabSize=vocabSize)
        print "createVocabDict has done."
        # StopWord.showDict(filename, dictName="verbDict", k=100, showAll=False)
    if CreateVector_flag:
        start = time.time()
        # trainingData.csv(+vocabDict.json +verbDict.json) -> vectorData.csv
        # opt selects the vectorization scheme; unknown values fall through
        # with only a warning and produce no vector file.
        if opt == "equal":
            CreateVector.CreateVector(filename, useWord, verbDictSize=100, removeVerb=10)
        elif opt == "hist":
            CreateVector.CreateVector_Hist(filename, useWord, verbDictSize=100, removeVerb=10)
        elif opt == "hist_all":
            CreateVector.CreateVector_Hist_fullword(filename, useWord, verbDictSize=100, removeVerb=10)
        else:
            print "what is option?"
        elapsed_time = time.time() - start
        print "CreateVector has done : %s [min]" % (elapsed_time / 60)
    if SVM_flag:
        start = time.time()
        # vectorData.csv -> classify by SVM (5-fold cross-validation)
        SVM.main(filename, n_folds=5, useWord=useWord, sampleSize=sampleSize, reducedDim=reducedDim, opt=opt, n_class=n_class, kernel=kernel, vecName=vecName, gridsearch=gs)
        elapsed_time = time.time() - start
        print "SVM has done : %s [min]" % (elapsed_time / 60)
    if SVM_w2v_flag:
        start = time.time()
        # vectorData.csv -> classify by SVM on word2vec features (5-fold CV)
        SVM.w2vSVM(filename, n_folds=5, useWord=useWord, sampleSize=sampleSize, opt=opt, n_class=n_class, kernel=kernel)
        elapsed_time = time.time() - start
        print "SVM has done : %s [min]" % (elapsed_time / 60)
    if SVM_test_flag:
        start = time.time()
        # vectorData.csv -> classify by SVM (single held-out test run)
        SVM.test(filename, useWord, sampleSize, reducedDim, opt, n_class, kernel)
        elapsed_time = time.time() - start
        print "SVM has done : %s [min]" % (elapsed_time / 60)