def createVocabList():
    """Build the spam and ordinary vocabularies from the labelled mails.

    Reads the module globals ``mailLists`` (mail texts) and ``mailIndex``
    (labels: ``1`` = ordinary, ``0`` = spam; any other label is ignored).
    Every noun and adjective MeCab extracts from a mail is passed to
    ``addWordToDict`` together with the dictionary of the mail's class
    (presumably this increments the word's count -- confirm against
    ``addWordToDict``).

    Side effects: resets and then sets the globals ``spamNum`` / ``ordNum``
    (number of words seen per class) and ``spamPro`` / ``ordPro`` (share of
    all counted words that belongs to each class).

    Returns:
        tuple: ``(spamDict, ordDict)``.
    """
    global mailLists
    global mailIndex
    global spamPro
    global ordPro
    global spamNum
    global ordNum
    spamPro = 0
    ordPro = 0
    spamDict = {}
    ordDict = {}
    spamNum = 0
    ordNum = 0
    for n, mail in enumerate(mailLists):
        wordInMail = mecab.MeCabClass(mail)
        NounWords = wordInMail.GetNoun2()
        AdjWords = wordInMail.GetAdj()
        if mailIndex[n] == 1:
            for word in NounWords:
                ordNum = ordNum + 1
                addWordToDict(word, ordDict)
            for word in AdjWords:
                ordNum = ordNum + 1
                addWordToDict(word, ordDict)
        if mailIndex[n] == 0:
            for word in NounWords:
                spamNum = spamNum + 1
                addWordToDict(word, spamDict)
            for word in AdjWords:
                spamNum = spamNum + 1
                addWordToDict(word, spamDict)
    total = spamNum + ordNum
    if total:
        # Convert before dividing: int / int floors under Python 2, which
        # would turn every prior into 0.0 or 1.0.
        spamPro = float(spamNum) / total
        ordPro = float(ordNum) / total
    # With no counted words the priors stay at 0 instead of raising
    # ZeroDivisionError.
    return spamDict, ordDict
def speechStats1(): global TwoCooccurFile global ThreeCooccurFile # TwoWordList = Tools.content_wordList(TwoCooccurFile,':') TwoWordList = Tools.content_wordList(ThreeCooccurFile, ':') allSpeech = set() allSpeechTuple = {} for word in TwoWordList: thisWordMec = mecab.MeCabClass(word) thisSpeechs = thisWordMec.GetAll3() try: thisSpeech = thisSpeechs[0] for n in range(len(thisSpeechs) - 1): thisSpeech += "+" + thisSpeechs[n + 1] allSpeech.add(thisSpeech) except: pass allSpeechTuple.setdefault(thisSpeech, []).append(word) SpeechCntTuple = {} for speech in allSpeechTuple.keys(): SpeechCntTuple[speech] = len(allSpeechTuple[speech]) sortedList = Tools.TupleSort(SpeechCntTuple) Speech = [] SpeechCnt = [] Cnt = 0 for ele in sortedList: Cnt += 1 print Cnt, ':', ele[0], ':\t', ele[1] Speech.append(Cnt) SpeechCnt.append(ele[1]) Tools.plot(Speech, SpeechCnt)
def DuplicateRemoval(filename):
    """Merge duplicate lines of *filename*, summing their trailing counts.

    Each non-empty line is expected to look like ``<word>:<tokens...> <n>``;
    the integer after the last space of the part following the first ``':'``
    is the count.  The whole line (not only the part before ``':'``) is used
    as the key, so only byte-identical lines are merged.

    The result is written to ``filename[:-4] + "_DuplicateRemoval.txt"`` in
    the order given by ``Tools.TupleSort``: for every key, the key, ``':'``,
    the tokens MeCab's ``GetAll()`` yields for it (space separated) and the
    summed count.

    Lines whose count cannot be parsed are reported with a traceback and
    skipped.
    """
    with open(filename, 'r') as fopen:
        content = fopen.read()
    wordTuple = {}
    for oneWord in content.split('\n'):
        if not oneWord:
            # split('\n') yields an empty tail for files ending in a newline.
            continue
        # thisWord = oneWord.split(':')[0] would key on the word only.
        thisWord = oneWord
        try:
            count = int(oneWord.split(':')[1].split(' ')[-1])
        except (IndexError, ValueError):
            # print_exc() reports the active exception; print_exception()
            # without arguments would itself raise TypeError.
            traceback.print_exc()
            continue
        wordTuple[thisWord] = wordTuple.get(thisWord, 0) + count
    newfilename = filename[:-4] + "_DuplicateRemoval.txt"
    wordList = Tools.TupleSort(wordTuple)
    with open(newfilename, 'wb') as newfile:
        for word in wordList:
            thisSpeechs = mecab.MeCabClass(word[0]).GetAll()
            newfile.write(word[0] + ':')
            for subword in thisSpeechs:
                newfile.write(subword + ' ')
            newfile.write(str(wordTuple[word[0]]) + '\n')
def wordCounter(self):
    """Return the total number of nouns found in ``self.fileLists``.

    The total is cached in ``self.counter``: a non-zero value is returned
    as is, otherwise every file is read, analysed with MeCab
    (``GetNoun2()``) and the noun counts are accumulated into
    ``self.counter``.
    """
    if self.counter != 0:
        return self.counter
    for path in self.fileLists:
        # Close each file as soon as it has been read instead of leaking
        # one handle per file.
        with open(path, 'r') as fopen:
            content = fopen.read()
        nounInCon = mecab.MeCabClass(content).GetNoun2()
        self.counter += len(nounInCon)
    return self.counter
def speech(filename, newfile, threshold):
    """Group the words of *filename* by part-of-speech pattern and dump them.

    Words (``Tools.content_wordList``) and their frequencies
    (``Tools.content_scoreList``) are read from *filename*; the two lists
    are assumed to be index-aligned.  Every word is tagged with MeCab's
    ``GetAll3()`` and the tags are joined with ``'+'`` into a pattern.

    For every pattern whose summed frequency exceeds *threshold* a record
    is written to the file named *newfile*::

        <pattern>:<summed frequency>:
        <word> <word> ...
        <blank line>

    When the summed frequency exceeds 300, 300 words drawn at random (with
    replacement) from the pattern's words are written instead of the full
    list.  Finally the number of distinct patterns is printed.

    Args:
        filename: path of the ``word:...`` input file.
        newfile: path of the output file (overwritten).
        threshold: minimum summed frequency a pattern needs to be written.
    """
    SAMPLE_LIMIT = 300
    oneWordList = Tools.content_wordList(filename, ':')
    wordListFre = Tools.content_scoreList(filename, [':', ' '])
    allSpeech = set()
    allSpeechTuple = {}
    SpeechWordsFreTup = {}
    for n, thisWord in enumerate(oneWordList):
        thisSpeechs = mecab.MeCabClass(thisWord).GetAll3()
        if not thisSpeechs:
            # No tags: skip instead of reusing the previous word's pattern.
            continue
        try:
            thisSpeech = "+".join(thisSpeechs)
        except TypeError:
            continue
        allSpeech.add(thisSpeech)
        allSpeechTuple.setdefault(thisSpeech, []).append(thisWord)
        SpeechWordsFreTup[thisSpeech] = (
            SpeechWordsFreTup.get(thisSpeech, 0) + wordListFre[n])
    with open(newfile, 'wb') as outfile:
        for pattern, words in allSpeechTuple.items():
            total = SpeechWordsFreTup[pattern]
            if total <= threshold:
                continue
            outfile.write(pattern + ':' + str(total) + ':\n')
            if total > SAMPLE_LIMIT:
                for _ in range(SAMPLE_LIMIT):
                    # Index into this pattern's own word list; using the
                    # number of patterns here produced out-of-range picks.
                    x = random.randint(0, len(words) - 1)
                    outfile.write(words[x] + ' ')
            else:
                for word in words:
                    outfile.write(word + ' ')
            outfile.write('\n\n')
    print(len(allSpeech))
def test(filename):
    """Count the ``'_'``-separated chunks of *filename* with more than 3 nouns.

    Each chunk is analysed with MeCab (``GetNoun2()``); the module global
    ``count`` is incremented once for every chunk that yields more than
    three nouns.  Nothing is returned.
    """
    global count
    # Read through a context manager so the handle is always closed.
    with open(filename) as fopen:
        content = fopen.read()
    for word in content.split('_'):
        allw = mecab.MeCabClass(word).GetNoun2()
        if len(allw) > 3:
            count = count + 1
def chooseUse(filename, recordlist, recordfile): global useCount global uselessCount item = Item() record = Record() fopen = open(filename, 'r') content = fopen.read()[:-9] paras = content.split("<b>○") fopen.close() for n in range(len(paras) - 1): para_split = paras[n + 1].split("</b>") #para_split[0]:speaker #para_split[1]:speaking item.speaker = para_split[0] item.content = re.sub('<br>', '', para_split[1]) exit = '' if len(item.content) < 300: thisMecab = mecab.MeCabClass(item.content) print item.content + " " + str(len(item.content)) print "Useful:", str(useCount), "Useless:", str( uselessCount), "all is", str(useCount + uselessCount), '\n' input = raw_input("y/n/e:") while (input != "y" or input != "n"): if input == "y": record.length = len(item.content) record.bool = 1 recordlist.append([record.length, record.bool]) recordfile.write(str(record.bool)) recordfile.write(":") recordfile.write(item.content) recordfile.write(">\n") useCount += 1 break elif input == "n": record.length = len(item.content) record.bool = 0 recordfile.write(str(record.bool)) recordfile.write(":") recordfile.write(item.content) recordfile.write(">\n") uselessCount = uselessCount + 1 break elif input == 'e': print "you really want to exit it?:(y/n)" exit = raw_input("y/n:") if exit == "y": return 1 else: print "Please input again:(y/n/e):" input = raw_input("y/n:") print "\n\n\n"
def nounExtractFrom2000(fromFile,toFile):
    """Write the noun runs of *fromFile* to *toFile*, separated by ``'_'``.

    The text is analysed with MeCab; ``GetAll2()`` is indexed as
    ``(surface, tag)`` entries.  Every entry whose tag satisfies
    ``isNoun2`` has its surface written; when the following entry is not a
    noun an underscore is appended, so consecutive nouns form one chunk.

    Args:
        fromFile: path of the text to analyse.
        toFile: path of the output file (overwritten).
    """
    with open(fromFile, 'r') as fopen:
        text = fopen.read()
    textMecab = mecab.MeCabClass(text)
    allWordLists = textMecab.GetAll2()
    lastIndex = len(allWordLists) - 1
    # Context managers close both files even if analysis or writing fails.
    with open(toFile, 'wb') as newfile:
        for n, entry in enumerate(allWordLists):
            if not textMecab.isNoun2(entry[1]):
                continue
            newfile.write(entry[0])
            # Close the chunk when the noun run ends before the last entry.
            if n != lastIndex:
                if textMecab.isNoun2(allWordLists[n + 1][1]) == 0:
                    newfile.write("_")
def speechStats2(): global TwoCooccurFile global ThreeCooccurFile # lines = Tools.content_lines(TwoCooccurFile) wordListFre = Tools.content_scoreList(TwoCooccurFile, [':', ' ']) wordList = Tools.content_wordList(TwoCooccurFile, ':') allSpeech = set() allSpeechTuple = {} allSpeechTuple2 = {} for n in range(len(wordList) - 1): # print wordList[n],wordListFre[n] thisWordMec = mecab.MeCabClass(wordList[n]) thisSpeechs = thisWordMec.GetAll3() try: thisSpeech = thisSpeechs[0] except: pass for x in range(len(thisSpeechs) - 1): thisSpeech += "+" + thisSpeechs[x + 1] try: if thisSpeech in allSpeechTuple.keys(): allSpeechTuple2[thisSpeech] += wordListFre[n] allSpeechTuple[thisSpeech] += 1 else: allSpeechTuple2[thisSpeech] = wordListFre[n] allSpeechTuple[thisSpeech] = 1 except: print n, wordList[n] sortedList = Tools.TupleSort(allSpeechTuple2) Speech = [] SpeechCnt = [] Cnt = 0 for ele in sortedList: Cnt += 1 # print Cnt,':',ele[0],' ',ele[1] # print ele[0],'\t',ele[1] Speech.append(Cnt) SpeechCnt.append(ele[1]) title = "Part_of_speech-frequence distribution" Tools.plot(Speech, SpeechCnt, title, xlabel="Part of speech", ylabel="frequence")
def getWordBox(self): runCounter = 0 for file in self.fileLists: runCounter += 1 if runCounter % 200 == 0: os.system('clear') print "This is the", runCounter, "th lines" fopen = open(file, 'r') content = fopen.read() paras = content.split('>>>') for para in paras: nounInCon = mecab.MeCabClass(para.split('::')[-1]).GetNoun2() self.counter += len(nounInCon) for noun in nounInCon: if noun not in self.wordBox: self.wordBox[noun] = 1 else: self.wordBox[noun] += 1 return self.wordBox
def nounExtractFromPretreat(filename,newfilename):
    """Extract noun runs from a pre-treated transcript, one line per input line.

    *filename* is split on ``'>>>'``; for every piece the text after the
    last ``'::'`` is split into lines.  Each line is analysed with MeCab
    (``GetAll2()`` indexed as ``(surface, tag)``): surfaces whose tag
    satisfies ``isNoun2`` are written, an underscore is appended when the
    following entry is not a noun, and a newline ends the line.

    Args:
        filename: path of the pre-treated input file.
        newfilename: path of the output file (overwritten).
    """
    with open(filename, 'r') as fopen:
        text = fopen.read()
    # Context managers close both files even if analysis or writing fails.
    with open(newfilename, 'wb') as newfile:
        for content in text.split('>>>'):
            speaking = content.split('::')[-1]
            for para in speaking.split('\n'):
                textMecab = mecab.MeCabClass(para)
                allWordLists = textMecab.GetAll2()
                lastIndex = len(allWordLists) - 1
                for n, entry in enumerate(allWordLists):
                    if not textMecab.isNoun2(entry[1]):
                        continue
                    newfile.write(entry[0])
                    # Close the chunk when the noun run ends.
                    if n != lastIndex:
                        if textMecab.isNoun2(allWordLists[n + 1][1]) == 0:
                            newfile.write("_")
                newfile.write("\n")
def isSpamMail(mail,spamDict,ordDict): global spamPro global ordPro global spamNum global ordNum wordBox = {} wordsNum = 0 wordInMail = mecab.MeCabClass(mail) NounWords = wordInMail.GetNoun2() AdjWords = wordInMail.GetAdj() for noun in NounWords: addWordToDict(noun,wordBox) wordsNum = wordsNum + 1 for adj in AdjWords: addWordToDict(adj,wordBox) wordsNum = wordsNum + 1 spam = 1 ordinary = 1 #begin to get probility of per features print len(wordBox) for key in wordBox: if key not in spamDict: spam = spam * (0.5/spamNum)**(10/len(wordBox)) else: spam = spam * (spamDict[key]/spamNum)**(10/len(wordBox)) if key not in ordDict: ordinary = ordinary * (0.5/ordNum) **(10/len(wordBox)) else: ordinary = ordinary * (ordDict[key] /ordNum) **(10/len(wordBox)) spam = spamPro*spam ordinary = ordPro*ordinary print "spam: ",spam print "ordinary: ",ordinary if spam > ordinary: return True else: return False
def main1(tofile):
    """Annotate each test line's first two tokens with their dictionary value.

    The single-word dictionary file is read line by line; for every
    non-empty line the second comma-separated field is the key and the
    third field, stripped of its first and last two characters, the value.

    Every line of the test file except the last is then tokenised with
    MeCab (``GetAll()``) and written to *tofile* as
    ``token0,token1,value0,value1`` where a missing value is left empty
    (and only a bare newline follows the tokens when both are missing).
    Lines that raise during processing are printed instead.
    """
    lookup = {}
    dict_rows = Tools.content_lines("/home/dreamer/documents/code/eliminate/test/test_dict_for_singleWord.txt")
    for row in dict_rows:
        if row == '':
            continue
        fields = row.split(',')
        lookup[fields[1]] = fields[2][2:-2]

    test_rows = Tools.content_lines("/home/dreamer/documents/code/eliminate/test/test.txt")
    out = open(tofile,'wb')
    for row in test_rows[:-1]:
        try:
            tokens = mecab.MeCabClass(row).GetAll()
            out.write(tokens[0]+','+tokens[1]+',')
            has_first = tokens[0] in lookup
            has_second = tokens[1] in lookup
            if has_first and has_second:
                tail = str(lookup[tokens[0]])+','+str(lookup[tokens[1]])+'\n'
            elif has_second:
                # only the first token is unknown
                tail = ','+str(lookup[tokens[1]])+'\n'
            elif has_first:
                # only the second token is unknown
                tail = str(lookup[tokens[0]])+','+'\n'
            else:
                tail = '\n'
            out.write(tail)
        except:
            print(row)
    out.close()
def generateMecabList(allWordFileName, topath): allWordFile = open(allWordFileName, 'r') content = allWordFile.read() lines = content.split('\n') OneFile = open(topath + 'OneWordsAdjacency.txt', 'wb') TwoFile = open(topath + 'TwoWordsAdjacency.txt', 'wb') ThreeFile = open(topath + 'ThreeWordsAdjacency.txt', 'wb') MultiFile = open(topath + 'MultiWordsAdjacency.txt', 'wb') AllFile = open(topath + 'AllWordsAdjacency.txt', 'wb') runCounter = 0 for line in lines: runCounter += 1 if runCounter % 2000 == 0: os.system('clear') print "This is the", runCounter, "th lines" try: lineEle = line.split('\t') except: continue word = lineEle[0] try: wordFreStr = lineEle[1] except: continue wordFre = int(wordFreStr) if wordFre > 0: mec = mecab.MeCabClass(word) wordbox = mec.GetAll() if len(wordbox) == 1: OneFile.write(word) AllFile.write(word) OneFile.write(":") AllFile.write(":") OneFile.write(wordbox[0] + " " + wordFreStr + '\n') AllFile.write(wordbox[0] + " " + wordFreStr + '\n') elif len(wordbox) == 2: TwoFile.write(word) TwoFile.write(":") TwoFile.write(wordbox[0] + ' ' + wordbox[1] + " " + wordFreStr + '\n') AllFile.write(word) AllFile.write(":") AllFile.write(wordbox[0] + ' ' + wordbox[1] + " " + wordFreStr + '\n') elif len(wordbox) == 3: ThreeFile.write(word) ThreeFile.write(":") ThreeFile.write(wordbox[0] + ' ' + wordbox[1] + ' ' + wordbox[2] + " " + wordFreStr + '\n') AllFile.write(word) AllFile.write(":") AllFile.write(wordbox[0] + ' ' + wordbox[1] + ' ' + wordbox[2] + " " + wordFreStr + '\n') elif len(wordbox) > 3: MultiFile.write(word) MultiFile.write(":") AllFile.write(word) AllFile.write(":") for index in range(len(wordbox) - 1): MultiFile.write(wordbox[index] + ' ') AllFile.write(wordbox[index] + ' ') MultiFile.write(wordbox[-1] + " " + wordFreStr) MultiFile.write('\n') AllFile.write(wordbox[-1] + " " + wordFreStr) AllFile.write('\n') OneFile.close() TwoFile.close() ThreeFile.close() MultiFile.close() AllFile.close()
def divideIntoUseAndUnuse(TwoWordsMecabList, ThreeWordsMecabList, topath):
    """Keep only the entries whose part-of-speech pattern is listed as useful.

    The useful patterns come from ``SpeechUseful2.txt`` and
    ``SpeechUseful3.txt`` (read through ``Tools.content_wordList``).  For
    every non-empty line of the two input lists, the text before the first
    ``':'`` is tagged with MeCab's ``GetAll3()``, the tags are joined with
    ``'+'`` and the whole line is copied to
    ``<topath>TwoWordsAdjacency_afterSpeech.txt`` or
    ``<topath>ThreeWordsAdjacency_afterSpeech.txt`` when the pattern is in
    the matching list.  Failures are ignored for the two-word list and
    reported with a traceback for the three-word list.
    """
    two_source = open(TwoWordsMecabList, 'r')
    three_source = open(ThreeWordsMecabList, 'r')
    useful_two = Tools.content_wordList(
        "/home/dreamer/documents/code/database/condition/SpeechUseful2.txt",
        '\t')
    useful_three = Tools.content_wordList(
        "/home/dreamer/documents/code/database/condition/SpeechUseful3.txt",
        '	\t')
    two_target = open(topath + "TwoWordsAdjacency_afterSpeech.txt", 'wb')
    three_target = open(topath + "ThreeWordsAdjacency_afterSpeech.txt", 'wb')

    # Two-word entries: anything that fails to tag is silently dropped.
    for entry in two_source.read().split('\n'):
        if entry == '':
            continue
        tags = mecab.MeCabClass(entry.split(':')[0]).GetAll3()
        try:
            pattern = tags[0]
            for extra in tags[1:]:
                pattern += "+" + extra
            if pattern in useful_two:
                two_target.write(entry + '\n')
        except:
            pass

    # Three-word entries: failures are reported before moving on.
    for entry in three_source.read().split('\n'):
        if entry == '':
            continue
        tags = mecab.MeCabClass(entry.split(':')[0]).GetAll3()
        try:
            pattern = tags[0]
            for extra in tags[1:]:
                pattern += "+" + extra
            if pattern in useful_three:
                three_target.write(entry + '\n')
        except:
            traceback.print_exc()
            continue

    two_source.close()
    three_source.close()
    two_target.close()
    three_target.close()
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Manual smoke test for the mecab wrapper: analyse one sample phrase and
# print what PrintAll() returns for it.
import mecab

text = "お隣中国"
# mecab.MeCabClassForText(text) is the alternative entry point.
test = mecab.MeCabClass(text)
print(test.PrintAll())

# Other calls that can be tried on ``test``:
#   1. printing helpers
#        print test.PrintNoun2AndMark()
#        print test.PrintNoun2()
#   2. getters (return a list), e.g. counting the nodes from GetAdj():
#        VerbNodeList = test.GetAdj()
#        mydict = dict()
#        for node in VerbNodeList:
#            if node not in mydict:
#                mydict[node] = 1
#            else:
#                mydict[node] += 1
#      test.GetAll3() can be iterated the same way.
def GetConnPro(): global TwoCooccurFile global ThreeCooccurFile TwoWordList = Tools.content_wordList(TwoCooccurFile, ':') TwoWordCntlist = Tools.content_scoreList(TwoCooccurFile, [':', ' ']) # print len(AllWordList),len(AllWordCntlist),len(TwoWordList),len(TwoWordCntlist) # print AllWordList[-1],AllWordCntlist[-1],TwoWordList[-1],TwoWordCntlist[-1] TwoWordCnt = 0 for Cnt in TwoWordCntlist: TwoWordCnt += Cnt OneWordSpeechTuple = {} TwoWordSpeechTuple = {} # print key for n in range(len(TwoWordList)): thisWordMec = mecab.MeCabClass(TwoWordList[n]) thisSpeechs = thisWordMec.GetAll3() thisSpeech = thisSpeechs[0] + "+" + thisSpeechs[1] if thisSpeechs[0] in OneWordSpeechTuple: OneWordSpeechTuple[thisSpeechs[0]] += TwoWordCntlist[n] else: OneWordSpeechTuple[thisSpeechs[0]] = TwoWordCntlist[n] if thisSpeechs[1] in OneWordSpeechTuple: OneWordSpeechTuple[thisSpeechs[1]] += TwoWordCntlist[n] else: OneWordSpeechTuple[thisSpeechs[1]] = TwoWordCntlist[n] if thisSpeech in TwoWordSpeechTuple: TwoWordSpeechTuple[thisSpeech] += TwoWordCntlist[n] else: TwoWordSpeechTuple[thisSpeech] = TwoWordCntlist[n] # for key in TwoWordSpeechTuple.keys(): # # TwoWordSpeechTuple[key] = float(TwoWordSpeechTuple[key])/float(TwoWordCnt) # print key,TwoWordSpeechTuple[key] # for key in OneWordSpeechTuple.keys(): # # OneWordSpeechTuple[key] = float(OneWordSpeechTuple[key])/float(TwoWordCnt*2) # print key,OneWordSpeechTuple[key] TwoWordConnTuple = {} for key in TwoWordSpeechTuple.keys(): try: a = key.split('+')[0] b = key.split('+')[1] except: print key try: c1 = OneWordSpeechTuple[a] c2 = OneWordSpeechTuple[b] except: return c12 = TwoWordSpeechTuple[key] p = float(c2) / float(TwoWordCnt * 2) p1 = float(c12) / float(c1) p2 = float((c2 - c12)) / float((TwoWordCnt * 2 - c1)) try: score = math.log(binomial(c12,c1,p)) + math.log(binomial(c2-c12,TwoWordCnt*2-c1,p)) \ - math.log(binomial(c12,c1,p1)) - math.log(binomial(c2-c12,TwoWordCnt*2-c1,p2)) # TwoWordConnTuple[key].write(wholeWord+'\t:'+subword1+' 
'+str(c1)+'\t:'+subword2+' '+str(c2)+'\t:'+str(c12)+'\t:'+str(score)+'\n') TwoWordConnTuple[key] = score except: # traceback.print_exc() # TwoScoreRecord.write(wholeWord+'\t:'+subword1+' '+str(c1)+'\t:'+subword2+' '+str(c2)+'\t:'+str(c12)+'\t:'+str(-9999)+'\n') TwoWordConnTuple[key] = -9999 TwoWordConnList = Tools.TupleSort(TwoWordConnTuple) Speech = [] SpeechCnt = [] Cnt = 0 for ele in TwoWordConnList: Cnt += 1 print Cnt, ':', ele[0], ele[1], TwoWordSpeechTuple[ele[0]] Speech.append(Cnt) SpeechCnt.append(ele[1]) Tools.plot(Speech, SpeechCnt)