def createVocabList():
    """Build the per-class word dictionaries from the labelled training mails.

    Reads the module globals ``mailLists`` (mail texts) and ``mailIndex``
    (labels at the same positions).  A mail labelled ``1`` feeds ``ordDict``,
    a mail labelled ``0`` feeds ``spamDict``; any other label is ignored.
    For every mail the results of ``GetNoun2()`` and ``GetAdj()`` of
    ``mecab.MeCabClass`` are passed one by one to ``addWordToDict``.

    Side effects:
        Resets and then updates the globals ``spamNum`` / ``ordNum`` (number
        of words seen per class) and ``spamPro`` / ``ordPro`` (each class's
        share of all counted words).  When no word was counted at all the
        two shares stay ``0`` instead of raising ``ZeroDivisionError``.

    Returns:
        tuple: ``(spamDict, ordDict)``.
    """
    global mailLists
    global mailIndex
    global spamPro
    global ordPro
    global spamNum
    global ordNum
    spamPro = 0
    ordPro = 0
    spamNum = 0
    ordNum = 0
    spamDict = {}
    ordDict = {}

    for n, mail in enumerate(mailLists):
        wordInMail = mecab.MeCabClass(mail)
        NounWords = wordInMail.GetNoun2()
        AdjWords = wordInMail.GetAdj()
        label = mailIndex[n]
        if label == 1:
            for words in (NounWords, AdjWords):
                for word in words:
                    ordNum += 1
                    addWordToDict(word, ordDict)
        elif label == 0:
            for words in (NounWords, AdjWords):
                for word in words:
                    spamNum += 1
                    addWordToDict(word, spamDict)

    total = spamNum + ordNum
    if total:
        # Convert BEFORE dividing: under Python 2 ``float(a / b)`` performs
        # integer division first and can only yield 0.0 or 1.0.
        spamPro = float(spamNum) / total
        ordPro = float(ordNum) / total
    return spamDict, ordDict
def speechStats1():
    """Rank part-of-speech patterns by how many words carry them, then plot.

    The words come from ``Tools.content_wordList(ThreeCooccurFile, ':')``.
    For each word the tags returned by ``MeCabClass(word).GetAll3()`` are
    joined with ``'+'`` into one pattern string and the word is filed under
    that pattern.  The per-pattern word counts are ordered by
    ``Tools.TupleSort``, printed as ``rank : pattern :<TAB>count`` and handed
    to ``Tools.plot`` (x = rank, y = count).

    Words for which no tags are returned are skipped.  (Previously such a
    word was silently filed under the pattern of the preceding word, or
    raised ``NameError`` if it was the first word.)
    """
    global ThreeCooccurFile
    wordList = Tools.content_wordList(ThreeCooccurFile, ':')

    wordsBySpeech = {}
    for word in wordList:
        # assumes GetAll3() yields a sequence of tag strings - TODO confirm
        tags = mecab.MeCabClass(word).GetAll3()
        if not tags:
            continue
        wordsBySpeech.setdefault("+".join(tags), []).append(word)

    speechCounts = dict(
        (pattern, len(words)) for pattern, words in wordsBySpeech.items())
    sortedList = Tools.TupleSort(speechCounts)

    ranks = []
    counts = []
    for rank, ele in enumerate(sortedList, 1):
        # Same text as the former multi-item print statement.
        print("%s : %s :\t%s" % (rank, ele[0], ele[1]))
        ranks.append(rank)
        counts.append(ele[1])

    Tools.plot(ranks, counts)
# Example #3
# 0
def DuplicateRemoval(filename):
    """Merge identical lines of a word list and write the summed counts.

    Every non-empty line of ``filename`` is expected to look like
    ``<text>:<... count>``: the count is the last space-separated token after
    the first ``':'``.  Identical lines are merged and their counts added.
    The merged entries are ordered with ``Tools.TupleSort`` and written to
    ``filename[:-4] + "_DuplicateRemoval.txt"`` as
    ``<line>:<tag> <tag> ... <total>``, where the tags come from
    ``MeCabClass(<line>).GetAll()``.

    Lines without a parsable count are reported with a traceback and
    skipped; blank lines are skipped silently.

    NOTE(review): the merge key is the whole input line (the ``split(':')[0]``
    variant was commented out in the original) - confirm this is intended.
    """
    with open(filename, 'r') as fopen:
        oneWordList = fopen.read().split('\n')

    wordTuple = {}
    for oneWord in oneWordList:
        if not oneWord:
            # e.g. the empty string after the final newline
            continue
        try:
            count = int(oneWord.split(':')[1].split(' ')[-1])
        except (IndexError, ValueError):
            # print_exc() reports the active exception; print_exception()
            # without arguments raised TypeError and aborted the whole run.
            traceback.print_exc()
            continue
        wordTuple[oneWord] = wordTuple.get(oneWord, 0) + count

    newfilename = filename[:-4] + "_DuplicateRemoval.txt"
    with open(newfilename, 'wb') as newfile:
        for word in Tools.TupleSort(wordTuple):
            thisSpeechs = mecab.MeCabClass(word[0]).GetAll()
            newfile.write(word[0] + ':')
            for subword in thisSpeechs:
                newfile.write(subword + ' ')
            newfile.write(str(wordTuple[word[0]]) + '\n')
 def wordCounter(self):
     """Return the total length of the ``GetNoun2()`` results of all files.

     ``self.counter`` doubles as a cache: when it is non-zero it is returned
     unchanged.  Otherwise every path in ``self.fileLists`` is read, analysed
     with ``mecab.MeCabClass`` and the length of ``GetNoun2()`` is added to
     ``self.counter``.  Each file is closed as soon as it has been read.
     """
     if self.counter != 0:
         return self.counter
     for path in self.fileLists:
         with open(path, 'r') as fopen:
             content = fopen.read()
         self.counter += len(mecab.MeCabClass(content).GetNoun2())
     return self.counter
# Example #5
# 0
def speech(filename, newfile, threshold):
    """Group words by part-of-speech pattern and dump the frequent patterns.

    Args:
        filename: input list, read through ``Tools.content_wordList`` (words)
            and ``Tools.content_scoreList`` (frequency at the same index).
        newfile: path of the report file to write (opened with ``'wb'``).
        threshold: only patterns whose summed frequency is strictly greater
            than this value are written.

    For each word the tags from ``MeCabClass(word).GetAll3()`` are joined
    with ``'+'`` into a pattern.  For every reported pattern the file gets a
    ``pattern:frequency:`` line followed by its words.  When the summed
    frequency exceeds 300, 300 words are drawn at random (with replacement)
    from that pattern's own word list; otherwise all of its words are
    written.

    Fixes over the previous version: the random index used to be drawn from
    the number of patterns rather than from the pattern's word list (with
    the resulting ``IndexError`` swallowed); words without tags used to be
    filed under the previous word's pattern.  Such words are now skipped.

    Finally prints the number of distinct patterns found.
    """
    oneWordList = Tools.content_wordList(filename, ':')
    wordListFre = Tools.content_scoreList(filename, [':', ' '])

    wordsBySpeech = {}
    freqBySpeech = {}
    for n, thisWord in enumerate(oneWordList):
        # assumes GetAll3() yields a sequence of tag strings - TODO confirm
        tags = mecab.MeCabClass(thisWord).GetAll3()
        if not tags:
            continue
        pattern = "+".join(tags)
        wordsBySpeech.setdefault(pattern, []).append(thisWord)
        freqBySpeech[pattern] = freqBySpeech.get(pattern, 0) + wordListFre[n]

    sampleSize = 300
    with open(newfile, 'wb') as report:
        for pattern, words in wordsBySpeech.items():
            total = freqBySpeech[pattern]
            if total > threshold:
                report.write(pattern + ':' + str(total) + ':\n')
                if total > sampleSize:
                    for _ in range(sampleSize):
                        report.write(random.choice(words) + ' ')
                else:
                    for word in words:
                        report.write(word + ' ')
                report.write('\n\n')

    print(len(wordsBySpeech))
def test(filename):
    """Count the ``'_'``-separated chunks of a file with more than 3 nouns.

    Each chunk is analysed with ``mecab.MeCabClass``; the module global
    ``count`` is incremented once for every chunk whose ``GetNoun2()`` result
    has more than three entries.  Nothing is returned.  The file is closed
    as soon as it has been read.
    """
    global count
    with open(filename) as fopen:
        content = fopen.read()

    for word in content.split('_'):
        if len(mecab.MeCabClass(word).GetNoun2()) > 3:
            count = count + 1
def chooseUse(filename, recordlist, recordfile):
    """Interactively label the short paragraphs of a file as useful/useless.

    The file content (minus its last 9 characters) is split on ``"<b>○"``;
    the text before the first marker is ignored.  Each remaining piece is
    split on ``"</b>"`` into speaker and content, and ``<br>`` is removed
    from the content.  Pieces whose content is shorter than 300 characters
    are shown to the user, who answers:

    * ``y`` - writes ``1:<content>>`` to ``recordfile``, appends
      ``[length, 1]`` to ``recordlist`` and increments global ``useCount``;
    * ``n`` - writes ``0:<content>>`` to ``recordfile`` and increments global
      ``uselessCount`` (nothing is appended to ``recordlist``;
      NOTE(review): confirm that asymmetry is intended);
    * ``e`` - asks for confirmation; on ``y`` the function returns ``1``,
      otherwise the y/n/e question is asked again for the same paragraph.
      (Previously a declined exit looped on the confirmation forever.)

    Args:
        filename: path of the file to read.
        recordlist: list that receives ``[length, 1]`` for useful paragraphs.
        recordfile: object with a ``write`` method receiving the labels.

    Returns:
        ``1`` when the user confirms the exit, otherwise ``None``.
    """
    global useCount
    global uselessCount
    item = Item()
    record = Record()
    with open(filename, 'r') as fopen:
        content = fopen.read()[:-9]
    paras = content.split("<b>○")

    for para in paras[1:]:
        # para_split[0]: speaker, para_split[1]: what was said
        para_split = para.split("</b>")
        item.speaker = para_split[0]
        item.content = re.sub('<br>', '', para_split[1])
        if len(item.content) >= 300:
            continue

        print(item.content + "	  " + str(len(item.content)))
        print("Useful: %s Useless: %s all is %s \n" % (
            str(useCount), str(uselessCount), str(useCount + uselessCount)))

        answer = raw_input("y/n/e:")
        # The loop is only left through break/return below.
        while True:
            if answer == "y" or answer == "n":
                record.length = len(item.content)
                record.bool = 1 if answer == "y" else 0
                if answer == "y":
                    recordlist.append([record.length, record.bool])
                recordfile.write(str(record.bool))
                recordfile.write(":")
                recordfile.write(item.content)
                recordfile.write(">\n")
                if answer == "y":
                    useCount += 1
                else:
                    uselessCount = uselessCount + 1
                break
            elif answer == 'e':
                print("you really want to exit it?:(y/n)")
                if raw_input("y/n:") == "y":
                    return 1
                # Exit declined: go back to labelling this paragraph.
                answer = raw_input("y/n/e:")
            else:
                print("Please input again:(y/n/e):")
                answer = raw_input("y/n:")
        print("\n\n\n")
# Example #8
# 0
def nounExtractFrom2000(fromFile, toFile):
    """Write the nouns of ``fromFile`` to ``toFile``, ``'_'`` between runs.

    The whole input is analysed with ``mecab.MeCabClass``; ``GetAll2()`` is
    expected to yield entries whose item ``[0]`` is written out and whose
    item ``[1]`` is tested with ``isNoun2``.  Every entry that passes the
    test is written; an underscore follows it when the next entry exists
    and ``isNoun2`` returns ``0`` for it, so consecutive nouns stay glued
    together.

    Args:
        fromFile: path of the text to analyse.
        toFile: path of the output file (opened with ``'wb'``).

    Both files are closed even when the analysis raises.
    """
    with open(fromFile, 'r') as fopen:
        text = fopen.read()

    textMecab = mecab.MeCabClass(text)
    allWordLists = textMecab.GetAll2()
    lastIndex = len(allWordLists) - 1

    with open(toFile, 'wb') as newfile:
        for n, entry in enumerate(allWordLists):
            if not textMecab.isNoun2(entry[1]):
                continue
            newfile.write(entry[0])
            if n != lastIndex and textMecab.isNoun2(allWordLists[n + 1][1]) == 0:
                newfile.write("_")
def speechStats2():
    """Plot the summed word frequency of each part-of-speech pattern.

    Words and their frequencies are read from the global ``TwoCooccurFile``
    through ``Tools.content_wordList`` / ``Tools.content_scoreList``.  For
    each word the tags from ``MeCabClass(word).GetAll3()`` are joined with
    ``'+'`` into a pattern and the word's frequency is added to that
    pattern's total.  The totals are ordered with ``Tools.TupleSort`` and
    plotted with ``Tools.plot`` (x = rank, y = total).

    A word is reported as ``index word`` and skipped when it has no tags or
    when its frequency cannot be added.  (Previously a word without tags was
    silently counted under the preceding word's pattern.)
    """
    global TwoCooccurFile
    wordListFre = Tools.content_scoreList(TwoCooccurFile, [':', ' '])
    wordList = Tools.content_wordList(TwoCooccurFile, ':')

    freqBySpeech = {}
    # NOTE(review): the last entry of wordList is deliberately left out, as
    # in the original loop bound - confirm whether that is an off-by-one.
    for n in range(len(wordList) - 1):
        # assumes GetAll3() yields a sequence of tag strings - TODO confirm
        tags = mecab.MeCabClass(wordList[n]).GetAll3()
        if not tags:
            print("%s %s" % (n, wordList[n]))
            continue
        pattern = "+".join(tags)
        try:
            freqBySpeech[pattern] = freqBySpeech.get(pattern, 0) + wordListFre[n]
        except (IndexError, TypeError):
            print("%s %s" % (n, wordList[n]))

    sortedList = Tools.TupleSort(freqBySpeech)
    ranks = []
    totals = []
    for rank, ele in enumerate(sortedList, 1):
        ranks.append(rank)
        totals.append(ele[1])

    title = "Part_of_speech-frequence distribution"
    Tools.plot(ranks,
               totals,
               title,
               xlabel="Part of speech",
               ylabel="frequence")
 def getWordBox(self):
     """Count every noun of every file into ``self.wordBox`` and return it.

     Each path in ``self.fileLists`` is read and split on ``'>>>'``; for each
     piece only the text after the last ``'::'`` is analysed with
     ``mecab.MeCabClass(...).GetNoun2()``.  The length of every result is
     added to ``self.counter`` and each returned item increments its entry
     in ``self.wordBox``.  After every 200th file the terminal is cleared
     and a progress line is printed.  Each file is closed right after it
     has been read.

     Returns:
         ``self.wordBox`` (the same object, updated in place).
     """
     for runCounter, path in enumerate(self.fileLists, 1):
         if runCounter % 200 == 0:
             os.system('clear')
             print("This is the %s th lines" % runCounter)
         with open(path, 'r') as fopen:
             content = fopen.read()
         for para in content.split('>>>'):
             nounInCon = mecab.MeCabClass(para.split('::')[-1]).GetNoun2()
             self.counter += len(nounInCon)
             for noun in nounInCon:
                 if noun in self.wordBox:
                     self.wordBox[noun] += 1
                 else:
                     self.wordBox[noun] = 1
     return self.wordBox
# Example #11
# 0
def nounExtractFromPretreat(filename, newfilename):
    """Extract the nouns of a pre-treated file, one output line per input line.

    The input is split on ``'>>>'``; from each piece only the text after the
    last ``'::'`` is kept and split into lines.  Every line is analysed with
    ``mecab.MeCabClass``; for each ``GetAll2()`` entry whose item ``[1]``
    passes ``isNoun2`` the item ``[0]`` is written, followed by ``'_'`` when
    a next entry exists and ``isNoun2`` returns ``0`` for it.  A newline is
    written after every analysed line.

    Args:
        filename: path of the pre-treated text.
        newfilename: path of the output file (opened with ``'wb'``).

    Both files are closed even when the analysis raises.
    """
    with open(filename, 'r') as fopen:
        text = fopen.read()

    with open(newfilename, 'wb') as newfile:
        for content in text.split('>>>'):
            speaking = content.split('::')[-1]
            for para in speaking.split('\n'):
                textMecab = mecab.MeCabClass(para)
                allWordLists = textMecab.GetAll2()
                lastIndex = len(allWordLists) - 1
                for n, entry in enumerate(allWordLists):
                    if not textMecab.isNoun2(entry[1]):
                        continue
                    newfile.write(entry[0])
                    if (n != lastIndex and
                            textMecab.isNoun2(allWordLists[n + 1][1]) == 0):
                        newfile.write("_")
                newfile.write("\n")
def isSpamMail(mail, spamDict, ordDict):
    """Decide whether ``mail`` scores higher as spam than as ordinary mail.

    The distinct items of ``GetNoun2()`` and ``GetAdj()`` of
    ``mecab.MeCabClass(mail)`` are collected through ``addWordToDict``.
    For each distinct word the class score is multiplied by
    ``(count / classTotal) ** (10 / numberOfDistinctWords)``, where ``count``
    is the word's entry in the class dictionary (``0.5`` when the word is
    unknown) and ``classTotal`` is the global ``spamNum`` / ``ordNum``.
    The products are finally weighted with the globals ``spamPro`` /
    ``ordPro``.  The number of distinct words and both scores are printed.

    All divisions are carried out in floating point.  With Python 2 integer
    division the exponent collapsed to 0 for mails with more than ten
    distinct words and known-word ratios collapsed to 0.

    Args:
        mail: text of the mail to classify.
        spamDict: word -> count mapping of the spam class.
        ordDict: word -> count mapping of the ordinary class.

    Returns:
        bool: ``True`` when the spam score is strictly greater.

    Raises:
        ZeroDivisionError: when the mail has words and ``spamNum`` or
            ``ordNum`` is zero.
    """
    global spamPro
    global ordPro
    global spamNum
    global ordNum
    wordBox = {}
    wordInMail = mecab.MeCabClass(mail)
    NounWords = wordInMail.GetNoun2()
    AdjWords = wordInMail.GetAdj()
    for noun in NounWords:
        addWordToDict(noun, wordBox)
    for adj in AdjWords:
        addWordToDict(adj, wordBox)

    spam = 1
    ordinary = 1
    print(len(wordBox))
    if wordBox:
        exponent = 10.0 / len(wordBox)
        spamTotal = float(spamNum)
        ordTotal = float(ordNum)
        for key in wordBox:
            spam = spam * (spamDict.get(key, 0.5) / spamTotal) ** exponent
            ordinary = ordinary * (ordDict.get(key, 0.5) / ordTotal) ** exponent

    spam = spamPro * spam
    ordinary = ordPro * ordinary
    # Two spaces after the colon reproduce the former print-statement output.
    print("spam:  %s" % spam)
    print("ordinary:  %s" % ordinary)
    return spam > ordinary
# Example #13
# 0
def main1(tofile):
    """Annotate the two leading tokens of each test line with dictionary values.

    A lookup table is built from the single-word dictionary file: every
    non-empty line is split on ``','``; field 1 is the key and field 2, with
    its first and last two characters removed, is the value.

    For every line of the test file except the last one, the first two items
    of ``MeCabClass(line).GetAll()`` are written to ``tofile`` as
    ``a0,a1,`` followed by:

    * nothing, when neither item is in the table;
    * ``,<v1>`` when only the second is; ``<v0>,`` when only the first is;
    * ``<v0>,<v1>`` when both are.

    A line that cannot be processed is printed and skipped; nothing is
    written for it.

    Args:
        tofile: path of the output file (opened with ``'wb'``).
    """
    lines = Tools.content_lines("/home/dreamer/documents/code/eliminate/test/test_dict_for_singleWord.txt")
    wordDictTuple = {}
    for l in lines:
        if l != '':
            spl = l.split(',')
            wordDictTuple[spl[1]] = spl[2][2:-2]

    lines = Tools.content_lines("/home/dreamer/documents/code/eliminate/test/test.txt")
    with open(tofile, 'wb') as to:
        for line in lines[:-1]:
            try:
                a = mecab.MeCabClass(line).GetAll()
                row = a[0] + ',' + a[1] + ','
                hasFirst = a[0] in wordDictTuple
                hasSecond = a[1] in wordDictTuple
                if hasFirst and hasSecond:
                    row += str(wordDictTuple[a[0]]) + ',' + str(wordDictTuple[a[1]])
                elif hasFirst:
                    row += str(wordDictTuple[a[0]]) + ','
                elif hasSecond:
                    row += ',' + str(wordDictTuple[a[1]])
                row += '\n'
            except Exception:
                # Best effort: report the offending line and carry on, but
                # let KeyboardInterrupt / SystemExit through.
                print(line)
                continue
            to.write(row)
# Example #14
# 0
def generateMecabList(allWordFileName, topath):
    """Split a ``word<TAB>frequency`` list into files by MeCab token count.

    Every line of ``allWordFileName`` is split on tabs; lines with fewer
    than two fields are skipped, as are entries whose frequency is not
    positive.  Each remaining word is analysed with
    ``MeCabClass(word).GetAll()`` and, unless the result is empty, the record
    ``word:<item> <item> ... <frequency>`` is written to
    ``AllWordsAdjacency.txt`` and to one of ``OneWordsAdjacency.txt``,
    ``TwoWordsAdjacency.txt``, ``ThreeWordsAdjacency.txt`` or
    ``MultiWordsAdjacency.txt`` (1, 2, 3 or more items).  All five files are
    created under the prefix ``topath`` even when they stay empty.

    Every 2000 lines the terminal is cleared and a progress line is printed.

    Args:
        allWordFileName: path of the input list.
        topath: prefix prepended to the output file names (include the
            trailing path separator).

    Raises:
        ValueError: when a frequency field is not an integer.
    """
    with open(allWordFileName, 'r') as allWordFile:
        lines = allWordFile.read().split('\n')

    outputs = {}
    try:
        for name in ('One', 'Two', 'Three', 'Multi', 'All'):
            outputs[name] = open(topath + name + 'WordsAdjacency.txt', 'wb')
        bySize = {1: outputs['One'], 2: outputs['Two'], 3: outputs['Three']}

        for runCounter, line in enumerate(lines, 1):
            if runCounter % 2000 == 0:
                os.system('clear')
                print("This is the %s th lines" % runCounter)
            lineEle = line.split('\t')
            if len(lineEle) < 2:
                continue
            word = lineEle[0]
            wordFreStr = lineEle[1]
            if int(wordFreStr) <= 0:
                continue

            wordbox = mecab.MeCabClass(word).GetAll()
            if len(wordbox) == 0:
                continue
            record = word + ":" + " ".join(wordbox) + " " + wordFreStr + '\n'
            bySize.get(len(wordbox), outputs['Multi']).write(record)
            outputs['All'].write(record)
    finally:
        # Close whatever was opened, also when the analysis raised.
        for handle in outputs.values():
            handle.close()
# Example #15
# 0
def _copyUsefulEntries(sourcePath, targetPath, usefulSpeeches):
    """Copy the lines of ``sourcePath`` whose word has a useful pattern."""
    with open(sourcePath, 'r') as source:
        lines = source.read().split('\n')

    with open(targetPath, 'wb') as target:
        for oneWord in lines:
            if oneWord == '':
                continue
            thisWord = oneWord.split(':')[0]
            # assumes GetAll3() yields a sequence of tag strings - TODO confirm
            tags = mecab.MeCabClass(thisWord).GetAll3()
            if not tags:
                continue
            if "+".join(tags) in usefulSpeeches:
                target.write(oneWord + '\n')


def divideIntoUseAndUnuse(TwoWordsMecabList, ThreeWordsMecabList, topath):
    """Keep only the entries whose part-of-speech pattern is listed as useful.

    For each non-empty line of the two input lists, the text before the
    first ``':'`` is analysed with ``MeCabClass(...).GetAll3()`` and its tags
    are joined with ``'+'``.  The line is copied to the output when that
    pattern occurs in the matching useful-pattern list
    (``SpeechUseful2.txt`` / ``SpeechUseful3.txt``, read through
    ``Tools.content_wordList``).  Words without tags are skipped in both
    lists.

    Args:
        TwoWordsMecabList: path of the two-word list.
        ThreeWordsMecabList: path of the three-word list.
        topath: prefix prepended to ``TwoWordsAdjacency_afterSpeech.txt`` and
            ``ThreeWordsAdjacency_afterSpeech.txt``.
    """
    TwousefulSpeechList = Tools.content_wordList(
        "/home/dreamer/documents/code/database/condition/SpeechUseful2.txt",
        '\t')
    # NOTE(review): the separator here is ' \t' while the two-word list uses
    # '\t' - kept as found; confirm against the file formats.
    ThreeusefulSpeechList = Tools.content_wordList(
        "/home/dreamer/documents/code/database/condition/SpeechUseful3.txt",
        ' \t')

    _copyUsefulEntries(TwoWordsMecabList,
                       topath + "TwoWordsAdjacency_afterSpeech.txt",
                       TwousefulSpeechList)
    _copyUsefulEntries(ThreeWordsMecabList,
                       topath + "ThreeWordsAdjacency_afterSpeech.txt",
                       ThreeusefulSpeechList)
# Example #16
# 0
#!/usr/bin/python
# -*- coding: utf-8 -*-

import mecab

# Manual check: analyse a short sample string and print what PrintAll() returns.
text = "お隣中国"
test = mecab.MeCabClass(text)
print(test.PrintAll())

# Other calls that were tried here and are currently disabled:
#   mecab.MeCabClassForText(text)
#   test.GetAll3()               - iterate over the result
#   test.PrintNoun2AndMark()     - "print" style call
#   test.PrintNoun2()            - "print" style call
#   test.GetAdj()                - "get" style call returning a list; its
#                                  items were tallied into a dict, one
#                                  counter per distinct item
def GetConnPro():
    """Score how strongly pairs of part-of-speech tags are connected, then plot.

    Words and frequencies are read from the global ``TwoCooccurFile``.  For
    each word the first two tags of ``MeCabClass(word).GetAll3()`` form the
    pair ``tag1+tag2``; the word's frequency is added to the totals of
    ``tag1``, of ``tag2`` and of the pair.  With ``N`` being twice the sum of
    all frequencies, ``c1`` / ``c2`` the tag totals and ``c12`` the pair
    total, each pair is scored as::

        log b(c12, c1, p)  + log b(c2 - c12, N - c1, p)
      - log b(c12, c1, p1) - log b(c2 - c12, N - c1, p2)

    where ``b`` is ``binomial``, ``p = c2 / N``, ``p1 = c12 / c1`` and
    ``p2 = (c2 - c12) / (N - c1)``.  (This has the shape of a log-likelihood
    ratio test - presumably Dunning's; confirm with the author.)  A pair
    whose score cannot be computed (zero denominator, log of a non-positive
    value, overflow) gets ``-9999``.

    The pairs are ordered with ``Tools.TupleSort``, printed as
    ``rank : pair score pairTotal`` and plotted with ``Tools.plot``.

    Changes over the previous version: words with fewer than two tags are
    skipped instead of raising ``IndexError``; the two tags are remembered
    per pair instead of being re-split from the joined string, so a failed
    lookup no longer aborts the whole function silently; zero denominators
    in ``p1`` / ``p2`` no longer raise.
    """
    global TwoCooccurFile
    TwoWordList = Tools.content_wordList(TwoCooccurFile, ':')
    TwoWordCntlist = Tools.content_scoreList(TwoCooccurFile, [':', ' '])
    doubledTotal = sum(TwoWordCntlist) * 2

    tagTotals = {}
    pairTotals = {}
    pairTags = {}
    for n, word in enumerate(TwoWordList):
        tags = mecab.MeCabClass(word).GetAll3()
        if not tags or len(tags) < 2:
            continue
        first, second = tags[0], tags[1]
        count = TwoWordCntlist[n]
        pair = first + "+" + second
        tagTotals[first] = tagTotals.get(first, 0) + count
        tagTotals[second] = tagTotals.get(second, 0) + count
        pairTotals[pair] = pairTotals.get(pair, 0) + count
        pairTags[pair] = (first, second)

    TwoWordConnTuple = {}
    for pair, c12 in pairTotals.items():
        first, second = pairTags[pair]
        c1 = tagTotals[first]
        c2 = tagTotals[second]
        try:
            p = float(c2) / float(doubledTotal)
            p1 = float(c12) / float(c1)
            p2 = float(c2 - c12) / float(doubledTotal - c1)
            score = (math.log(binomial(c12, c1, p))
                     + math.log(binomial(c2 - c12, doubledTotal - c1, p))
                     - math.log(binomial(c12, c1, p1))
                     - math.log(binomial(c2 - c12, doubledTotal - c1, p2)))
        except (ValueError, ArithmeticError):
            # math domain error, division by zero or overflow
            score = -9999
        TwoWordConnTuple[pair] = score

    TwoWordConnList = Tools.TupleSort(TwoWordConnTuple)
    ranks = []
    scores = []
    for rank, ele in enumerate(TwoWordConnList, 1):
        print("%s : %s %s %s" % (rank, ele[0], ele[1], pairTotals[ele[0]]))
        ranks.append(rank)
        scores.append(ele[1])

    Tools.plot(ranks, scores)