from collections import OrderedDict

from nltk import FreqDist
from nltk.util import bigrams


def build(master_dict, path, l, fh):
    # Initialise every master bigram with a zero count, preserving order.
    m_dict = OrderedDict((item, 0) for item in sorted(master_dict))

    # Count the bigrams that actually occur in the file at `path`.
    with open(path, "r") as fhl:
        temp = FreqDist(bigrams(fhl.read().split()))

    # FreqDist behaves like a Counter: missing keys count as 0.
    for key in m_dict:
        m_dict[key] = temp[key]

    # Write one CSV row: the label `l` followed by the ordered counts.
    values = [str(val) for val in m_dict.values()]
    fh.write(l + "," + ",".join(values))
    fh.write('\n')
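A minimal driver sketch for build(), assuming the caller first pools the bigrams of every input file into master_dict and opens one output CSV handle; the file paths and labels below are hypothetical.

files = [("pos", "reviews/pos01.txt"), ("neg", "reviews/neg01.txt")]

# Pool the bigram vocabulary of all files (hypothetical paths above).
master_dict = set()
for _, path in files:
    with open(path) as f:
        master_dict.update(bigrams(f.read().split()))

# One CSV row per file: label first, then counts in sorted-bigram order.
with open("features.csv", "w") as out:
    for label, path in files:
        build(master_dict, path, label, out)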
Example #2
from nltk import FreqDist


class termBased(AbstractGenerativeModel):
    """Unigram language model.

    The caller must pass an analyser that breaks a document into
    tokens; the analyser may also remove stopwords and normalise
    words. AbstractGenerativeModel is defined elsewhere in the
    project.
    """

    def __init__(self, analyser):
        self.analyser = analyser
        print("Term Based")

    def generateProbabilityDistribution(self, document_list):
        # Pool the tokens of all documents into one frequency distribution.
        tokens = []
        for doc in document_list:
            tokens += self.analyser(doc)
        self.freqDist = FreqDist(tokens)

    def getProbabilityDistribution(self):
        return self.freqDist

    def probOfDocument(self, document):
        # Multiply the relative frequencies of the tokens seen in
        # training; tokens outside the distribution are skipped.
        tokens = self.analyser(document)
        prob = 1.0
        for token in tokens:
            if token in self.freqDist:
                prob *= self.freqDist.freq(token)
        if prob == 1.0:  # no token matched the training distribution
            return 0.0
        return prob
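A short usage sketch for termBased, assuming AbstractGenerativeModel is importable from the project; the whitespace analyser here is a hypothetical stand-in for the project's real tokeniser.

# Hypothetical analyser: lowercase, then split on whitespace.
def analyser(document):
    return document.lower().split()

model = termBased(analyser)
model.generateProbabilityDistribution([
    "the cat sat on the mat",
    "the dog sat on the log",
])
# Product of relative frequencies of the tokens seen in training.
print(model.probOfDocument("the cat sat"))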
Example #3
import nltk

def featureList(corpus):
    # noFeat and trainKeys are module-level globals defined by the
    # surrounding project: the feature count and the ordered feature keys.
    featList = []
    for post in corpus:
        # Word frequencies for this post.
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))

        # One count per training key; keys absent from the post stay 0.
        listItem = [0] * noFeat
        for i, key in enumerate(trainKeys):
            if key in fileFreqDist:
                listItem[i] = fileFreqDist[key]

        featList.append(listItem)

    return featList
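featureList reads noFeat and trainKeys from the enclosing module, so the caller must bind them first. One plausible setup, with illustrative training posts, takes the most frequent tokens of the training data as the feature keys:

# Illustrative setup for the globals featureList expects.
train_posts = ["the cat sat on the mat", "the dog barked at the cat"]

trainDist = nltk.FreqDist(
    token for post in train_posts for token in nltk.word_tokenize(post)
)
trainKeys = [word for word, _ in trainDist.most_common(50)]
noFeat = len(trainKeys)

features = featureList(train_posts)  # one count vector per post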
Example #4
    def featureList(corpus):
        # As above, but defined at class scope in its source project;
        # noFeat and trainKeys are again globals from that project.
        featList = []
        for post in corpus:
            fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))

            listItem = [0] * noFeat
            for i, key in enumerate(trainKeys):
                if key in fileFreqDist:
                    listItem[i] = fileFreqDist[key]

            featList.append(listItem)

        return featList
Example #5
def featureList(corpus):
    # Variant for an NLTK corpus reader: one count vector per file.
    # noFeat and trainKeys are globals from the surrounding project.
    featList = []
    for trFile in corpus.fileids():
        fileFreqDist = nltk.FreqDist(corpus.words(trFile))

        listItem = [0] * noFeat
        for i, key in enumerate(trainKeys):
            if key in fileFreqDist:
                listItem[i] = fileFreqDist[key]

        featList.append(listItem)

    return featList
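For this corpus-reader variant, a sketch using NLTK's PlaintextCorpusReader over a hypothetical directory of .txt files:

from nltk.corpus import PlaintextCorpusReader

corpus = PlaintextCorpusReader("data/train", r".*\.txt")  # hypothetical path

trainDist = nltk.FreqDist(corpus.words())
trainKeys = [word for word, _ in trainDist.most_common(50)]
noFeat = len(trainKeys)

features = featureList(corpus)  # one count vector per file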
Example #6
def createFeatures(sentVect, ordList):
    # Like featureList, but the feature ordering is passed in
    # explicitly instead of being read from globals.
    noFeat = len(ordList)

    featList = []
    for post in sentVect:
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))

        listItem = [0] * noFeat
        for i, key in enumerate(ordList):
            if key in fileFreqDist:
                listItem[i] = fileFreqDist[key]

        featList.append(listItem)

    return featList
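Because createFeatures takes its feature ordering as an argument, it needs no globals; a minimal call with illustrative data:

sentVect = ["I loved this movie", "I hated this movie"]
ordList = ["loved", "hated", "movie"]

# Rows align with sentVect, columns with ordList:
# [[1, 0, 1], [0, 1, 1]]
print(createFeatures(sentVect, ordList))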
Example #7
def createFeatures(sentVect, ordList):
    # Same bag-of-words featurisation as Example #6.
    noFeat = len(ordList)

    featList = []
    for post in sentVect:
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))

        listItem = [0] * noFeat
        for i, key in enumerate(ordList):
            if key in fileFreqDist:
                listItem[i] = fileFreqDist[key]

        featList.append(listItem)

    return featList