def build(master_dict, path, l, fh):
    """Append one CSV row of bigram counts for the file at *path* to *fh*.

    The bigram vocabulary is the sorted keys of *master_dict*; the row has
    the form ``<l>,<count>,<count>,...`` with one count column per
    vocabulary bigram (0 when the bigram does not occur in the file).

    Args:
        master_dict: mapping (or iterable) whose keys define the bigram
            vocabulary; only its keys are used.
        path: path of the text file to read; tokenized by whitespace split.
        l: label written as the first CSV field.
        fh: open, writable file handle the row is appended to.
    """
    # Fixed, deterministic column order: sorted vocabulary keys.
    vocabulary = sorted(master_dict)

    # `with` guarantees the input handle is closed even if reading fails
    # (the original leaked the handle on exception).
    with open(path, "r") as fhl:
        words = fhl.read().split()
    counts = FreqDist(bigrams(words))

    # One count per vocabulary bigram; .get(..., 0) replaces the
    # Python-2-only has_key()/iterkeys()/itervalues() dance.
    values = [str(counts.get(key, 0)) for key in vocabulary]

    fh.write(l + "," + ",".join(values))
    fh.write('\n')
class termBased(AbstractGenerativeModel):
    """Unigram (term-based) generative model.

    Requires an *analyser* callable that breaks a document into tokens;
    the analyser may also remove stopwords and normalize words.
    """

    def __init__(self, analyser):
        self.analyser = analyser
        print("Term Based ")

    def generateProbabilityDistribution(self, document_list):
        """Build the token frequency distribution over all documents."""
        tokens = []
        for doc in document_list:
            tokens += self.analyser(doc)
        self.freqDist = FreqDist(tokens)

    def getProbabilityDistribution(self):
        """Return the FreqDist built by generateProbabilityDistribution."""
        return self.freqDist

    def probOfDocument(self, document):
        """Product of relative frequencies of the document's known tokens.

        Returns 0.0 when no token of *document* appears in the training
        distribution (detected by the running product still being 1.0).
        """
        tokens = self.analyser(document)
        prob = 1.0
        for token in tokens:
            # `in` replaces the Python-2-only dict.has_key().
            if token in self.freqDist:
                prob *= self.freqDist.freq(token)
        if prob == 1.0:
            return 0.0
        return prob
def featureList(corpus):
    """Build a bag-of-words count matrix for a sequence of posts.

    For each post, produces a row of length ``noFeat`` (module-level)
    whose i-th entry is the frequency in the tokenized post of the i-th
    key of ``trainKeys`` (module-level).

    Args:
        corpus: iterable of raw text posts.

    Returns:
        list[list[int]]: one count row per post.
    """
    featList = []
    for post in corpus:
        listItem = [0] * noFeat
        # Token frequencies for this post (dead FreqDist() pre-assignment
        # from the original removed).
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))
        for i, key in enumerate(trainKeys):
            # .get(key, 0) replaces the Python-2-only has_key() check.
            listItem[i] = fileFreqDist.get(key, 0)
        featList.append(listItem)
    return featList
def featureList(corpus):
    """Return a per-post term-frequency feature matrix.

    Each post in *corpus* is tokenized and mapped to a row of length
    ``noFeat`` (module-level); column i holds the count of the i-th key
    in ``trainKeys`` (module-level), 0 when absent.

    Args:
        corpus: iterable of raw text posts.

    Returns:
        list[list[int]]: one frequency row per post.
    """
    featList = []
    for post in corpus:
        listItem = [0] * noFeat
        # Count tokens of this post; the original's throwaway
        # `FreqDist()` assignment is dropped.
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))
        for i, key in enumerate(trainKeys):
            # has_key() was removed in Python 3; .get with default is
            # the equivalent one-step lookup.
            listItem[i] = fileFreqDist.get(key, 0)
        featList.append(listItem)
    return featList
def featureList(corpus):
    """Build a term-frequency feature matrix over an NLTK corpus reader.

    For each file id in *corpus*, produces a row of length ``noFeat``
    (module-level) whose i-th entry is the frequency of the i-th key of
    ``trainKeys`` (module-level) among that file's words.

    Args:
        corpus: NLTK corpus reader exposing fileids() and words(fileid).

    Returns:
        list[list[int]]: one count row per corpus file.
    """
    featList = []
    for trFile in corpus.fileids():
        listItem = [0] * noFeat
        # Word frequencies for this file (dead FreqDist() pre-assignment
        # removed).
        fileFreqDist = nltk.FreqDist(corpus.words(trFile))
        for i, key in enumerate(trainKeys):
            # .get(key, 0) replaces the Python-2-only has_key() check.
            listItem[i] = fileFreqDist.get(key, 0)
        featList.append(listItem)
    return featList
def createFeatures(sentVect, ordList):
    """Build a term-frequency feature matrix for a list of sentences.

    Args:
        sentVect: iterable of raw text posts/sentences.
        ordList: ordered feature vocabulary; column i of every row counts
            occurrences of ordList[i].

    Returns:
        list[list[int]]: one frequency row per input post.
    """
    noFeat = len(ordList)
    featList = []
    for post in sentVect:
        listItem = [0] * noFeat
        # Token frequencies for this post (the original's throwaway
        # `FreqDist()` assignment is removed).
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))
        for i, key in enumerate(ordList):
            # has_key() no longer exists in Python 3; .get with default
            # performs the membership test and lookup in one step.
            listItem[i] = fileFreqDist.get(key, 0)
        featList.append(listItem)
    return featList
def createFeatures(sentVect, ordList):
    """Map each post to its term-count vector over the given vocabulary.

    Args:
        sentVect: iterable of raw text posts/sentences.
        ordList: ordered vocabulary; row column i is the count of
            ordList[i] in the tokenized post (0 when absent).

    Returns:
        list[list[int]]: one count row per post.
    """
    noFeat = len(ordList)
    featList = []
    for post in sentVect:
        listItem = [0] * noFeat
        # Count this post's tokens; dead FreqDist() pre-assignment dropped.
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))
        for i, key in enumerate(ordList):
            # Python-2-only has_key() replaced by dict-style .get default.
            listItem[i] = fileFreqDist.get(key, 0)
        featList.append(listItem)
    return featList