Ejemplo n.º 1
0
# NOTE(review): the original comment here was cut off ("compare the output of
# our algorithm with ..."); presumably `sentences` (split from `cleantext`) is
# the reference the segmenter's output is compared against -- confirm.
# str.split('.') yields one extra piece after the final period; [:-1] drops it.
# If the text does not end with a period, that dropped piece is a real
# sentence -- TODO confirm the input always ends with '.'.
sentences = cleantext.split('.')[:-1]

# Strip every character that is not an ASCII letter, an apostrophe or a
# period. This removes spaces, but also digits and all other punctuation.
nospaces = re.sub('[^a-zA-Z\'.]', '', text)
# Drop the piece after the last period, exactly as done for `sentences`.
nospace_sentences = nospaces.split('.')[:-1]

# Counters for correctly segmented sentences, one per scoring approach
# (naive vs. transition probabilities, going by the names). They are only
# initialised in the visible part of the script.
tallyNaiveProb = 0
tallyTransProb = 0

# Load the frequency tables once, outside the loop, from "alphanumeric.txt".
# Each helper call is unpacked as a (dictionary, normalisation factor) pair.
(freq_dict, normFactor) = helpers.getFreq("alphanumeric.txt")
(transition_freq_dict, transNormFactor) = helpers.getTransitionFreq("alphanumeric.txt")

# Iterate over the space-stripped sentences; `idx` is the sentence's position
# (not used in the visible part of the loop body).
for (idx, sentence) in enumerate(nospace_sentences):
	# default max word length as 15 (second argument to NoSpaceText)
	mytext = models.NoSpaceText(sentence, 15)

	# Share the preloaded frequency tables with this instance by assigning
	# them directly to its attributes.
	mytext.freq_dict = freq_dict
	mytext.normFactor = normFactor
	mytext.transition_freq_dict = transition_freq_dict
	mytext.transNormFactor = transNormFactor

	# find segmentation using naive frequencies, then read back the result
	mytext.dpGreedy()
	bestSeg = mytext.getBestSeg()
Ejemplo n.º 2
0
	def initalizeFrequencies(self, basetext = "alphanumeric.txt"):
		"""Load both frequency tables from ``basetext`` and store them on this instance.

		The original signature omitted ``self``, so a bound call such as
		``obj.initalizeFrequencies()`` bound the instance to ``basetext`` and
		then failed with ``NameError`` on ``self``. Declaring ``self``
		explicitly makes the method callable as intended.

		Args:
			basetext: Value forwarded unchanged to ``helpers.getFreq`` and
				``helpers.getTransitionFreq`` (presumably the path of the
				corpus file -- confirm against ``helpers``).

		Returns:
			The 4-tuple ``(freq_dict, normFactor, transition_freq_dict,
			transNormFactor)`` that was just stored on the instance.
		"""
		# Each helper returns a (dictionary, normalisation factor) pair.
		(self.freq_dict, self.normFactor) = helpers.getFreq(basetext)
		(self.transition_freq_dict, self.transNormFactor) = helpers.getTransitionFreq(basetext)
		return (self.freq_dict, self.normFactor, self.transition_freq_dict, self.transNormFactor)