Esempio n. 1
0
    def load(self, g, dictpath='/home/pizza/proj/spill-chick/data/cmudict/cmudict.0.7a'):
        """Load the CMU pronouncing dictionary and build phonetic indexes.

        Populates:
          self.words -- set of dictionary words that survive the frequency cut
          self.word  -- word -> list of simplified phonetic codes
          self.phon  -- simplified phonetic code -> list of token lists

        g        -- n-gram frequency provider; g.freqs(word) returns a corpus count
        dictpath -- path to the cmudict file; a '.gz' sibling is extracted on demand
                    (new keyword with the old hard-coded value as default, so
                    existing callers are unaffected)
        """
        # Extract the gzipped dictionary if the plain file is missing.
        if not os.path.exists(dictpath):
            with gzip.open(dictpath + '.gz', 'rb') as src:
                with open(dictpath, 'wb') as dst:
                    dst.write(src.read())
        # Compile once: these patterns used to be re-built on every one of
        # ~130,000 loop iterations.
        first_sound = re.compile(r'(\S)(\S+)')
        leading_vowel = re.compile(r'^[aei]')
        # TODO: loading this ~130,000 line dictionary in python represents the majority
        # of the program's initialization time. move it over to C.
        with open(dictpath, 'r', encoding='utf-8') as f:
            # Text-mode open with explicit encoding replaces the old Python 2
            # `line.decode('utf8')`, which raises AttributeError on Python 3.
            for line in f:
                if line.startswith(';;;'):  # cmudict comment line
                    continue
                line = line.strip().lower()
                # cmudict separates the word from its phonemes with two spaces.
                word, phon = line.split('  ')
                # Skip any words that do not appear in our ngrams.
                # This makes a significant difference when trying to reconstruct
                # phrases phonetically; small decreases in terms have large
                # decreases in products. Note: you may think that every word in a
                # dictionary would appear at least once in a large corpus, but we
                # truncate corpus n-grams at a certain minimum frequency which may
                # exclude very obscure words from ultimately appearing at all.
                #
                # TODO: what i really should do is eliminate all words that appear less
                # than some statistically significant time; the vast majority of the
                # phonetic phrases I currently try are filled with short obscure words
                # and are a complete waste
                # FIXME: instead of hard-coding frequency, calculate statistically
                if word.count("'") == 0 and g.freqs(word) < 500:
                    continue
                # Implement a very rough phonic fuzzy-matching.
                # Phonic codes consist of a list of sounds such as:
                #     REVIEW  R IY2 V Y UW1
                # we simplify this to
                #     REVIEW  R I V Y U
                # This allows words with close but imperfectly sounding matches to
                # be identified. For example:
                #     REVUE   R IH0 V Y UW1
                #     REVIEW  R IY2 V Y UW1
                # is close but not a perfect match. After the regex:
                #     REVUE  R I V Y U
                #     REVIEW R I V Y U
                phon = first_sound.sub(r'\1', phon)
                # now merge leading vowels except 'o' and 'u'
                if len(phon) > 1:
                    phon = leading_vowel.sub('*', phon)
                self.words.add(word)
                self.word[word].append(phon)
                toks = tokenize(word)
                self.phon[phon].append(toks)
Esempio n. 2
0
	def load(self, g, dictpath='/home/pizza/proj/spill-chick/data/cmudict/cmudict.0.7a'):
		"""Load the CMU pronouncing dictionary and build phonetic indexes.

		Populates:
		  self.words -- set of dictionary words that survive the frequency cut
		  self.word  -- word -> list of simplified phonetic codes
		  self.phon  -- simplified phonetic code -> list of token lists

		g        -- n-gram frequency provider; g.freqs(word) returns a corpus count
		dictpath -- path to the cmudict file; a '.gz' sibling is extracted on demand
		            (new keyword with the old hard-coded value as default, so
		            existing callers are unaffected)
		"""
		# Extract the gzipped dictionary if the plain file is missing.
		if not os.path.exists(dictpath):
			with gzip.open(dictpath + '.gz', 'rb') as src:
				with open(dictpath, 'wb') as dst:
					dst.write(src.read())
		# Compile once: these patterns used to be re-built on every one of
		# ~130,000 loop iterations.
		first_sound = re.compile(r'(\S)(\S+)')
		leading_vowel = re.compile(r'^[aei]')
		# TODO: loading this ~130,000 line dictionary in python represents the majority
		# of the program's initialization time. move it over to C.
		with open(dictpath, 'r', encoding='utf-8') as f:
			# Text-mode open with explicit encoding replaces the old Python 2
			# `line.decode('utf8')`, which raises AttributeError on Python 3.
			for line in f:
				if line.startswith(';;;'):  # cmudict comment line
					continue
				line = line.strip().lower()
				# cmudict separates the word from its phonemes with two spaces.
				word, phon = line.split('  ')
				# Skip any words that do not appear in our ngrams.
				# This makes a significant difference when trying to reconstruct
				# phrases phonetically; small decreases in terms have large
				# decreases in products. Note: you may think that every word in a
				# dictionary would appear at least once in a large corpus, but we
				# truncate corpus n-grams at a certain minimum frequency which may
				# exclude very obscure words from ultimately appearing at all.
				#
				# TODO: what i really should do is eliminate all words that appear less
				# than some statistically significant time; the vast majority of the
				# phonetic phrases I currently try are filled with short obscure words
				# and are a complete waste
				# FIXME: instead of hard-coding frequency, calculate statistically
				if word.count("'") == 0 and g.freqs(word) < 500:
					continue
				# Implement a very rough phonic fuzzy-matching.
				# Phonic codes consist of a list of sounds such as:
				#     REVIEW  R IY2 V Y UW1
				# we simplify this to
				#     REVIEW  R I V Y U
				# This allows words with close but imperfectly sounding matches to
				# be identified. For example:
				#     REVUE   R IH0 V Y UW1
				#     REVIEW  R IY2 V Y UW1
				# is close but not a perfect match. After the regex:
				#     REVUE  R I V Y U
				#     REVIEW R I V Y U
				phon = first_sound.sub(r'\1', phon)
				# now merge leading vowels except 'o' and 'u'
				if len(phon) > 1:
					phon = leading_vowel.sub('*', phon)
				self.words.add(word)
				self.word[word].append(phon)
				toks = tokenize(word)
				self.phon[phon].append(toks)
Esempio n. 3
0
	def tokenize(self, f):
		"""Tokenize every line of *f*, recording where each token occurs.

		Fills self.lines with the raw lines and self.tok with one list per
		line of (token, line_number, token_index, column) tuples. Columns
		are located in the lowercased line, so gram.tokenize is assumed to
		return lowercase tokens — TODO confirm against gram.tokenize.
		"""
		self.lines = []
		self.tok = []
		for lineno, raw in enumerate(f):
			self.lines.append(raw)
			lowered = raw.lower()  # used for the position search below
			tokens = gram.tokenize(lowered)
			# discard a trailing newline token, if the tokenizer produced one
			if tokens and tokens[-1] == '\n':
				tokens.pop()
			#self.docwords.update(toks) # add words to local dictionary
			entries = []
			search_from = 0
			for idx, tok in enumerate(tokens):
				col = lowered.index(tok, search_from)
				entries.append((tok, lineno, idx, col))
				search_from = col + len(tok)
			self.tok.append(entries)
Esempio n. 4
0
 def tokenize(self, f):
     """Tokenize every line of *f*, recording where each token occurs.

     Fills self.lines with the raw lines and self.tok with one list per
     line of (token, line_number, token_index, column) tuples. Columns
     are located in the lowercased line, so gram.tokenize is assumed to
     return lowercase tokens — TODO confirm against gram.tokenize.
     """
     self.lines = []
     self.tok = []
     for lineno, raw in enumerate(f):
         self.lines.append(raw)
         lowered = raw.lower()  # used for the position search below
         tokens = gram.tokenize(lowered)
         # discard a trailing newline token, if the tokenizer produced one
         if tokens and tokens[-1] == '\n':
             tokens.pop()
         #self.docwords.update(toks) # add words to local dictionary
         entries = []
         search_from = 0
         for idx, tok in enumerate(tokens):
             col = lowered.index(tok, search_from)
             entries.append((tok, lineno, idx, col))
             search_from = col + len(tok)
         self.tok.append(entries)