def load(self, g):
    dictpath = '/home/pizza/proj/spill-chick/data/cmudict/cmudict.0.7a'
    # extract the gzipped dictionary if necessary
    if not os.path.exists(dictpath):
        with open(dictpath, 'wb') as dst:
            with gzip.open(dictpath + '.gz', 'rb') as src:
                dst.write(src.read())
    # TODO: loading this ~130,000 line dictionary in python represents the
    # majority of the program's initialization time. move it over to C.
    with open(dictpath, 'r') as f:
        for line in f:
            if line.startswith(';;;'):  # skip cmudict comment lines
                continue
            line = line.decode('utf8')
            line = line.strip().lower()
            # cmudict separates the word from its phonemes with two spaces
            word, phon = line.split('  ')
            """
            skip any words that do not appear in our ngrams. this makes a
            significant difference when trying to reconstruct phrases
            phonetically: a small decrease in the number of terms yields a
            large decrease in the number of candidate products.
            note: you may think that every word in a dictionary would appear
            at least once in a large corpus, but we truncate corpus n-grams
            at a certain minimum frequency, which may exclude very obscure
            words from ultimately appearing at all.
            """
            # TODO: what i really should do is eliminate all words that
            # appear fewer than some statistically significant number of
            # times; the vast majority of the phonetic phrases i currently
            # try are filled with short, obscure words and are a complete
            # waste
            # FIXME: instead of hard-coding the frequency cutoff, calculate
            # it statistically
            if word.count("'") == 0 and g.freqs(word) < 500:
                continue
            """
            implement a very rough phonic fuzzy-matching.
            phonic codes consist of a list of sounds such as:
                REVIEW  R IY2 V Y UW1
            we simplify this to:
                REVIEW  R I V Y U
            this allows words with close but imperfectly matching sounds to
            be identified. for example:
                REVUE   R IH0 V Y UW1
                REVIEW  R IY2 V Y UW1
            is close but not a perfect match. after the regex:
                REVUE   R I V Y U
                REVIEW  R I V Y U
            """
            # keep only the first character of each phoneme
            phon = re.sub(r'(\S)(\S+)', r'\1', phon)
            # now merge leading vowels except 'o' and 'u'
            if len(phon) > 1:
                phon = re.sub('^[aei]', '*', phon)
            self.words.add(word)
            self.word[word].append(phon)
            toks = tokenize(word)
            self.phon[phon].append(toks)
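# the snippet below is a standalone sketch of the phonic simplification step
# in load() above, kept here so the REVUE/REVIEW example can be run directly.
# the helper name simplify_phon() is hypothetical; the regexes are the ones
# used in load().
import re

def simplify_phon(phon):
    """reduce a lowercased cmudict pronunciation to its rough phonic skeleton."""
    # keep only the first character of each multi-character phoneme
    phon = re.sub(r'(\S)(\S+)', r'\1', phon)
    # merge leading vowels except 'o' and 'u'
    if len(phon) > 1:
        phon = re.sub('^[aei]', '*', phon)
    return phon

if __name__ == '__main__':
    # 'revue' and 'review' collapse to the same key despite different vowels
    print(simplify_phon('r ih0 v y uw1'))  # r i v y u
    print(simplify_phon('r iy2 v y uw1'))  # r i v y u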
def tokenize(self, f):
    self.lines = []
    self.tok = []
    for lcnt, line in enumerate(f):
        self.lines.append(line)
        line = line.lower()  # lowercased copy used for index() below
        toks = gram.tokenize(line)
        if toks and toks[-1] == '\n':
            toks.pop()
        #self.docwords.update(toks) # add words to local dictionary
        tpos = 0
        ll = []
        for t in toks:
            # search from the previous offset so repeated tokens on the
            # same line are assigned distinct column positions
            tpos = line.index(t, tpos)
            # (token, line number, token index within the line, column offset)
            ll.append((t, lcnt, len(ll), tpos))
            tpos += len(t)
        self.tok.append(ll)
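# a rough sketch of the offset bookkeeping done by tokenize() above, runnable
# outside the class. gram.tokenize() is not defined here, so a simple regex
# word-splitter stands in for it; that stand-in is an assumption, not the
# real tokenizer.
import re

def tokenize_with_positions(lines, tokenizer=None):
    tokenizer = tokenizer or (lambda s: re.findall(r"[\w']+", s))
    tok = []
    for lcnt, line in enumerate(lines):
        lowered = line.lower()
        tpos = 0
        ll = []
        for t in tokenizer(lowered):
            # advance from the previous offset so repeated tokens on the
            # same line get distinct column positions
            tpos = lowered.index(t, tpos)
            ll.append((t, lcnt, len(ll), tpos))  # (token, line, index, column)
            tpos += len(t)
        tok.append(ll)
    return tok

if __name__ == '__main__':
    print(tokenize_with_positions(['The cat saw the cat.\n']))
    # [[('the', 0, 0, 0), ('cat', 0, 1, 4), ('saw', 0, 2, 8),
    #   ('the', 0, 3, 12), ('cat', 0, 4, 16)]]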