def make_concept_uri(text, lang, disambiguation=None):
    text = ftfy.ftfy(text).strip()
    if disambiguation is None:
        text, disambiguation = handle_disambig(text)
    if disambiguation is not None:
        if isinstance(disambiguation, str):
            disambiguation = disambiguation.decode("utf-8")
        disambiguation = ftfy.ftfy(disambiguation)

    if lang == "en":
        normalized = english.normalize(text)
    elif lang == "ja" and disambiguation is not None:
        match = re.search(r"\((.*?)\)", disambiguation)
        if match:
            parenthesized = match.group(1)
            pos, rest = disambiguation.split("/", 1)
            if parenthesized in JAPANESE_PARTS_OF_SPEECH:
                pos = JAPANESE_PARTS_OF_SPEECH[parenthesized]
            else:
                pos = "n"
            disambiguation = pos + "/" + re.sub(r"\s*\((.*?)\)\s*", "", rest)
        normalized = preprocess_text(text).lower()
    else:
        normalized = preprocess_text(text).lower()

    if disambiguation is not None:
        disambiguation = disambiguation.strip().replace(" ", "_").lower()

    if disambiguation:
        return "/c/%s/%s/%s" % (lang, normalized.replace(" ", "_"), disambiguation)
    else:
        return "/c/%s/%s" % (lang, normalized.replace(" ", "_"))

def make_concept_uri(text, lang, disambiguation=None):
    text = ftfy.ftfy(text)
    if disambiguation is None:
        text, disambiguation = handle_disambig(text)
    if disambiguation is not None:
        if isinstance(disambiguation, str):
            disambiguation = disambiguation.decode('utf-8')
        disambiguation = ftfy.ftfy(disambiguation)

    if lang == 'en':
        normalized = english.normalize(text)
    elif lang == 'ja' and disambiguation is not None:
        match = re.search(r'\((.*?)\)', disambiguation)
        if match:
            parenthesized = match.group(1)
            pos, rest = disambiguation.split('/', 1)
            if parenthesized in JAPANESE_PARTS_OF_SPEECH:
                pos = JAPANESE_PARTS_OF_SPEECH[parenthesized]
            else:
                pos = 'n'
            disambiguation = pos + '/' + re.sub(r'\s*\((.*?)\)\s*', '', rest)
        normalized = preprocess_text(text).lower()
    else:
        normalized = preprocess_text(text).lower()

    if disambiguation is not None:
        disambiguation = disambiguation.replace(' ', '_')

    if disambiguation:
        return '/c/%s/%s/%s' % (lang, normalized.replace(' ', '_'), disambiguation)
    else:
        return '/c/%s/%s' % (lang, normalized.replace(' ', '_'))

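# A minimal sketch of the URI shapes the two make_concept_uri variants above
# produce, using pre-normalized stand-in values instead of the real
# english.normalize / preprocess_text / handle_disambig dependencies
# (the term and disambiguation below are hypothetical examples):
lang, normalized, disambiguation = 'en', 'black sheep', 'n/domesticated animal'
with_sense = '/c/%s/%s/%s' % (lang, normalized.replace(' ', '_'),
                              disambiguation.strip().replace(' ', '_').lower())
without_sense = '/c/%s/%s' % (lang, normalized.replace(' ', '_'))
print(with_sense)     # /c/en/black_sheep/n/domesticated_animal
print(without_sense)  # /c/en/black_sheep
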
def _load_new_stream(cls, stream):
    worddict = defaultdict(int)
    for line in stream:
        word, freq = line.split(u',')
        word = preprocess_text(word).lower()
        worddict[word] += float(freq)
    return cls(dict(worddict))

def analyze(self, text):
    """
    Run text through the external process, and get a list of lists
    ("records") that contain the analysis of each word.
    """
    try:
        text = UNSAFE_RE.sub('', preprocess_text(text)).strip()
        if not text:
            return []
        chunks = text.split('\n')
        results = []
        for chunk_text in chunks:
            if chunk_text.strip():
                # Encode into a separate variable so that `text` keeps its
                # original value for the ProcessError retry below.
                chunk_bytes = chunk_text.encode('utf-8')
                self.send_input(chunk_bytes + '\n')
                #self.input_log.write(chunk_bytes+'\n')
                out_line = ''
                while True:
                    out_line = self.receive_output_line()
                    #self.output_log.write(out_line)
                    out_line = out_line.decode('utf-8')

                    if out_line == u'\n':
                        break

                    record = out_line.strip(u'\n').split(u' ')
                    results.append(record)
        return results
    except ProcessError:
        self.restart_process()
        return self.analyze(text)

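# A standalone sketch of how the analyze() method above turns one output line
# from the external analyzer into a record: each non-blank line is split on
# spaces. The sample line assumes a FreeLing-style "word lemma tag prob"
# format and is made up for illustration.
out_line = u'dogs dog NNS 0.97\n'
record = out_line.strip(u'\n').split(u' ')
print(record)   # [u'dogs', u'dog', u'NNS', u'0.97']
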
def get_frequency(word, lang, default_freq=0, scale=1e9):
    """
    Looks up a word's frequency in our preferred frequency list for the given
    language.

    >>> int(get_frequency('the', 'en', scale=42))
    42
    >>> int(get_frequency('normalization', 'en'))
    19566
    >>> int(get_frequency('Normalization', 'en'))
    19566
    >>> get_frequency('weirdification', 'en', 100.0)
    100.0
    """
    try:
        freqs = get_wordlist(lang)
    except ZeroDivisionError:
        return default_freq
    factor = scale / freqs.max_freq()
    if " " in word:
        raise ValueError("get_frequency can only look up single words, "
                         "but %r contains a space" % word)
    lookup = preprocess_text(word).lower()
    return factor * freqs[lookup] + default_freq

def word_frequency(word, default_freq=0):
    """
    Looks up the word's frequency in a modified version of the Google Books
    1-grams list.

    The characters may be in any case (they'll be case-smashed to uppercase)
    and may include non-ASCII letters in UTF-8 or Unicode.

    Words appear in the list if they meet these criteria, which improve the
    compactness and accuracy of the list:

    - They consist entirely of letters, digits and/or ampersands
    - They contain at least one ASCII letter
    - They appear at least 1000 times in Google Books OR
      (they appear at least 40 times in Google Books and also appear in
      Wiktionary or WordNet)

    Apostrophes are assumed to be at the edge of the word, in which case
    they'll be stripped like they were in the Google data, or in the special
    token "n't" which is treated as "not". This matches the output of the
    tokenize() function.

    >>> word_frequency('normalization')
    223058.0

    >>> word_frequency('budap', default_freq=100.)
    100.0
    """
    freqs = Wordlist.load('google-unigrams.txt')
    if " " in word:
        raise ValueError("word_frequency can only look up single words, "
                         "but %r contains a space" % word)
    word = preprocess_text(word.strip("'")).upper()
    if word == "N'T":
        word = 'NOT'
    return freqs.get(word, default_freq)

def analyze(self, text):
    """
    Runs a line of text through MeCab, and returns the results as a list
    of lists ("records") that contain the MeCab analysis of each word.
    """
    text = preprocess_text(text).lower()
    n_chunks = (len(text)+1024)//1024
    results = []
    for chunk in xrange(n_chunks):
        chunk_text = text[chunk*1024:(chunk+1)*1024].encode(self.mecab_encoding)
        self.mecab.stdin.write(chunk_text+'\n')
        #self.input_log.write(text+'\n')
        out_line = ''
        while True:
            out_line = self.mecab.stdout.readline()
            #self.output_log.write(out_line)
            out_line = out_line.decode(self.mecab_encoding)

            if out_line == u'EOS\n':
                break

            word, info = out_line.strip(u'\n').split(u'\t')
            record = [word] + info.split(u',')

            # special case for detecting nai -> n
            if record[0] == u'ん' and record[5] == u'不変化型':
                record[7] = record[1] = u'ない'

            results.append(record)
    return results

def get_frequency(word, lang, default_freq=0, scale=1e9):
    """
    Looks up a word's frequency in our preferred frequency list for the given
    language.

    >>> int(get_frequency('the', 'en', scale=42))
    42
    >>> int(get_frequency('normalization', 'en'))
    19566
    >>> int(get_frequency('Normalization', 'en'))
    19566
    >>> get_frequency('weirdification', 'en', 100.0)
    100.0
    """
    try:
        freqs = get_wordlist(lang)
    except ZeroDivisionError:
        return default_freq
    factor = scale / freqs.max_freq
    if " " in word:
        raise ValueError("get_frequency can only look up single words, "
                         "but %r contains a space" % word)
    lookup = preprocess_text(word).lower()
    return factor * freqs[lookup] + default_freq

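# How the scale/default_freq arithmetic in the get_frequency variants above
# behaves, as a standalone sketch (the wordlist values here are made up): the
# most frequent word in the list maps to `scale`, every other word scales
# proportionally, and default_freq is added on top.
scale = 1e9
max_freq = 23135851162.0   # hypothetical count of the most frequent word
word_freq = 452.0          # hypothetical count of the word being looked up
factor = scale / max_freq
print(factor * word_freq + 0)   # scaled frequency with default_freq=0
print(factor * 0 + 100.0)       # unseen word falls back to default_freq
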
def _load_new_stream(cls, stream):
    worddict = defaultdict(int)
    mode = None
    # We need to distinguish between two modes, to handle old and new
    # files:
    # 1. comma-separated linear frequency values
    # 2. tab-separated logarithmic values in dB
    for line in stream:
        if mode is None:
            if '\t' in line:
                mode = 2
            elif ',' in line:
                mode = 1
            else:
                raise ValueError(
                    "I don't recognize the format of this wordlist file.")
        if mode == 1:
            word, freq = line.rstrip().split(',')
            freq = float(freq)
        elif mode == 2:
            word, freq = line.rstrip().split('\t')
            freq = 10**(float(freq) / 10)
        word = preprocess_text(word).lower()
        worddict[word] += freq
    return cls(dict(worddict))

def _load_stream(cls, stream):
    worddict = {}
    mode = None
    # We need to distinguish between two modes, to handle old and new
    # files:
    # 1. comma-separated linear frequency values
    # 2. tab-separated logarithmic values in dB
    for line in stream:
        if mode is None:
            if '\t' in line:
                mode = 2
            elif ',' in line:
                mode = 1
            else:
                raise ValueError(
                    "I don't recognize the format of this wordlist file."
                )
        if mode == 1:
            word, freq = line.rstrip().split(',')
            freq = float(freq)
        elif mode == 2:
            word, freq = line.rstrip().split('\t')
            freq = 10**(float(freq)/10)
        word = preprocess_text(word).lower()
        worddict[word] = freq
    return cls(worddict)

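# A self-contained sketch of the two wordlist formats that the loaders above
# distinguish: mode 1 is comma-separated linear counts, mode 2 is
# tab-separated logarithmic values in dB, converted back to a linear
# frequency with 10**(dB/10). Both sample lines are made up.
linear_line = u'the,23135851162'
db_line = u'the\t103.6'

word, freq = linear_line.rstrip().split(',')
print(float(freq))               # mode 1: the value is used as-is

word, freq = db_line.rstrip().split('\t')
print(10 ** (float(freq) / 10))  # mode 2: dB value converted to linear
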
def word_frequency(self, word, default_freq=0):
    """
    Looks up the word's frequency in the Leeds Internet corpus for the
    appropriate language.

    FIXME: this returns 0 for words that stem differently in FreeLing when
    we use FreeLing frequencies, and that's most of the words
    """
    freqs = Wordlist.load('leeds-internet-%s.txt' % self.lang)
    word = self.snowball_stem(word)
    if " " in word:
        raise ValueError("word_frequency can only look up single words, "
                         "but %r contains a space" % word)
    word = preprocess_text(word.strip("'")).lower()
    return freqs.get(word, default_freq)

def word_frequency(self, word, default_freq=0):
    """
    Looks up the word's frequency in the Leeds Internet corpus for the
    appropriate language.

    FIXME: this returns 0 for words that stem differently in FreeLing when
    we use FreeLing frequencies, and that's most of the words
    """
    freqs = Wordlist.load('leeds-internet-%s.txt' % self.lang)
    word = self.snowball_stem(word)
    if " " in word:
        raise ValueError(
            "word_frequency can only look up single words, but %r contains a space" % word)
    word = preprocess_text(word.strip("'")).lower()
    return freqs.get(word, default_freq)

def tag_and_stem(text):
    """
    Returns a list of (stem, tag, token) triples:

    - stem: the word's uninflected form
    - tag: the word's part of speech
    - token: the original word, so we can reconstruct it later
    """
    tokens = tokenize_list(preprocess_text(text))
    tagged = nltk.pos_tag(tokens)
    out = []
    for token, tag in tagged:
        if token in BRACKET_DIC:
            out.append((token, BRACKET_DIC[token], token))
        else:
            stem = morphy_stem(token, tag)
            out.append((stem, tag, token))
    return out

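# A rough sketch of the pieces tag_and_stem relies on, using nltk directly
# (assumes nltk is installed along with the 'averaged_perceptron_tagger' and
# 'wordnet' data packages; tokenize_list, morphy_stem and BRACKET_DIC are
# module-level helpers not shown here, so this only approximates the output).
import nltk
from nltk.corpus import wordnet

tokens = ['big', 'dogs']
for token, tag in nltk.pos_tag(tokens):
    # wordnet.morphy is one way to get an uninflected form; it returns None
    # for words it doesn't know, in which case we keep the token itself.
    stem = wordnet.morphy(token) or token
    print((stem, tag, token))
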
def analyze(self, text):
    """
    Runs a line of text through MeCab, and returns the results as a list
    of lists ("records") that contain the MeCab analysis of each word.
    """
    try:
        self.process  # make sure things are loaded
        text = preprocess_text(text).replace('\n', ' ').lower()
        n_chunks = (len(text) + 1024) // 1024
        results = []
        for chunk in xrange(n_chunks):
            chunk_text = text[chunk * 1024:(chunk + 1) * 1024].encode(
                self.mecab_encoding)
            self.send_input(chunk_text + '\n')
            #self.input_log.write(text+'\n')
            out_line = ''
            while True:
                out_line = self.receive_output_line()
                #self.output_log.write(out_line)
                out_line = out_line.decode(self.mecab_encoding)

                if out_line == u'EOS\n':
                    break

                word, info = out_line.strip(u'\n').split(u'\t')
                record_parts = [word] + info.split(u',')

                # Pad the record out to have 10 parts if it doesn't
                record_parts += [None] * (10 - len(record_parts))
                record = MeCabRecord(*record_parts)

                # special case for detecting nai -> n
                if record.surface == u'ん' and record.conjugation == u'不変化型':
                    # rebuild the record so that record.root is 'nai'
                    record_parts[MeCabRecord._fields.index('root')] = u'ない'
                    record = MeCabRecord(*record_parts)

                results.append(record)
        return results
    except ProcessError:
        self.restart_process()
        return self.analyze(text)

def get_frequency(word, lang, default_freq=0):
    """
    Looks up a word's frequency in our preferred frequency list for the given
    language.
    """
    word = preprocess_text(word)
    if lang == 'en':
        filename = 'google-unigrams.txt'
        word = word.upper()
    else:
        filename = 'leeds-internet-%s.txt' % lang
        word = word.lower()
    freqs = Wordlist.load(filename)
    if " " in word:
        raise ValueError("get_frequency can only look up single words, "
                         "but %r contains a space" % word)
    # roman characters are in lowercase
    return freqs.get(word, default_freq)

def normalize_list(text):
    """
    Get a list of word stems that appear in the text. Stopwords and an initial
    'to' will be stripped, unless this leaves nothing in the stem.

    >>> normalize_list('the dog')
    [u'dog']
    >>> normalize_list('big dogs')
    [u'big', u'dog']
    >>> normalize_list('the')
    [u'the']
    """
    text = preprocess_text(text)
    pieces = [morphy_stem(word) for word in tokenize_list(text)]
    pieces = [piece for piece in pieces if good_lemma(piece)]
    if not pieces:
        return [text]
    if pieces[0] == 'to':
        pieces = pieces[1:]
    return pieces

def analyze(self, text):
    """
    Runs a line of text through MeCab, and returns the results as a list
    of lists ("records") that contain the MeCab analysis of each word.
    """
    try:
        self.process  # make sure things are loaded
        text = preprocess_text(text).replace('\n', '').lower()
        n_chunks = (len(text) + 1024) // 1024
        results = []
        for chunk in xrange(n_chunks):
            chunk_text = text[chunk * 1024:(chunk + 1) * 1024].encode(
                self.mecab_encoding)
            self.send_input(chunk_text + '\n')
            #self.input_log.write(text+'\n')
            out_line = ''
            while True:
                out_line = self.receive_output_line()
                #self.output_log.write(out_line)
                out_line = out_line.decode(self.mecab_encoding)

                if out_line == u'EOS\n':
                    break

                word, info = out_line.strip(u'\n').split(u'\t')
                record = [word] + info.split(u',')

                # special case for detecting nai -> n
                if record[0] == u'ん' and record[5] == u'不変化型':
                    record[7] = u'ない'

                results.append(record)
        return results
    except ProcessError:
        self.restart_process()
        return self.analyze(text)

def analyze(self, text):
    """
    Runs a line of text through MeCab, and returns the results as a list
    of lists ("records") that contain the MeCab analysis of each word.
    """
    try:
        self.process  # make sure things are loaded
        text = preprocess_text(text).lower()
        n_chunks = (len(text) + 1024) // 1024
        results = []
        for chunk in xrange(n_chunks):
            chunk_text = text[chunk * 1024 : (chunk + 1) * 1024].encode(self.mecab_encoding)
            self.send_input(chunk_text + "\n")
            # self.input_log.write(text+'\n')
            out_line = ""
            while True:
                out_line = self.receive_output_line()
                # self.output_log.write(out_line)
                out_line = out_line.decode(self.mecab_encoding)

                if out_line == u"EOS\n":
                    break

                word, info = out_line.strip(u"\n").split(u"\t")
                record = [word] + info.split(u",")

                # special case for detecting nai -> n
                if record[0] == u"ん" and record[5] == u"不変化型":
                    record[7] = record[1] = u"ない"

                results.append(record)
        return results
    except ProcessError:
        self.restart_process()
        return self.analyze(text)

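# A standalone sketch of how the MeCab analyze() variants above turn one
# output line into a record: the surface form is separated from the feature
# string by a tab, and the features are comma-separated. The sample line is a
# hypothetical ipadic-style analysis, already decoded to unicode; in it,
# record[5] is the conjugation type and record[7] the base form that the
# nai -> n special case rewrites.
out_line = u'ん\t助動詞,*,*,*,不変化型,基本形,ん,ン,ン\n'
word, info = out_line.strip(u'\n').split(u'\t')
record = [word] + info.split(u',')
if record[0] == u'ん' and record[5] == u'不変化型':
    record[7] = record[1] = u'ない'
print(record)
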
def word_frequency(word, default_freq=0):
    """
    Looks up the word's frequency in a modified version of the Google Books
    1-grams list.

    The characters may be in any case (they'll be case-smashed to uppercase)
    and may include non-ASCII letters in UTF-8 or Unicode.

    Words appear in the list if they meet these criteria, which improve the
    compactness and accuracy of the list:

    - They consist entirely of letters, digits and/or ampersands
    - They contain at least one ASCII letter
    - They appear at least 1000 times in Google Books OR
      (they appear at least 40 times in Google Books and also appear in
      Wiktionary or WordNet)

    Apostrophes are assumed to be at the edge of the word, in which case
    they'll be stripped like they were in the Google data, or in the special
    token "n't" which is treated as "not". This matches the output of the
    tokenize() function.

    >>> word_frequency('normalization')
    223058.0

    >>> word_frequency('budap', default_freq=100.)
    100.0
    """
    freqs = Wordlist.load('google-unigrams.txt')
    if " " in word:
        raise ValueError("word_frequency can only look up single words, "
                         "but %r contains a space" % word)
    word = preprocess_text(word.strip("'")).lower()
    if word == "n't":
        word = 'not'
    return freqs.get(word, default_freq)

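# A minimal sketch of the lookup-key preparation in the word_frequency
# variants above (without the real preprocess_text or the Google wordlist):
# edge apostrophes are stripped, the case is smashed, and the tokenizer's
# special "n't" token is mapped to "not". The sample tokens are made up.
for token in [u"'hello'", u"n't", u"CAFE"]:
    word = token.strip("'").lower()
    if word == u"n't":
        word = u'not'
    print(word)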