def tokenize(self, text):
    r"""
    Tokenizing a sentence inserts spaces in such a way that it separates
    punctuation from words, splits up contractions, and generally does what
    a lot of natural language tools (especially parsers) expect their input
    to do.

        >>> en_nl.tokenize("Time is an illusion. Lunchtime, doubly so.")
        u'Time is an illusion . Lunchtime , doubly so .'
        >>> untok = '''
        ... "Very deep," said Arthur, "you should send that in to the
        ... Reader's Digest. They've got a page for people like you."
        ... '''
        >>> tok = en_nl.tokenize(untok)
        >>> tok
        u"`` Very deep , '' said Arthur , `` you should send that in to the Reader 's Digest . They 've got a page for people like you . ''"
        >>> en_nl.untokenize(tok)
        u'"Very deep," said Arthur, "you should send that in to the Reader\'s Digest. They\'ve got a page for people like you."'
        >>> en_nl.untokenize(tok) == untok.replace('\n', ' ').strip()
        True
    """
    step0 = preprocess_text(text).replace('\r', '').replace('\n', ' ')
    cur = step0.replace(" '", " ` ").replace("'", " '").replace(
        "n 't", " n't").replace("cannot", "can not")
    for regex, replacement in compiled_tokenizer_regexes:
        cur = regex.sub(replacement, cur)
    return cur.strip()
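# NOTE: `compiled_tokenizer_regexes` is defined elsewhere in this module and
# its exact contents are not shown here. The list below is only a hypothetical
# sketch of the kind of (compiled regex, replacement) pairs it holds, to show
# how the loop in tokenize() can produce the Treebank-style spacing seen in
# the doctest above.
import re

_example_tokenizer_regexes = [
    (re.compile(r'"([^"]*)"'), r"`` \1 ''"),   # straight quotes -> `` ... ''
    (re.compile(r'([.,!?;:])'), r' \1 '),      # pad punctuation with spaces
    (re.compile(r'\s+'), ' '),                 # collapse runs of whitespace
]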
def canonicalize(self, word):
    """
    Reduce equivalent characters to a canonical form.

    In a EuroNL, by default, this puts those characters in lowercase.
    """
    return preprocess_text(word).lower()
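# Hypothetical usage, assuming `en_nl` is an English instance of this class
# as in the doctests above. canonicalize() only lowercases the preprocessed
# text, so it composes naturally with tokenize(); the expected outputs follow
# from the replace chain in tokenize() and the behavior shown in its doctest.
def _canonicalize_example(en_nl):
    assert en_nl.canonicalize(u'Albert Einstein') == u'albert einstein'
    assert en_nl.tokenize(en_nl.canonicalize(u"Don't panic.")) == u"do n't panic ."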
def analyze(self, text):
    """
    Runs a line of text through MeCab, and returns the results as a list
    of lists ("records") that contain the MeCab analysis of each word.
    """
    text = preprocess_text(text).lower()
    n_chunks = (len(text) + 1024) // 1024
    results = []
    for chunk in xrange(n_chunks):
        chunk_text = text[chunk * 1024:(chunk + 1) * 1024].encode(
            self.mecab_encoding)
        self.mecab.stdin.write(chunk_text + '\n')
        #self.input_log.write(text+'\n')
        out_line = ''
        while True:
            out_line = self.mecab.stdout.readline()
            #self.output_log.write(out_line)
            out_line = out_line.decode(self.mecab_encoding)
            if out_line == u'EOS\n':
                break
            word, info = out_line.strip(u'\n').split(u'\t')
            record = [word] + info.split(u',')
            # special case for detecting nai -> n
            if record[0] == u'ん' and record[5] == u'不変化型':
                record[7] = record[1] = u'ない'
            results.append(record)
    return results
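# A sketch of how the records returned by analyze() might be consumed,
# assuming MeCab's default IPADIC output layout, in which record[1] is the
# part of speech and record[7] is the dictionary (base) form. That layout is
# an assumption about the MeCab configuration, not something guaranteed here.
def _base_forms_example(ja_nl, text):
    """Return (surface, base form) pairs for each analyzed word."""
    pairs = []
    for record in ja_nl.analyze(text):
        surface = record[0]
        base_form = record[7] if len(record) > 7 else surface
        pairs.append((surface, base_form))
    return pairs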
import codecs
import csv
import re
import unicodedata

import numpy as np
import simplenlp


def read_csv_columns(csv_file, columns, header=True):
    """
    Read the given column indices from a CSV file and return a NumPy array
    of the values after running them through simplenlp.preprocess_text.
    """
    reader = csv.reader(codecs.open(csv_file, 'r', 'latin-1'), delimiter=',')
    if header:
        next(reader)    # skip the header row
    data = []
    for row in reader:
        fields = []
        for c in columns:
            # strip surrounding double quotes left over from the CSV export
            txt = re.sub(r'^"|"$', '', row[c])
            try:
                # some fields contain backslash escapes such as \u00e9
                txt = txt.decode('unicode-escape')
            except UnicodeError:
                pass
            # fold accented characters to their closest ASCII equivalents
            txt = unicodedata.normalize('NFKD', txt).encode('ascii', 'ignore')
            fields.append(simplenlp.preprocess_text(txt))
        data.append(fields)
    return np.array(data)
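# Hypothetical usage -- the file name and column indices below are made up
# for illustration. read_csv_columns() returns a NumPy array of preprocessed
# text fields, one row per data row in the CSV file.
def _read_csv_example():
    data = read_csv_columns('example.csv', columns=[1, 2], header=True)
    return data[:, 0]   # every value from the first requested column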
def test_preprocess():
    assert simplenlp.preprocess_text(
        'This is a\000 test.\r\n') == 'This is a test. '