class KeywordExtractor(object):

    def __init__(self):
        self._kwex = DefaultKeywordExtractor()
        self._lem = DefaultLemmatizer()
        self._recaser = DefaultRecaser()
        self._tokenizer = DefaultTokenizer()

    def _normalized_tokens(self, sent):
        # Tokenize, lowercase, lemmatize, then recase each token before
        # keyword extraction; both extract methods share this pipeline.
        return map(self._recaser.recase,
                   map(self._lem.lemmatize,
                       map(str.lower, self._tokenizer.tokenize(sent))))

    def extract(self, sent):
        return self._kwex.extract(self._normalized_tokens(sent))

    def extract_weighted(self, sent):
        return self._kwex.extract_weighted(self._normalized_tokens(sent))

    @staticmethod
    def serve(params):
        # Lazily build a module-level singleton so repeated requests reuse it.
        global keyword_extractor
        if "keyword_extractor" not in globals():
            keyword_extractor = KeywordExtractor()
        return {"output": keyword_extractor.extract(params['input'])}
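# Usage sketch (illustrative, not from the repo; assumes the nlpy Default*
# components are importable and behave as their names suggest).
if __name__ == '__main__':
    kwex = KeywordExtractor()
    print kwex.extract("The quick brown fox jumps over the lazy dog")
    print KeywordExtractor.serve({"input": "The quick brown fox jumps over the lazy dog"})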
def _test_terminals(self):
    testcase = "i cook rice."
    tk = DefaultTokenizer()
    p = StanfordCFGParser()
    tree = p.parse(tk.tokenize(testcase))
    print p.extract_terminals(tree)
def _test_parse(self):
    testcase = "One difference from C: I wrote a little wrapper around malloc/free, cymem."
    tk = DefaultTokenizer()
    p = StanfordCFGParser()
    tree = p.parse(tk.tokenize(testcase))
    print tree
def test_batch_parse(self):
    tk = DefaultTokenizer()
    p = BatchStanfordCFGParser()
    testcases = ["it turns out good", "it will work (so it is)"]
    tokenized_cases = [tk.tokenize(case) for case in testcases]
    p.cache(tokenized_cases)
    # Round-trip the parse cache through disk before querying it.
    p.save("/tmp/jjsjsj.gz")
    p.load("/tmp/jjsjsj.gz")
    print p.parse(tk.tokenize("it will work (so it is)"))
def __init__(self, tokenizer=None, vec=None):
    self._vec = vec if vec else Word2VecRepresentation()
    self._tokenizer = tokenizer if tokenizer else DefaultTokenizer()
    self._kwex = DefaultKeywordExtractor()
    self._lem = DefaultLemmatizer()
    self._recaser = DefaultRecaser()
    self._data = []
import re

import numpy as np


class FeaturePreProcessor(object):

    def __init__(self, num_features=True):
        self._tokenizer = DefaultTokenizer()
        self.num_features = num_features

    def _get_num_feature(self, s1, s2):
        # Collect the numeric tokens of each sentence.
        nums_1, nums_2 = set(), set()
        for t1 in self._tokenizer.tokenize(s1):
            if re.match(r"^[0-9.]+$", t1):
                nums_1.add(t1)
        for t2 in self._tokenizer.tokenize(s2):
            if re.match(r"^[0-9.]+$", t2):
                nums_2.add(t2)  # was nums_2.add(t1), which recorded the wrong token
        feat = [0, 0, 0]
        # feat[0]: both sentences contain exactly the same numbers (or none).
        if nums_1 == nums_2 or (not nums_1 and not nums_2):
            feat[0] = 1
        # feat[1]: at least one number is shared.
        if nums_1 & nums_2:
            feat[1] = 1
        # feat[2]: one number set strictly contains the other.
        if nums_1 != nums_2 and (nums_1.issubset(nums_2) or nums_2.issubset(nums_1)):
            feat[2] = 1
        return feat

    def preprocess(self, data):
        sent1, sent2, label, input = data
        # Normalize to zero mean and unit variance.
        # input = (input + (input > 10) * (10 - input)) / 10 - 0.5
        input = (input - np.mean(input)) / np.std(input)
        input = input.flatten()
        if self.num_features:
            input = np.concatenate([input, np.array(self._get_num_feature(sent1, sent2))])
        return [input], [label]

    def preprocess_nolabel(self, sent1, sent2, input):
        # Normalize the same way as preprocess(); the two methods previously
        # used different schemes, which would skew features at prediction time.
        input = (input - np.mean(input)) / np.std(input)
        input = input.flatten()
        if self.num_features:
            input = np.concatenate([input, np.array(self._get_num_feature(sent1, sent2))])
        return [input]
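# Worked example of the numeric feature (hypothetical sentences; assumes the
# tokenizer yields whitespace-separated tokens). The three binary features are
# [identical number sets, any shared number, one set strictly contains the other].
if __name__ == '__main__':
    fp = FeaturePreProcessor()
    print fp._get_num_feature("he paid 25 dollars", "she paid 25 dollars")       # [1, 1, 0]
    print fp._get_num_feature("he paid 25 dollars", "she paid 30 dollars")       # [0, 0, 0]
    print fp._get_num_feature("he paid 25 dollars", "25 or 30 dollars , maybe")  # [0, 1, 1]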
right = "[%s]" % right new_output = register_used.index(False) if new_output != output: rewrite_rules[output] = new_output output = new_output register_used[output] = True yield left, right, output def __iter__(self): return self._recude_number(self._optimize(self._sequence())) if __name__ == '__main__': tokenizer = DefaultTokenizer() sent_list = [x.strip() for x in sys.stdin.xreadlines()] tok_list = [tokenizer.tokenize(x) for x in sent_list] if len(sys.argv) == 2: parser = BatchStanfordCFGParser() parser.load_output(tok_list, sys.argv[1]) else: parser = StanfordCFGParser() testcase = "A discouraging outlook from General Electric Co. sent its share down 81 cents (U.S.) or 2.7 per cent to $29.32." #sys.stderr = StringIO.StringIO() reload(sys) sys.setdefaultencoding("utf-8")
def serve(param):
    from nlpy.basic import DefaultTokenizer
    output = FrequencyKeywordExtractor().extract(
        DefaultTokenizer().tokenize(param["input"]))
    return {"output": output}
        else:
            right = "[%s]" % right
        # Allocate the lowest free register, rewriting the target if needed.
        new_output = register_used.index(False)
        if new_output != output:
            rewrite_rules[output] = new_output
            output = new_output
        register_used[output] = True
        yield left, right, output

    def __iter__(self):
        return self._recude_number(self._optimize(self._sequence()))


if __name__ == '__main__':
    tokenizer = DefaultTokenizer()
    sent_list = [x.strip() for x in sys.stdin.xreadlines()]
    tok_list = [tokenizer.tokenize(x) for x in sent_list]
    if len(sys.argv) == 2:
        parser = BatchStanfordCFGParser()
        parser.load_output(tok_list, sys.argv[1])
    else:
        parser = StanfordCFGParser()
    testcase = "A discouraging outlook from General Electric Co. sent its share down 81 cents (U.S.) or 2.7 per cent to $29.32."
    # sys.stderr = StringIO.StringIO()
    reload(sys)
    sys.setdefaultencoding("utf-8")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015 NLPY.ORG
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

import sys

from nlpy.basic import DefaultTokenizer

if __name__ == '__main__':
    tok = DefaultTokenizer()
    for l in sys.stdin.xreadlines():
        l = l.strip()
        print " ".join(tok.tokenize(l))
import numpy as np


class ParaphraseEncoder(object):

    def __init__(self, network, vec=None, parser=None, pooling_size=15, regs_allowed=5):
        self._vec = vec
        self._network = network
        self._parser = StanfordCFGParser() if not parser else parser
        self._tokenizer = DefaultTokenizer()
        self._recaser = DefaultRecaser()
        self.pooling_size = pooling_size
        self.regs_allowed = regs_allowed

    def encode(self, text, tokenized=False):
        toks = text if tokenized else self._tokenizer.tokenize(text)
        if len(toks) <= 1:
            # Too short to parse; fall back to plain word vectors.
            return [self._get_word_vec(t) for t in toks]
        tree = self._parser.parse(toks)
        seq = list(CFGSequencer(tree))
        # Give up on sentences whose trees need more registers than allowed.
        if max([x[2] for x in seq]) >= self.regs_allowed:
            return None
        token_data, seq_data = self._build_data(seq)
        return self._network.convert(token_data, seq_data)

    def _build_data(self, seq):
        tokens = []
        sequence = []
        max_reg = 0
        for left, right, target in seq:
            # Non-integer operands are bracketed tokens; replace them with
            # negative indices into the token list.
            if type(left) != int:
                tokens.append(left[1:-1])
                left = -len(tokens)
            if type(right) != int:
                tokens.append(right[1:-1])
                right = -len(tokens)
            sequence.append((left, right, target))
            if max(left, right) > max_reg:
                max_reg = max(left, right)
        # Index 0 is a dummy vector so that register indices stay positive.
        token_data = [np.zeros(300, dtype='float32')]
        for tok in tokens:
            token_data.append(self._get_word_vec(tok))
        return token_data, sequence

    def _get_word_vec(self, tok):
        tok = self._recaser.recase(tok)
        if tok not in self._vec._model.vocab:
            return np.zeros(300, dtype='float32')
        tok_id = self._vec._model.vocab[tok].index
        return self._vec._model.syn0norm[tok_id].astype('float32')

    def _distance(self, rep1, rep2):
        # Euclidean distance between two representations.
        return np.sqrt(np.sum((rep1 - rep2) ** 2))

    def _min_block(self, matrix, x_begin, x_end, y_begin, y_end):
        min_value = matrix[x_begin][y_begin]
        for x in range(x_begin, x_end):
            for y in range(y_begin, y_end):
                if matrix[x][y] < min_value:
                    min_value = matrix[x][y]
        return min_value

    def dynamic_pool(self, reps1, reps2):
        # Pairwise distance matrix between the two representation sequences.
        sim_matrix = []
        for rep1 in reps1:
            sim_matrix.append([self._distance(rep1, rep2) for rep2 in reps2])
        pooling_matrix = []
        for _ in range(self.pooling_size):
            pooling_matrix.append([0] * self.pooling_size)
        # Min-pool variable-sized blocks down to a fixed-size grid.
        h_span = float(len(reps1)) / self.pooling_size
        v_span = float(len(reps2)) / self.pooling_size
        for i in range(self.pooling_size):
            for j in range(self.pooling_size):
                pooling_matrix[i][j] = self._min_block(
                    sim_matrix,
                    int(i * h_span), int((i + 1) * h_span),
                    int(j * v_span), int((j + 1) * v_span))
        return np.array(pooling_matrix)

    def make_pooling_matrix(self, text1, text2, reps1=None, reps2=None):
        toks1, toks2 = map(self._tokenizer.tokenize, (text1, text2))
        tok_reps1 = np.array(map(self._get_word_vec, toks1))
        tok_reps2 = np.array(map(self._get_word_vec, toks2))
        reps1 = self.encode(toks1, tokenized=True) if reps1 is None else reps1
        # Was encode(toks1) on both sides, so text2 was never actually encoded.
        reps2 = self.encode(toks2, tokenized=True) if reps2 is None else reps2
        if reps1 is None or reps2 is None:
            return None
        return self.dynamic_pool(np.concatenate((tok_reps1, reps1)),
                                 np.concatenate((tok_reps2, reps2)))

    def detect(self, text1, text2):
        return self.make_pooling_matrix(text1, text2)
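# Sketch of dynamic pooling in isolation (hypothetical data, not from the
# repo): dynamic_pool() only touches pooling_size, so the network and word
# vectors can be None, and a dummy object stands in for the Stanford parser
# (assuming the nlpy tokenizer and recaser construct cheaply). Two sequences
# of different lengths are min-pooled into a fixed pooling_size x
# pooling_size grid of pairwise distances.
if __name__ == '__main__':
    enc = ParaphraseEncoder(network=None, parser=object(), pooling_size=4)
    reps1 = [np.random.rand(300).astype('float32') for _ in range(7)]
    reps2 = [np.random.rand(300).astype('float32') for _ in range(5)]
    print enc.dynamic_pool(reps1, reps2).shape  # (4, 4) regardless of input lengths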