Example #1
class KeywordExtractor(object):

    def __init__(self):
        self._kwex = DefaultKeywordExtractor()
        self._lem = DefaultLemmatizer()
        self._recaser = DefaultRecaser()
        self._tokenizer = DefaultTokenizer()

    def extract(self, sent):
        # Tokenize, lowercase, lemmatize and recase each token before keyword extraction.
        keywords = self._kwex.extract(
                map(self._recaser.recase,
                    map(self._lem.lemmatize,
                        map(str.lower,
                            self._tokenizer.tokenize(sent))))
            )
        return keywords

    def extract_weighted(self, sent):
        keywords = self._kwex.extract_weighted(
                map(self._recaser.recase,
                    map(self._lem.lemmatize,
                        map(str.lower,
                            self._tokenizer.tokenize(sent))))
            )
        return keywords

    @staticmethod
    def serve(params):
        # Lazily build a module-level singleton so the sub-models are loaded only once per process.
        global keyword_extractor
        if "keyword_extractor" not in globals():
            keyword_extractor = KeywordExtractor()

        return {"output": keyword_extractor.extract(params['input'])}
Example #2
    def _test_terminals(self):
        testcase = "i cook rice."

        tk = DefaultTokenizer()
        p = StanfordCFGParser()
        tree = p.parse(tk.tokenize(testcase))
        print p.extract_terminals(tree)
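
StanfordCFGParser and its extract_terminals are not shown here, but the idea can be sketched with nltk's Tree, a common container for CFG parses: the terminals are simply the leaves of the parse tree. The sketch below only assumes nltk, not nlpy's parser API.

from nltk.tree import Tree

# A hand-written parse of "i cook rice ."; a CFG parser would produce this automatically.
tree = Tree.fromstring("(S (NP (PRP i)) (VP (VBP cook) (NP (NN rice))) (. .))")

print(tree.leaves())   # ['i', 'cook', 'rice', '.']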
Example #4
    def __init__(self, network, vec=None, parser=None, pooling_size=15, regs_allowed=5):
        self._vec = vec
        self._network = network
        self._parser = StanfordCFGParser() if not parser else parser
        self._tokenizer = DefaultTokenizer()
        self._recaser = DefaultRecaser()
        self.pooling_size = pooling_size
        self.regs_allowed = regs_allowed
Example #5
    def _test_parse(self):
        testcase = "One difference from C: I wrote a little wrapper around malloc/free, cymem."

        tk = DefaultTokenizer()

        p = StanfordCFGParser()

        tree = p.parse(tk.tokenize(testcase))
        print tree
Example #7
    def test_bath_parse(self):
        tk = DefaultTokenizer()
        p = BatchStanfordCFGParser()
        testcases = ["it turns out good", "it will work (so it is)"]
        tokenized_cases = []
        for case in testcases:
            tokenized_cases.append(tk.tokenize(case))

        p.cache(tokenized_cases)
        p.save("/tmp/jjsjsj.gz")
        p.load("/tmp/jjsjsj.gz")
        print p.parse(tk.tokenize("it will work (so it is)"))
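
The cache/save/load calls above amortize the cost of parsing over many sentences by precomputing parses for a batch and persisting them. Below is a minimal stand-in for that pattern, keyed by the token sequence and stored as a gzip-compressed pickle; it is only an illustration of the idea, not how BatchStanfordCFGParser is implemented.

import gzip
import pickle

class CachedParser(object):
    """Toy parse cache: maps a token tuple to a precomputed result (a placeholder string here)."""

    def __init__(self):
        self._cache = {}

    def cache(self, tokenized_cases):
        for toks in tokenized_cases:
            # A real implementation would call the external parser here.
            self._cache[tuple(toks)] = "(ROOT %s)" % " ".join(toks)

    def save(self, path):
        with gzip.open(path, "wb") as f:
            pickle.dump(self._cache, f)

    def load(self, path):
        with gzip.open(path, "rb") as f:
            self._cache = pickle.load(f)

    def parse(self, toks):
        return self._cache[tuple(toks)]

p = CachedParser()
p.cache([["it", "turns", "out", "good"]])
p.save("/tmp/parse_cache.gz")
p.load("/tmp/parse_cache.gz")
print(p.parse(["it", "turns", "out", "good"]))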
Example #9
    def __init__(self, tokenizer=None, vec=None):
        self._vec = vec if vec else Word2VecRepresentation()
        self._tokenizer = tokenizer if tokenizer else DefaultTokenizer()
        self._kwex = DefaultKeywordExtractor()
        self._lem = DefaultLemmatizer()
        self._recaser = DefaultRecaser()
        self._data = []
Example #10
class FeaturePreProcessor(object):

    def __init__(self, num_features=True):
        self._tokenizer = DefaultTokenizer()
        self.num_features = num_features

    def _get_num_feature(self, s1, s2):
        # Collect the numeric tokens of each sentence and compare the two sets.
        nums_1, nums_2 = set(), set()
        for t1 in self._tokenizer.tokenize(s1):
            if re.match(r"^[0-9.]+$", t1):
                nums_1.add(t1)
        for t2 in self._tokenizer.tokenize(s2):
            if re.match(r"^[0-9.]+$", t2):
                nums_2.add(t2)
        # feat = [sets identical (or both empty), any shared number, strict containment]
        feat = [0, 0, 0]
        if nums_1 == nums_2 or (not nums_1 and not nums_2):
            feat[0] = 1
        for n1 in nums_1:
            if n1 in nums_2:
                feat[1] = 1
                break
        if nums_1 != nums_2 and (nums_1.issubset(nums_2) or nums_2.issubset(nums_1)):
            feat[2] = 1
        return feat

    def preprocess(self, data):
        sent1, sent2, label, input = data
        # Normalize
        # input = (input + (input > 10) * (10 - input)) / 10 - 0.5
        input = (input - np.mean(input)) / np.sqrt(np.var(input))
        input = input.flatten()
        if self.num_features:
            input = np.concatenate([input.flatten(), np.array(self._get_num_feature(sent1, sent2))])
        return [input], [label]

    def preprocess_nolabel(self, sent1, sent2, input):
        # Normalize
        input = (input + (input > 10) * (10 - input)) / 10 - 0.5
        input = input.flatten()
        if self.num_features:
            input = np.concatenate([input.flatten(), np.array(self._get_num_feature(sent1, sent2))])
        return [input]
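
The three-element numeric feature above encodes: the two sentences mention exactly the same numbers (or none at all), they share at least one number, and one sentence's numbers strictly contain the other's. The same comparison can be sketched standalone, with a toy whitespace tokenizer in place of DefaultTokenizer:

import re

def numeric_feature(s1, s2, tokenize=lambda s: s.split()):
    nums_1 = {t for t in tokenize(s1) if re.match(r"^[0-9.]+$", t)}
    nums_2 = {t for t in tokenize(s2) if re.match(r"^[0-9.]+$", t)}
    feat = [0, 0, 0]
    if nums_1 == nums_2:                 # identical sets, including the both-empty case
        feat[0] = 1
    if nums_1 & nums_2:                  # at least one shared number
        feat[1] = 1
    if nums_1 != nums_2 and (nums_1 <= nums_2 or nums_2 <= nums_1):
        feat[2] = 1                      # strict containment in either direction
    return feat

print(numeric_feature("it fell 2.7 per cent to 29.32", "down 2.7 per cent"))   # [0, 1, 1]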
Example #11
    def __init__(self, num_features=True):
        self._tokenizer = DefaultTokenizer()
        self.num_features = num_features
Example #12
                right = "[%s]" % right
            new_output = register_used.index(False)
            if new_output != output:
                rewrite_rules[output] = new_output
                output = new_output
            register_used[output] = True
            yield left, right, output

    def __iter__(self):
        return self._recude_number(self._optimize(self._sequence()))



if __name__ == '__main__':

    tokenizer = DefaultTokenizer()

    sent_list = [x.strip() for x in sys.stdin.xreadlines()]
    tok_list = [tokenizer.tokenize(x) for x in sent_list]

    if len(sys.argv) == 2:
        parser = BatchStanfordCFGParser()
        parser.load_output(tok_list, sys.argv[1])
    else:
        parser = StanfordCFGParser()


    testcase = "A discouraging outlook from General Electric Co. sent its share down 81 cents (U.S.) or 2.7 per cent to $29.32."
    #sys.stderr = StringIO.StringIO()
    reload(sys)
    sys.setdefaultencoding("utf-8")
Example #13
    def serve(param):
        from nlpy.basic import DefaultTokenizer
        output = FrequencyKeywordExtractor().extract(
            DefaultTokenizer().tokenize(param["input"]))
        return {"output": output}
Example #14
    def __init__(self):
        self._kwex = DefaultKeywordExtractor()
        self._lem = DefaultLemmatizer()
        self._recaser = DefaultRecaser()
        self._tokenizer = DefaultTokenizer()
Example #15
            else:
                right = "[%s]" % right
            new_output = register_used.index(False)
            if new_output != output:
                rewrite_rules[output] = new_output
                output = new_output
            register_used[output] = True
            yield left, right, output

    def __iter__(self):
        return self._recude_number(self._optimize(self._sequence()))


if __name__ == '__main__':

    tokenizer = DefaultTokenizer()

    sent_list = [x.strip() for x in sys.stdin.xreadlines()]
    tok_list = [tokenizer.tokenize(x) for x in sent_list]

    if len(sys.argv) == 2:
        parser = BatchStanfordCFGParser()
        parser.load_output(tok_list, sys.argv[1])
    else:
        parser = StanfordCFGParser()

    testcase = "A discouraging outlook from General Electric Co. sent its share down 81 cents (U.S.) or 2.7 per cent to $29.32."
    #sys.stderr = StringIO.StringIO()
    reload(sys)
    sys.setdefaultencoding("utf-8")
Example #16
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015 NLPY.ORG
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from nlpy.basic import DefaultTokenizer
import sys, os

if __name__ == '__main__':
    tok = DefaultTokenizer()
    for l in sys.stdin.xreadlines():
        l = l.strip()
        print " ".join(tok.tokenize(l))
Example #17
class ParaphraseEncoder(object):

    def __init__(self, network, vec=None, parser=None, pooling_size=15, regs_allowed=5):
        self._vec = vec
        self._network = network
        self._parser = StanfordCFGParser() if not parser else parser
        self._tokenizer = DefaultTokenizer()
        self._recaser = DefaultRecaser()
        self.pooling_size = pooling_size
        self.regs_allowed = regs_allowed

    def encode(self, text, tokenized=False):
        if tokenized:
            toks = text
        else:
            toks = self._tokenizer.tokenize(text)
        if len(toks) <= 1:
            return [self._get_word_vec(t) for t in toks]
        else:
            tree = self._parser.parse(toks)
            seq = list(CFGSequencer(tree))
            if max([x[2] for x in seq]) >= self.regs_allowed:
                return None
            token_data, seq_data = self._build_data(seq)
            return self._network.convert(token_data, seq_data)

    def _build_data(self, seq):
        tokens = []
        sequence = []
        max_reg = 0
        for left, right, target in seq:
            # Non-integer operands are raw tokens: store them and refer to them
            # by negative index into the token list.
            if type(left) != int:
                tokens.append(left[1:-1])
                left = -len(tokens)
            if type(right) != int:
                tokens.append(right[1:-1])
                right = -len(tokens)
            sequence.append((left, right, target))
            if max(left, right) > max_reg:
                max_reg = max(left, right)

        token_data = [np.zeros(300, dtype='float32')]
        for tok in tokens:
            tok_vec = self._get_word_vec(tok)
            token_data.append(tok_vec)
        return token_data, sequence

    def _get_word_vec(self, tok):
        tok = self._recaser.recase(tok)
        if tok not in self._vec._model.vocab:
            tok_vec = np.zeros(300, dtype='float32')
        else:
            tok_id = self._vec._model.vocab[tok].index
            tok_vec = self._vec._model.syn0norm[tok_id].astype('float32')
        return tok_vec

    def _distance(self, rep1, rep2):
        return np.sqrt(np.sum((rep1 - rep2)**2))

    def _min_block(self, matrix, x_begin, x_end, y_begin, y_end):
        min_value = matrix[x_begin][y_begin]
        for x in range(x_begin, x_end):
            for y in range(y_begin, y_end):
                val = matrix[x][y]
                if val < min_value:
                    min_value = val
        return min_value

    def dynamic_pool(self, reps1, reps2):
        # Initialize matrices
        sim_matrix = []
        for _, rep1 in enumerate(reps1):
            sims = []
            for _, rep2 in enumerate(reps2):
                sims.append(self._distance(rep1, rep2))
            sim_matrix.append(sims)
        pooling_matrix = []
        for _ in range(self.pooling_size):
            pooling_matrix.append([0]*self.pooling_size)
        # Pooling
        h_span = float(len(reps1)) / self.pooling_size
        v_span = float(len(reps2)) / self.pooling_size
        for i in range(self.pooling_size):
            for j in range(self.pooling_size):
                min_val = self._min_block(sim_matrix,
                                          int(i*h_span), int((i+1)*h_span),
                                          int(j*v_span), int((j+1)*v_span))
                pooling_matrix[i][j] = min_val
        return np.array(pooling_matrix)

    def make_pooling_matrix(self, text1, text2, reps1=None, reps2=None):
        toks1, toks2 = map(self._tokenizer.tokenize, (text1, text2))
        tok_reps1 = np.array(map(self._get_word_vec, toks1))
        tok_reps2 = np.array(map(self._get_word_vec, toks2))
        reps1 = self.encode(toks1, tokenized=True) if reps1 is None else reps1
        reps2 = self.encode(toks2, tokenized=True) if reps2 is None else reps2
        if reps1 is None or reps2 is None:
            return None
        pooling_matrix = self.dynamic_pool(np.concatenate((tok_reps1, reps1)), np.concatenate((tok_reps2, reps2)))
        return pooling_matrix

    def detect(self, text1, text2):
        return self.make_pooling_matrix(text1, text2)
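
dynamic_pool compresses a variable-size distance matrix into a fixed pooling_size x pooling_size grid by taking the minimum over each block, so sentence pairs of any length yield a fixed-size input for a downstream classifier. Here is a self-contained numpy sketch of the same min-pooling idea, independent of the encoder and network above:

import numpy as np

def dynamic_min_pool(reps1, reps2, pooling_size=4):
    # Pairwise Euclidean distances between the two lists of vectors.
    sim = np.sqrt(((reps1[:, None, :] - reps2[None, :, :]) ** 2).sum(axis=-1))
    pooled = np.zeros((pooling_size, pooling_size))
    h_span = float(len(reps1)) / pooling_size
    v_span = float(len(reps2)) / pooling_size
    for i in range(pooling_size):
        for j in range(pooling_size):
            # Guard against empty blocks when there are fewer vectors than pooling cells.
            block = sim[int(i * h_span):max(int((i + 1) * h_span), int(i * h_span) + 1),
                        int(j * v_span):max(int((j + 1) * v_span), int(j * v_span) + 1)]
            pooled[i, j] = block.min()
    return pooled

reps1 = np.random.rand(7, 300).astype('float32')
reps2 = np.random.rand(9, 300).astype('float32')
print(dynamic_min_pool(reps1, reps2).shape)   # (4, 4)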