Example #1
    def __init__(self, template_file):
        self._rules = []

        for line in iter_file(template_file):
            if not line:
                continue

            template = line.split('#')[0].strip()

            if not template:
                continue

            groups = re.findall(r'^(.+)\((.+)_(.+), (.+)_(.+)\)$', template)

            if groups:
                rule, token1_pos, token1_type, token2_pos, token2_type = groups[0]

                if token1_type == 'm':
                    modifier_order = 1
                    token_order = 2
                else:
                    modifier_order = 2
                    token_order = 1

                rule_format = '%s(%s, %s)' % (rule, token1_pos, token2_pos)
                self._rules.append((rule_format, modifier_order, token_order))
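Every example on this page reads its input through iter_file (and several write results with write_file) from the project's common.utils (imported as utils in some snippets). Those helpers are not shown here; the sketch below captures only the behavior the examples rely on, namely a generator of stripped lines and a line-per-entry writer, and may differ from the project's actual implementation.

import codecs

def iter_file(file_path, encoding='utf-8'):
    # Yield each line of a text file with surrounding whitespace stripped.
    with codecs.open(file_path, encoding=encoding) as f:
        for line in f:
            yield line.strip()

def write_file(file_path, lines, encoding='utf-8'):
    # Write an iterable of strings to a file, one entry per line.
    with codecs.open(file_path, 'w', encoding=encoding) as f:
        for line in lines:
            f.write('%s\n' % line)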
Example #2
 def __iter__(self):
     for line in utils.iter_file(self.__source):
         yield [
             tp[0]
             for tp in re.findall(r'(\S+)%s(\S+)' %
                                  WORD_POS_SEPARATOR, line) if tp[1] != 'PU'
         ]
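A quick illustration of the regex used above, assuming WORD_POS_SEPARATOR is '/' (the actual constant is defined elsewhere in the project): each line holds word/POS pairs, and tokens tagged 'PU' (punctuation) are dropped.

import re

WORD_POS_SEPARATOR = '/'  # assumption for this demo only
line = '手机/NN 很/AD 好/VA 。/PU'
tokens = [tp[0]
          for tp in re.findall(r'(\S+)%s(\S+)' % WORD_POS_SEPARATOR, line)
          if tp[1] != 'PU']
print(tokens)  # ['手机', '很', '好'] -- the PU-tagged punctuation is dropped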
Example #3
    def test_label_eval(self):
        self.assertTrue(True)

        import re
        import os

        from nlp.config import RESOURCE_DIR
        from common.utils import iter_file

        total = 0
        extract = 0
        right = 0
        for line in iter_file(
                os.path.join(RESOURCE_DIR, 'tmp', 'labels.result.txt')):
            if not line:
                continue

            xx = re.findall(r'\[(\d) (\d) (\d)\]', line)
            if xx:
                print(xx)
                nums = xx[0]
                total += int(nums[0])
                extract += int(nums[1])
                right += int(nums[2])

        print('total: {}, extract: {}, right: {}'.format(
            total, extract, right))
        print('precision: {}, recall: {}'.format(1.0 * right / extract,
                                                 1.0 * right / total))
Example #4
def clean_file(source_file, dest_file):
    logger.info('clean pinglun run...')

    with codecs.open(dest_file, 'w', encoding='utf-8') as f:
        for line in iter_file(source_file):
            for sent in clean_txt(line):
                f.write('%s\n' % sent)
Example #5
    def test_labelExtractor_batch(self):
        self.assertTrue(True)

        feature_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                                    '_result', 'features.revised')
        opinion_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                                    '_result', 'opinions.revised')

        label_extractor = LabelExtractor(feature_file,
                                         opinion_file,
                                         sentence_prob_threshold=-10)
        '''
        labels = label_extractor.extract_from_txt(txt)
        for label in labels:
            print(label)
        '''

        counter = Counter()
        results = []
        comment_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                                    'sbracelet.txt')
        for i, line in enumerate(utils.iter_file(comment_file)):

            print(i)

            if i > 100:
                break

            # Syntactic parsing
            txts = clean.clean_txt2(line)
            relations = []
            for txt in txts:
                sentences = label_extractor.preprocess(txt)
                for sentence in sentences:
                    sent = parser.parse2sents(sentence)[0]
                    relation = ' '.join([str(r) for r in sent.relations])
                    relations.append(relation)

            # Extract labels
            labels = label_extractor.extract_from_txt(line)
            for label in labels:
                fo = label.feature + label.opinion
                counter.update([fo])

            # print(line, '->', labels)
            results.append(line)
            results.append('->')
            results += relations
            results.append('->')
            for label in labels:
                results.append(str(label))
            results.append('')

        utils.write_file(
            os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                         'labels.result.txt'), results)

        for fo, c in counter.most_common():
            print(fo, c)
Example #6
    def test_general_polar(self):
        self.assertTrue(True)

        opinion_file = os.path.join(RESOURCE_DIR, 'dp', 'dp.opinions')
        for word in utils.iter_file(opinion_file):

            polar = lexicon.get_polar(word)
            if polar == 'x':
                print(word, polar)
Example #7
    def __init__(self, template_file):
        self._rules = []

        for line in iter_file(template_file):
            if not line:
                continue

            template = line.split('#')[0].strip()
            self._rules.append(template)
Example #8
    def test_x1(self):
        self.assertTrue(True)

        lines = []
        for i, line in enumerate(utils.iter_file(os.path.join(RESOURCE_DIR, 'mobile', 'std.txt'))):
            if i < 50000:
                lines.append(line)

        utils.write_file(os.path.join(RESOURCE_DIR, 'mobile', 'std.5w.txt'), lines)
Example #9
def build_lm_train_data(raw_data_file, hanzi_data_file, pnyin_data_file):
    """
    Build the training corpus for the language model.
    :param raw_data_file:
    :param hanzi_data_file:
    :param pnyin_data_file:
    """
    SYMBOL_ENG = '<eng>'
    SYMBOL_NUM = '<num>'
    SYMBOL_ENG_NUM = '<engnum>'

    with codecs.open(hanzi_data_file, mode='w', encoding='utf-8') as hf,\
            codecs.open(pnyin_data_file, mode='w', encoding='utf-8') as pf:

        j = 0
        for i, line in enumerate(iter_file(raw_data_file)):
            # if i % 10000 == 0:
            #     print(i)

            if j > 24713125:
                break

            for sent in re.split(r'[,。?!?,]', line):

                tokens = tag_pinyin(sent)

                words = []
                pnyins = []
                for tp in tokens:
                    word = tp[0]
                    pnyin = tp[1]

                    # Map purely alphabetic, purely numeric and mixed
                    # alphanumeric tokens to placeholder symbols
                    if re.match(r'^[a-zA-Z]+$', word):
                        word = SYMBOL_ENG
                    elif re.match(r'^[0-9]+$', word):
                        word = SYMBOL_NUM
                    elif re.match(r'^[a-zA-Z0-9]+$', word):
                        word = SYMBOL_ENG_NUM

                    words.append(word)

                    pnyin = pnyin if pnyin else word
                    pnyins.append(pnyin)

                # if words:
                #     hf.write('{}\n'.format(' '.join(words)))

                if pnyins:
                    j += 1
                    if j % 10000 == 0:
                        print(j)

                    pf.write('{}\n'.format(' '.join(pnyins)))
Example #10
    def test_count_syntax(self):
        self.assertTrue(True)

        sentiments = load_sentiment_words(os.path.join(RESOURCE_DIR, 'mobile', '1正面评价词_a+.txt'))
        sentiments |= load_sentiment_words(os.path.join(RESOURCE_DIR, 'mobile', '1负面评价词_a-.txt'))
        features = load_feature_word(os.path.join(RESOURCE_DIR, 'mobile', 'mobile.ontology'))

        corpus_file = os.path.join(RESOURCE_DIR, 'mobile', 'std.txt')

        ff_counter = Counter()
        oo_counter = Counter()
        fo_counter = Counter()

        ff_samples = defaultdict(set)
        oo_samples = defaultdict(set)
        fo_samples = defaultdict(set)

        i = 0
        for line in utils.iter_file(corpus_file):
            i += 1

            if i % 100 == 0:
                print(i)

            if i > 200000:
                break

            for sent in parser.parse2sents(line):
                for relation in sent.relations:
                    token1 = relation.token1.word
                    token2 = relation.token2.word

                    if token1 in features and token2 in features:
                        ff_counter.update([relation.format])
                        ff_samples[relation.format].add(str(relation))

                    if token1 in sentiments and token2 in sentiments:
                        oo_counter.update([relation.format])
                        oo_samples[relation.format].add(str(relation))

                    if token1 in sentiments and token2 in features:
                        fo_counter.update([relation.format])
                        fo_samples[relation.format].add(str(relation))

                    if token1 in features and token2 in sentiments:
                        fo_counter.update([relation.format])
                        fo_samples[relation.format].add(str(relation))

        utils.save_obj(ff_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.counter'))
        utils.save_obj(oo_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'oo.counter'))
        utils.save_obj(fo_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'fo.counter'))

        utils.save_obj(ff_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.dict'))
        utils.save_obj(oo_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'oo.dict'))
        utils.save_obj(fo_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'fo.dict'))
Example #11
    def iter_corpus(pdict, base_dir_):
        for f in os.listdir(base_dir_):
            f = os.path.join(base_dir_, f)

            if os.path.isdir(f):
                iter_corpus(pdict, f)
            else:
                for line in iter_file(f):
                    for w, p in re.findall(pattern, line):
                        p = PosNormalizer.normalize(p)
                        pdict[w][p] += 1
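iter_corpus mutates pdict in place and relies on pdict[w][p] += 1 working, so the caller presumably passes a nested counter. A minimal sketch of that assumption (the original initialization is not shown on this page):

from collections import Counter, defaultdict

pdict = defaultdict(Counter)  # word -> {POS tag -> count}
# iter_corpus(pdict, base_dir) then accumulates how often each word
# appears with each normalized POS tag across the corpus directory tree.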
Example #12
        def iter_corpus(poses_, counts_, base_dir_):
            for f in os.listdir(base_dir_):
                f = os.path.join(base_dir_, f)

                if os.path.isdir(f):
                    iter_corpus(poses_, counts_, f)
                else:
                    for line in iter_file(f):
                        for w, p in re.findall(pattern, line):
                            poses_[w].add(p)
                            counts_[w] += 1
Example #13
    def __init__(self, lexicon_file):
        degrees = defaultdict(set)

        __current_degree = ''
        for word in utils.iter_file(lexicon_file):
            if word.startswith('['):
                word = word.replace('[', '').replace(']', '')
                __current_degree = word

            degrees[__current_degree].add(word)

        self._degrees = degrees
Example #14
    def extract_from_file(self, txt_file):
        """
        Extract labels; the input source is a text file.
        :param txt_file:
        :return: [Label, Label, ...]
        """
        labels = []

        for line in utils.iter_file(txt_file):
            labels += self.extract_from_txt(line)

        return labels
Example #15
    def test_compare_pos(self):
        self.assertTrue(True)

        wd = dict()

        jieba_vocab_file = 'D:\\soft\\anaconda2\\envs\\clabel\\Lib\\site-packages\\jieba\\dict.txt'
        for line in iter_file(jieba_vocab_file):
            word, freq, tag = line.split()
            wd[word] = tag

        jieba_user1_vocab_file = 'D:\\workspace\\pycharm\\clabel\\zresource\\nlp\\lexicon\\jieba\\user1.dict'
        for line in iter_file(jieba_user1_vocab_file):
            word, freq, tag = line.split()
            wd[word] = tag

        ds = degreeLexicon.items
        adjs = fixedSentimentLexicon.items

        print('--------------degree-------------------')
        for d in ds:
            if d in wd and wd[d] != 'd':
                # print(d, wd[d])
                pass

        print('--------------adj-------------------')
        for a in adjs:
            if a not in wd:
                # print('-miss- ', a, 10000, 'a')
                pass

            elif wd[a] != 'a' and wd[a][0] in ['n', 'i']:
                # print(a, 10000, 'a')
                pass

        print('--------------d + adj-------------------')
        for d, a in product(ds, adjs):
            x = d + a
            if x in wd and wd[x] != 'a':
                print(x, wd[x])
                pass
Example #16
    def create_standard_dataset(self):
        """
        Read the raw text, clean it, extract well-formed sentences, and write them to a file.
        :return:
        """
        sentences = []

        for line in iter_file(self._raw_file):
            txt = clean.clean_txt(line)
            sents = clean.extract_standard_sentences(txt)
            sentences += [sent for sent in sents if clean.is_meaningful(sent)]

        write_file(self._clean_file, sentences)
Example #17
    def _build_train_data(self):
        VOCAB_SIZE = len(self._ctable)
        MAX_SENTENCE_LENGTH = self._max_sentence_length

        SAMPLE_NUMBER = 0
        for line in iter_file(self._label_file):
            if line:
                SAMPLE_NUMBER += 1

        logger.info('Vectorization...')
        X = np.zeros((SAMPLE_NUMBER, MAX_SENTENCE_LENGTH, VOCAB_SIZE),
                     dtype=np.int8)
        y = np.zeros((SAMPLE_NUMBER, MAX_SENTENCE_LENGTH, VOCAB_SIZE),
                     dtype=np.int8)

        # Use a separate row index so that skipped empty lines cannot push the
        # index past SAMPLE_NUMBER, which counts only non-empty lines.
        row = 0
        for line in iter_file(self._label_file):
            if not line:
                continue

            sentence, sequence = line.split('\t')

            X[row] = self._ctable.encode(sentence.split(), MAX_SENTENCE_LENGTH)
            y[row] = self._ctable.encode(sequence.split(), MAX_SENTENCE_LENGTH)
            row += 1

        logger.info('Shuffle...')
        indices = np.arange(len(y))
        np.random.shuffle(indices)
        X = X[indices]
        y = y[indices]

        # Explicitly set apart 10% for validation data that we never train over.
        # split_at = len(X) - len(X) // 10
        # (X_train, x_val) = X[:split_at], X[split_at:]
        # (y_train, y_val) = y[:split_at], y[split_at:]
        X_train, y_train = X, y

        return X_train, y_train
Example #18
    def test_normalize_revise_file(self):
        self.assertTrue(True)

        import html

        def tokens2str(tokens):
            return ' '.join(
                ['%s/%s' % (token.word, token.pos) for token in tokens])

        ss = []

        sb_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                               'sbracelet.txt')
        for i, line in enumerate(iter_file(sb_file)):
            print(i)

            # Unescape HTML entities, e.g. &hellip; => ……
            line = html.unescape(line)

            # TODO: HTMLParser.unescape failed to convert &#039; here, although it
            # worked when tested in isolation; reason unknown. Kept as a safeguard.
            line = line.replace('&#039;', '\'')

            for sentence in combParser.ssplit(line):
                tokens = combParser.pos(sentence, revise=False)
                s1 = tokens2str(tokens)
                # ss.append('jba1- ' + s1)

                tokens = combParser.pos(sentence, revise=True)
                s2 = tokens2str(tokens)
                # ss.append('jba2- ' + s2)

                if s1 != s2:
                    ss.append('jba1- ' + s1)
                    ss.append('jba2- ' + s2)

                # tokens = ltpParser.pos(line)
                # ss.append('ltp1- ' + tokens2str(tokens))
                #
                # PosReviser.revise(tokens)
                # ss.append('ltp2- ' + tokens2str(tokens))

            if i > 1000:
                break

        write_file(
            os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                         'sbracelet.pos.1.txt'), ss)
Example #19
def load_feature_word(file_path):
    words = set()

    for line in utils.iter_file(file_path):
        line = line.strip()
        if not line:
            continue

        groups = re.findall(r'^(\S+) \S+ \S+ \S+ \[([^\[\]]*)\].*$', line)[0]
        words.add(groups[0])
        for word in groups[1].split():
            if word:
                words.add(word)

    return words
Example #20
def _init_revise_map(pos_revise_file):
    revise_map = dict()

    for line in iter_file(pos_revise_file):
        word = line.split(' ')[0]

        # Drop POS tags whose count is too low
        pcs = {
            p: int(c)
            for p, c in re.findall(r'([a-zA-Z]+)=(\d+)', line) if int(c) > 1
        }
        total = sum(pcs.values())
        revise_map[word] = {p: 1.0 * c / total for p, c in pcs.items()}

    # Keep only words that are ambiguous between more than one POS tag
    # (filtered once here instead of rebuilding the dict on every iteration)
    revise_map = {w: s for w, s in revise_map.items() if len(s) > 1}

    return revise_map
Example #21
    def create_train_dataset(self):
        """
        Label the training data: split the text into characters, then tag each one.
        <E> marks the last character of a sentence, <M> marks a non-final character.

        特别好,发货很快,赞。 => <M> <M> <E> <M> <M> <M> <E> <E> 。
        :return:
        """
        lines = []

        for line in iter_file(self._clean_file):
            result = Labeler.label(line)
            token = ' '.join([t for t, _ in result])
            sequence = ' '.join([seq for _, seq in result])
            lines.append('%s\t%s' % (token, sequence))

        write_file(self._label_file, lines)
Example #22
    def test_sbd_file(self):
        self.assertTrue(True)

        model = SBDModel.load(
            keras_model_file=os.path.join(APP_RESOURCE_DIR, 'sbd.keras.model'))

        lines = []
        for line in iter_file(
                os.path.join(RESOURCE_DIR, 'tmp', 'comment.mobile.txt')):
            words = re.findall(r'[a-zA-Z0-9\u4e00-\u9fa5]', line)
            sent = ''.join(words)
            # sequence = model.predict_sequence(sent)
            pline = model.predict_txt(sent)
            lines.append('{} -> {}'.format(line, pline))
            print('{} -> {}'.format(line, pline))

        write_file(os.path.join(RESOURCE_DIR, 'tmp', 'sbd.result.txt'), lines)
Example #23
def load_sentiment_words(file_path):
    words = set()

    for line in utils.iter_file(file_path):
        line = line.strip()
        if not line or line.startswith('----'):
            continue

        for word in line.split():
            word = word.strip()
            if not word:
                continue

            word = re.sub(r'\(\d+\)', '', word)
            words.add(word)

    return words
Example #24
    def __init__(self, vocab_file):
        self._pinyin2chars = defaultdict(set)
        self._counter = Counter()
        self._char2pinyin = dict()

        i = 0
        for line in iter_file(vocab_file):
            i += 1
            if i % 10000 == 0:
                print(i)

            if i > 1000000:
                break

            for c, p in tag_pinyin(line):
                self._pinyin2chars[p].add(c)
                self._char2pinyin[c] = p

                self._counter.update([c])
Example #25
    def test_sbd_eval(self):
        self.assertTrue(True)

        t = 0
        for line in iter_file(
                os.path.join(RESOURCE_DIR, 'tmp', 'sbd.result.txt')):
            txt1, txt2 = line.split('->')
            txt1 = txt1.strip()
            txt2 = txt2.strip()

            sents1 = parser.ssplit(txt1)
            sents2 = parser.ssplit(txt2)

            if sents1 == sents2:
                t += 1
            else:
                print(line)

        print('true:', t)
Example #26
def find_by_rule(corpus_file, reg_rules, max_lines):
    counters = []
    for _ in range(len(reg_rules)):
        counters.append(Counter())

    i = 0
    for line in utils.iter_file(corpus_file):
        i += 1

        if i % 100 == 0:
            print(i)

        if i > max_lines:
            break

        txt = ' '.join(parser.segment(line))

        for counter, reg_rule in zip(counters, reg_rules):
            groups = re.findall(reg_rule, txt)
            if groups:
                word = groups[0]

                # # Filter out single characters; their POS is often unstable and hard to pin down
                # if len(word) < 2:
                #     continue

                # Filter out punctuation
                if word in [',', '。', '!', '!', '?', '?']:
                    continue

                # Filter out numeric tokens
                if re.match(r'\d+', word):
                    continue

                # Filter out irrelevant words
                if degreeLexicon.is_degree(
                        word) or irrelevantLexicon.is_irrelevant_word(word):
                    continue

                counter.update([word])

    return counters
Example #27
def run_test():
    lm = BaseLM(os.path.join(LM_MODEL_DIR, 'hanzi.arpa'))

    from common.utils import iter_file
    from common.utils import write_file

    probs = []
    for line in iter_file(os.path.join(RESOURCE_DIR, 'tmp',
                                       'comment.test.txt')):
        for sent in re.split(r'[,。?!?,]', line):
            words = re.findall(r'[a-zA-Z0-9\u4e00-\u9fa5]', sent)
            sent = ''.join(words)
            if sent:
                prob = lm.predict_prob(sent)
                probs.append((sent, prob))

    sort_probs = sorted(probs, key=lambda tp: tp[1])

    write_file(os.path.join(RESOURCE_DIR, 'tmp', 'result.txt'),
               ['{} {}'.format(p, s) for s, p in sort_probs])
Example #28
    def test_correct2(self):
        self.assertTrue(True)

        txts = []

        for i, line in enumerate(iter_file(os.path.join(RESOURCE_DIR, 'tmp', 'comment.mobile.tiny.txt'))):

            if i % 100 == 0:
                print(i)

            txt = std.extract_txt(line)

            sents = []
            for sent in parser.ssplit(line):

                # Extract Chinese characters, English letters and digits
                sent = std.extract_txt(sent)

                if not sent:
                    continue

                # Error correction, applied only to purely Chinese sentences
                if not re.findall(r'[a-zA-Z0-9]', sent):
                    csent = std.wed(sent)
                    if sent != csent:
                        sent_prob = std.prob(sent)
                        csent_prob = std.prob(csent)

                        # Accept the correction only if the corrected text is more probable than the original
                        if csent_prob > sent_prob:
                            sent = csent

                sents.append(sent)

            ctxt = ''.join(sents)
            if ctxt != txt:
                txts.append('{} -> {}'.format(txt, ctxt))

        write_file(os.path.join(RESOURCE_DIR, 'tmp', 'correct.result.txt'), txts)
Example #29
    def build(cls, corpus_file, special_chars=None):
        ctable = CharacterTable()

        chars = set()
        for line in iter_file(corpus_file):
            chars |= set(Tokenizer.token(line))

        chars.add(' ')

        if special_chars:
            for sc in special_chars:
                chars.add(sc)

        # TODO UNK WORD
        chars.add(CharacterTable.SYMBOL_UNK)

        ctable.chars = sorted(chars)

        ctable.char_indices = dict((c, i) for i, c in enumerate(ctable.chars))
        ctable.indices_char = dict((i, c) for i, c in enumerate(ctable.chars))

        return ctable
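Example #17 above calls self._ctable.encode(tokens, MAX_SENTENCE_LENGTH) and len(self._ctable), neither of which is shown on this page. The sketch below is one plausible one-hot encode consistent with that usage (an assumption, not the project's actual method): unknown tokens fall back to SYMBOL_UNK and sequences are truncated or zero-padded to num_rows.

import numpy as np

def encode(ctable, tokens, num_rows):
    # One-hot encode a token sequence into a (num_rows, vocab_size) matrix;
    # unknown tokens map to SYMBOL_UNK, rows past len(tokens) stay all-zero.
    x = np.zeros((num_rows, len(ctable.chars)), dtype=np.int8)
    unk = ctable.char_indices[CharacterTable.SYMBOL_UNK]
    for i, token in enumerate(tokens[:num_rows]):
        x[i, ctable.char_indices.get(token, unk)] = 1
    return x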
Example #30
    def build_train_data(self):

        histories = []
        next_chars = []

        # 对语料文本进行分字、标注拼音
        for line in iter_file(self._corpus_file):
            words, pinyins = self.segment_pinyin_txt(line)

            histories_, next_chars_ = self.build_history_nextchars(words)
            histories += histories_
            next_chars += next_chars_

        X = np.zeros((len(histories), self._maxlen, len(self._chars)), dtype=bool)
        y = np.zeros((len(histories), len(self._chars)), dtype=bool)

        for i, history in enumerate(histories):
            for t, char in enumerate(history):
                X[i, t, self.vocab_char2idx(char)] = 1
            y[i, self.vocab_char2idx(next_chars[i])] = 1

        return X, y