Example #1
def build_ansible_yaml(ansible, output):
    override_properties = set(["to_create", "to_update", "from_response"])
    data = []
    for rn, v in ansible.items():
        overrides = v.get("overrides")
        if not overrides:
            continue

        pros = []
        for p, v1 in overrides.items():
            e = set(v1.keys()) - override_properties
            if e:
                raise Exception("find unspported override properties(%s) "
                                "for resource(%s)" % (" ".join(e), rn))

            v2 = {"property": p}
            v2.update(v1)

            pros.append(v2)

        data.append({"name": rn, "properties": pros})

    s = pystache.Renderer().render_path("template/ansible.mustache",
                                        {"resources": data})

    write_file(output + "ansible.yaml", [s])
Example #2
def generate_case(feature, yaml_name):
    ct = Case_Templates(feature, yaml_name)

    maincase = ct.case_templates_main()

    # Create the test case file and write the case header
    yaml_name1 = yaml_name.split(".")[0]
    test_path = os.path.join(TEST_PATH, "{}_test.py".format(yaml_name1))
    create_file(test_path, maincase)

    # Create the module directory
    # feature_path = os.path.join(case_path(), feature)
    # mkdir(feature_path)

    # Read the yaml file and write the test method bodies
    try:
        rd = ReadFileData()
        yaml_all = rd.load_yaml(os.path.join(DATA_DIR, yaml_name))
        all_api_items = yaml_all.items()
        n = 0
        for k, v in all_api_items:
            n = n + 1
            yaml_title = k
            method = v['method']
            yaml_data = v['data']
            num = str(n).zfill(2)
            commoncase = ct.case_templates_common(yaml_title, method,
                                                  yaml_data, num)
            write_file(test_path, commoncase)
        print('File generation complete')
    except Exception as e:
        print(e)
        os.remove(test_path)  # On exception, delete the file created earlier
        print('File generation failed; the file has been deleted automatically')
Example #3
def do_parse(opt, filename):

    # Start the subprocess before the try block so the finally clause can
    # always drain and reap it.
    ls = subprocess.Popen([djvutoxml, filename], stdout=subprocess.PIPE,
                          preexec_fn=setrlimits, close_fds=True)
    try:

        page_nr = 1
        for event, elem in etree.iterparse(XmlFile(ls.stdout)):
            if elem.tag.lower() == 'object':
                page = OcrPage()
                if not opt.silent:
                    print >> sys.stderr, page_nr, '\r',
                page.start_page(elem)
                parse_page(page, elem, page_nr)
                page.end_page(elem)

                filename = opt.out_dir + 'page_%04d.hocr' % page_nr

                text = page.get_hocr_html().encode('utf-8')
                if opt.compress:
                    utils.compress_file_data(filename, text, opt.compress)
                else:
                    utils.write_file(filename, text)

                elem.clear()
                page_nr += 1

    finally:
        if not opt.silent:
            print >> sys.stderr

        ls.stdout.read()
        ls.wait()

    return ls.returncode
Example #4
    def test_extract(self):
        self.assertTrue(True)
        '''
        R = []
        source_dir = os.path.join(RESOURCE_DIR, 'parsed3')
        i = 0
        for d in os.listdir(source_dir):
            i += 1
            print(i)

            R += read_obj(os.path.join(source_dir, d))
            if len(R) >= 50000:
                break

        save_obj(R, os.path.join(RESOURCE_DIR, 'dp', 'dp.R'))
        '''

        R = read_obj(os.path.join(RESOURCE_DIR, 'dp', 'dp.R'))

        R = [sentence for _, parsed in R for sentence in parsed['sentences']]

        # R = ['价格实惠']

        print('Total sentence count:', len(R))

        O = {'不错', '漂亮', '流畅', '方便', '高', '持久'}

        F, O_expanded = double_propagation.extract(O, R, parsed=True)
        write_file(os.path.join(RESOURCE_DIR, 'dp', 'dp.features'), F)
        write_file(os.path.join(RESOURCE_DIR, 'dp', 'dp.opinions'), O_expanded)
Example #5
def build_ansible_yaml(info, output):
    data = []
    for v in info:
        config = v.get("custom_configs").get("ansible")
        if not config:
            continue

        r = {}
        examples = config.get("examples")
        if examples:
            _generate_example_config(examples, v, output)

        c = config.get("overrides")
        if c:
            _generate_override(c, v["api_info"], v["properties"],
                               v["resource_name"], r)

        if r:
            r["name"] = v["resource_name"]
            data.append(r)

    s = pystache.Renderer().render_path("template/ansible.mustache",
                                        {"resources": data})

    write_file(output + "ansible.yaml", [s])
Example #6
    def load_text(self, p, variant):
        filename = self.cache_dir + self.lang + '/' + str(p.latestRevision())

        if not os.path.exists(filename):
            html = self.get_html(p)
            new_html = common_html.get_head(
                u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'

            root = etree.fromstring(new_html.encode('utf-8'))
            exclude = set()
            html_id = self.config[variant]['modernize_div_id']

            for it in root.findall(
                    ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" %
                    html_id):
                exclude.add(it)

            text = self.get_etree_text(root, exclude)
            for d in self.config[variant]['transform']:
                text = re.sub(d[0], d[1], text)

            utils.write_file(filename, text)
        else:
            text = utils.read_file(filename)

        return text
Example #7
    def test_labelExtractor_batch(self):
        self.assertTrue(True)

        feature_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                                    '_result', 'features.revised')
        opinion_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                                    '_result', 'opinions.revised')

        label_extractor = LabelExtractor(feature_file,
                                         opinion_file,
                                         sentence_prob_threshold=-10)
        '''
        labels = label_extractor.extract_from_txt(txt)
        for label in labels:
            print(label)
        '''

        counter = Counter()
        results = []
        comment_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                                    'sbracelet.txt')
        for i, line in enumerate(utils.iter_file(comment_file)):

            print(i)

            if i > 100:
                break

            # Dependency parsing
            txts = clean.clean_txt2(line)
            relations = []
            for txt in txts:
                sentences = label_extractor.preprocess(txt)
                for sentence in sentences:
                    sent = parser.parse2sents(sentence)[0]
                    relation = ' '.join([str(r) for r in sent.relations])
                    relations.append(relation)

            # Extract labels
            labels = label_extractor.extract_from_txt(line)
            for label in labels:
                fo = label.feature + label.opinion
                counter.update([fo])

            # print(line, '->', labels)
            results.append(line)
            results.append('->')
            results += relations
            results.append('->')
            for label in labels:
                results.append(str(label))
            results.append('')

        utils.write_file(
            os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                         'labels.result.txt'), results)

        for fo, c in counter.most_common():
            print(fo, c)
Example #8
    def test_x1(self):
        self.assertTrue(True)

        lines = []
        for i, line in enumerate(utils.iter_file(os.path.join(RESOURCE_DIR, 'mobile', 'std.txt'))):
            if i < 50000:
                lines.append(line)

        utils.write_file(os.path.join(RESOURCE_DIR, 'mobile', 'std.5w.txt'), lines)
Example #9
def run(file_name, output):
    tables = _parse_word(file_name)
    data = [Struct(k).add_property(v).to_map() for k, v in tables.items()]

    s = pystache.Renderer().render_path("struct.mustache", {"structs": data})

    if not os.path.isdir(output):
        os.makedirs(output)

    f = normal_dir(output) + file_name.split("/")[-1].split(".")[0] + ".yaml"
    write_file(f, [s])
Example #10
    def test_load_mobile_words(self):
        self.assertTrue(True)

        sentiments = load_sentiment_words(os.path.join(RESOURCE_DIR, 'mobile', '1正面评价词_a+.txt'))
        sentiments |= load_sentiment_words(os.path.join(RESOURCE_DIR, 'mobile', '1负面评价词_a-.txt'))

        features = load_feature_word(os.path.join(RESOURCE_DIR, 'mobile', 'mobile.ontology'))

        print(sentiments)
        print(features)
        print('sentiment words size: {}, feature words size: {}'.format(len(sentiments), len(features)))

        utils.write_file(os.path.join(RESOURCE_DIR, 'mobile', 'mobile.words'), sentiments | features)
Example #11
    def create_standard_dataset(self):
        """
        Read the raw text, clean it, extract well-formed sentences,
        and write them to a file.
        :return:
        """
        sentences = []

        for line in iter_file(self._raw_file):
            txt = clean.clean_txt(line)
            sents = clean.extract_standard_sentences(txt)
            sentences += [sent for sent in sents if clean.is_meaningful(sent)]

        write_file(self._clean_file, sentences)
Example #12
    def test_normalize_revise_file(self):
        self.assertTrue(True)

        import html

        def tokens2str(tokens):
            return ' '.join(
                ['%s/%s' % (token.word, token.pos) for token in tokens])

        ss = []

        sb_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                               'sbracelet.txt')
        for i, line in enumerate(iter_file(sb_file)):
            print(i)

            # Unescape HTML entities, e.g. &hellip; => ……
            # html.unescape also handles numeric references such as &#039;,
            # which HTMLParser.unescape failed to convert here.
            line = html.unescape(line)

            for sentence in combParser.ssplit(line):
                tokens = combParser.pos(sentence, revise=False)
                s1 = tokens2str(tokens)
                # ss.append('jba1- ' + s1)

                tokens = combParser.pos(sentence, revise=True)
                s2 = tokens2str(tokens)
                # ss.append('jba2- ' + s2)

                if s1 != s2:
                    ss.append('jba1- ' + s1)
                    ss.append('jba2- ' + s2)

                # tokens = ltpParser.pos(line)
                # ss.append('ltp1- ' + tokens2str(tokens))
                #
                # PosReviser.revise(tokens)
                # ss.append('ltp2- ' + tokens2str(tokens))

            if i > 1000:
                break

        write_file(
            os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                         'sbracelet.pos.1.txt'), ss)
Example #13
def run_diff(ocr_text, text, opt):
    ocr_text = u"\n".join(ocr_text) + u"\n"
    temp1 = tempfile.NamedTemporaryFile(suffix = '.txt')
    utils.write_file(temp1.name, ocr_text)
    text = u"\n".join(text) + u"\n"
    temp2 = tempfile.NamedTemporaryFile(suffix = '.txt')
    utils.write_file(temp2.name, text)
    cmdline = "diff -U %d " % opt.diff_context + temp1.name + " " + temp2.name
    fd = os.popen(cmdline)
    diff = fd.read()
    fd.close()

    return unicode(diff, 'utf-8')
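For comparison, the same unified diff can be produced in-process with the standard library's difflib, avoiding the temp files and the diff subprocess (a sketch, not the method used above):

import difflib

def run_diff_difflib(ocr_text, text, context=3):
    # ocr_text and text are lists of lines, as in run_diff above.
    return u'\n'.join(difflib.unified_diff(ocr_text, text, lineterm='',
                                           n=context))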
Example #15
def _generate_example_config(examples, info, output):
    module_name = underscore("%s_%s_%s" %
                             (info["cloud_short_name"], info["service_type"],
                              info["resource_name"]))

    output += "examples/ansible/"
    if not os.path.isdir(output):
        os.makedirs(output)

    for f in examples:
        data = _build_example_render_info(info["config_dir"] + f, module_name,
                                          info["cloud_short_name"])

        s = pystache.Renderer().render_path(
            "template/ansible_example.mustache", data)

        write_file(output + os.path.basename(f), [s])
Example #16
    def test_sbd_file(self):
        self.assertTrue(True)

        model = SBDModel.load(
            keras_model_file=os.path.join(APP_RESOURCE_DIR, 'sbd.keras.model'))

        lines = []
        for line in iter_file(
                os.path.join(RESOURCE_DIR, 'tmp', 'comment.mobile.txt')):
            words = re.findall(r'[a-zA-Z0-9\u4e00-\u9fa5]', line)
            sent = ''.join(words)
            # sequence = model.predict_sequence(sent)
            pline = model.predict_txt(sent)
            lines.append('{} -> {}'.format(line, pline))
            print('{} -> {}'.format(line, pline))

        write_file(os.path.join(RESOURCE_DIR, 'tmp', 'sbd.result.txt'), lines)
Example #17
    def create_train_dataset(self):
        """
        Label the training data: split the text into characters, then tag
        each one. <E> marks the last character of a sentence; <M> marks a
        non-final character.

        特别好,发货很快,赞。 => <M> <M> <E> <M> <M> <M> <E> <E> 。
        :return:
        """
        lines = []

        for line in iter_file(self._clean_file):
            result = Labeler.label(line)
            token = ' '.join([t for t, _ in result])
            sequence = ' '.join([seq for _, seq in result])
            lines.append('%s\t%s' % (token, sequence))

        write_file(self._label_file, lines)
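Labeler is project code that is not shown here; a minimal sketch of a label function consistent with the docstring's scheme (the separator and sentence-ender character sets are assumptions):

def label(line):
    # Tag each character: <M> for a non-final character of a clause, <E> for
    # the character that closes one. Clause separators are dropped; sentence
    # enders are kept with themselves as the tag, as in the docstring example.
    separators = set(',,')
    enders = set('。.!!??')
    result = []
    for ch in line.strip():
        if ch in separators or ch in enders:
            if result and result[-1][1] == '<M>':
                result[-1] = (result[-1][0], '<E>')
            if ch in enders:
                result.append((ch, ch))
        else:
            result.append((ch, '<M>'))
    return result

With this sketch, label('特别好,发货很快,赞。') produces exactly the token/tag pairs shown in the docstring.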
Example #18
def run_test():
    lm = BaseLM(os.path.join(LM_MODEL_DIR, 'hanzi.arpa'))

    from common.utils import iter_file
    from common.utils import write_file

    probs = []
    for line in iter_file(os.path.join(RESOURCE_DIR, 'tmp',
                                       'comment.test.txt')):
        for sent in re.split(r'[,。?!?,]', line):
            words = re.findall(r'[a-zA-Z0-9\u4e00-\u9fa5]', sent)
            sent = ''.join(words)
            if sent:
                prob = lm.predict_prob(sent)
                probs.append((sent, prob))

    sort_probs = sorted(probs, key=lambda tp: tp[1])

    write_file(os.path.join(RESOURCE_DIR, 'tmp', 'result.txt'),
               ['{} {}'.format(p, s) for s, p in sort_probs])
Example #19
def _generate_api_yaml(api_path, product_info, tag_info, output):
    r = [_render_product(product_info)]

    api_yaml = read_yaml(api_path + "api.yaml")
    all_models = read_yaml(api_path + "models.yaml")

    for tag, v in tag_info.items():

        custom_configs = read_yaml(api_path + tag + ".yaml")

        api_info, properties = generate_resource_properties(
            api_yaml, all_models, tag, custom_configs
        )

        r.extend(
            build_resource_config(
                api_info, properties, v,
                custom_configs, product_info["service_type"])
        )

    write_file(output + "api.yaml", r)
Example #20
def find_adj(corpus_file, dest_file, max_lines=200000):
    """
    Rule: the word following 很/太/非常 ("very/too/extremely") is usually an
    adjective. Based on this rule, tokenize the corpus, extract the words
    that follow 很 and 非常, and take the intersection of the two sets.
    """
    reg_rules = [r'很\s(\S+)', r'非常\s(\S+)']
    counter1, counter2 = find_by_rule(corpus_file, reg_rules, max_lines)

    adjs = []

    for w in counter1:
        if w in counter2:
            adjs.append(w)

    adjs = sorted(adjs, key=lambda w: counter1[w], reverse=True)

    lines1 = [x for x in adjs]
    lines2 = ['{} {}'.format(x, counter1[x]) for x in adjs]
    lines3 = ['{} pos {}'.format(x, counter1[x]) for x in adjs]

    basename = os.path.join(os.path.dirname(dest_file),
                            os.path.splitext(os.path.basename(dest_file))[0])
    utils.write_file(dest_file, lines1)
    utils.write_file(basename + '.2.txt', lines2)
    utils.write_file(basename + '.3.txt', lines3)

    return lines1
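find_by_rule is assumed to scan a whitespace-tokenized corpus and count the capture group of each rule, yielding one Counter per rule; a minimal sketch under that assumption (the real helper may differ):

import re
from collections import Counter

def find_by_rule(corpus_file, reg_rules, max_lines):
    # One Counter per rule, counting the word captured after the trigger token.
    counters = [Counter() for _ in reg_rules]
    with open(corpus_file, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= max_lines:
                break
            for rule, counter in zip(reg_rules, counters):
                counter.update(re.findall(rule, line))
    return counters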
Example #21
    def test_show_count(self):
        self.assertTrue(True)

        ff_counter = utils.read_obj(os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.counter'))
        oo_counter = utils.read_obj(os.path.join(RESOURCE_DIR, 'mobile', 'count', 'oo.counter'))
        fo_counter = utils.read_obj(os.path.join(RESOURCE_DIR, 'mobile', 'count', 'fo.counter'))

        ff_dict = utils.read_obj(os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.dict'))
        oo_dict = utils.read_obj(os.path.join(RESOURCE_DIR, 'mobile', 'count', 'oo.dict'))
        fo_dict = utils.read_obj(os.path.join(RESOURCE_DIR, 'mobile', 'count', 'fo.dict'))

        print('-' * 10 + 'ff' + '-' * 10)
        for r, c in ff_counter.most_common(20):
            print(r, c)

        print('-' * 10 + 'oo' + '-' * 10)
        for r, c in oo_counter.most_common(20):
            print(r, c)

        print('-' * 10 + 'fo' + '-' * 10)
        for r, c in fo_counter.most_common(20):
            print(r, c)

        for relation in ff_dict:
            utils.write_file(os.path.join(RESOURCE_DIR, 'mobile', 'count', 'samples', 'ff_{}.txt'.format(relation)),
                             ff_dict[relation])

        for relation in oo_dict:
            utils.write_file(os.path.join(RESOURCE_DIR, 'mobile', 'count', 'samples', 'oo_{}.txt'.format(relation)),
                             oo_dict[relation])

        for relation in fo_dict:
            utils.write_file(os.path.join(RESOURCE_DIR, 'mobile', 'count', 'samples', 'fo_{}.txt'.format(relation)),
                             fo_dict[relation])
Example #22
    def run(self):
        gallery = Gallery().search(tgid=self.chat_id)
        if gallery:
            newfile = self.bot.getFile(self.update.message.document.file_id)
            file_name = self.update.message.document.file_id
            newfile.download(file_name)
            writed = False
            if os.path.exists(file_name):
                writed = write_file(file_name,
                                    read_file(file_name, storage='local',
                                              append_path=False),
                                    acl='public-read',
                                    mime_type=self.update.message.document.mime_type)
                thumbnail(file_name)
                os.remove(file_name)
                write_file('%s.json' % file_name, self.update.to_json())
            if writed:
                file_id = File(gallery_eid=gallery.eid.value,
                               file_id=self.update.message.document.file_id)
                file_id.save()
                sendLink = getattr(gallery, 'sendLink', None)
                if sendLink and sendLink.value:
                    self.text = 'File URL: %s' % url_for(
                        'image', file_id=file_id.eid.value, _external=True,
                        disable_web_page_preview=True)
            else:
                self.text = 'Failed to download file'
        else:
            self.text = 'Gallery does not exist, please create first'
Example #23
    def test_correct2(self):
        self.assertTrue(True)

        txts = []

        for i, line in enumerate(iter_file(os.path.join(RESOURCE_DIR, 'tmp', 'comment.mobile.tiny.txt'))):

            if i % 100 == 0:
                print(i)

            txt = std.extract_txt(line)

            sents = []
            for sent in parser.ssplit(line):

                # Extract Chinese, English letters, and digits
                sent = std.extract_txt(sent)

                if not sent:
                    continue

                # Error correction, applied to Chinese-only sentences
                if not re.findall(r'[a-zA-Z0-9]', sent):
                    csent = std.wed(sent)
                    if sent != csent:
                        sent_prob = std.prob(sent)
                        csent_prob = std.prob(csent)

                        # Accept the correction only if the corrected text is more probable
                        if csent_prob > sent_prob:
                            sent = csent

                sents.append(sent)

            ctxt = ''.join(sents)
            if ctxt != txt:
                txts.append('{} -> {}'.format(txt, ctxt))

        write_file(os.path.join(RESOURCE_DIR, 'tmp', 'correct.result.txt'), txts)
Example #24
def build_terraform_yaml(info, output):
    override_properties = set(["to_create", "to_update", "from_response"])
    data = []
    for rn, v in info.items():
        r = {}
        config = v.get("config")

        example = config.get("example")
        if example:
            e = _generate_example_config(example, v)
            if e:
                r.update(e)

        overrides = config.get("overrides")
        if overrides:
            pros = []
            for p, v1 in overrides.items():
                e = set(v1.keys()) - override_properties
                if e:
                    raise Exception("find unspported override properties(%s) "
                                    "for resource(%s)" % (" ".join(e), rn))

                v2 = {"property": p}
                v2.update(v1)

                pros.append(v2)

            r["properties"] = pros
            r["has_property_override"] = True

        if r:
            r["name"] = rn
            data.append(r)

    s = pystache.Renderer().render_path("template/terraform.mustache",
                                        {"resources": data})

    write_file(output + "terraform.yaml", [s])
Example #25
def _generate_yaml(api_path, product_info, tag_info, output):
    r = [_render_product(product_info)]

    platform_config = []

    api_yaml = read_yaml(api_path + "api.yaml")
    all_models = read_yaml(api_path + "models.yaml")

    for tag, v in tag_info.items():

        custom_configs = read_yaml(api_path + tag + ".yaml")

        api_info, properties = generate_resource_properties(
            api_yaml, all_models, tag, custom_configs)

        argv = {
            "config_dir": api_path,
            "api_info": api_info,
            "all_models": all_models,
            "properties": properties,
            "service_type": product_info["service_type"],
            "resource_name": _get_resource_name(v, custom_configs),
            "version": _get_version(api_info),
            "resource_desc": v.get("description", ""),
            "custom_configs": custom_configs,
            "cloud_full_name": product_info["cloud_full_name"],
            "cloud_short_name": product_info["cloud_short_name"],
        }

        r.extend(build_resource_config(**argv))

        r.extend(build_resource_api_config(**argv))

        platform_config.append(argv)

    write_file(output + "api.yaml", r)

    _generate_platform_yaml(platform_config, all_models, output)
Example #27
    def test_normalize_revise_file_count(self):
        self.assertTrue(True)

        import html

        counter = Counter()
        dd = defaultdict(set)

        def tokens2str(tokens):
            return ' '.join(
                ['%s/%s' % (token.word, token.pos) for token in tokens])

        total = 0
        rc = 0
        dayu10 = 0
        sb_file = os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                               'sbracelet.txt')
        for i, line in enumerate(iter_file(sb_file)):

            # Unescape HTML entities, e.g. &hellip; => ……
            # html.unescape also handles numeric references such as &#039;.
            line = html.unescape(line)

            for sentence in combParser.ssplit(line):
                total += 1

                tokens1 = combParser.pos(sentence, revise=False)
                s1 = tokens2str(tokens1)
                # ss.append('jba1- ' + s1)

                if len(tokens1) < 10:
                    dayu10 += 1
                    tokens2 = combParser.pos(sentence, revise=True)
                else:
                    tokens2 = tokens1

                s2 = tokens2str(tokens2)
                # ss.append('jba2- ' + s2)

                for t1, t2 in zip(tokens1, tokens2):
                    if t1.pos != t2.pos:
                        rc += 1

                        fmt = '%s -> %s' % ('%s/%s' %
                                            (t1.word, t1.pos), '%s/%s' %
                                            (t2.word, t2.pos))
                        counter.update([fmt])
                        dd[fmt].add((s1, s2))

                # if s1 != s2:
                #     ss.append('jba1- ' + s1)
                #     ss.append('jba2- ' + s2)

                # tokens = ltpParser.pos(line)
                # ss.append('ltp1- ' + tokens2str(tokens))
                #
                # PosReviser.revise(tokens)
                # ss.append('ltp2- ' + tokens2str(tokens))

            if i > 20000:
                break

        for x, c in counter.most_common():
            print(x, c)

        lines = []
        for x, c in counter.most_common():
            lines.append('-------%s %d-------' % (x, c))
            samples = dd[x]
            for sample in samples:
                lines.append(sample[0])
                lines.append(sample[1])

        write_file(
            os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                         'sbracelet.pos.2.txt'), lines)

        print('total: %d, revise count: %d, dayu10: %d' % (total, rc, dayu10))
        # Transition counts between POS tags, e.g.
        # {'好': {'好/d': 10, '好/a': 14}}
        cdict = defaultdict(lambda: defaultdict(int))
        for s, c in counter.most_common():
            w = s.split('->')[1].strip().split('/')[0]
            cdict[w][s] += c

        lines = []
        for w, ss in sorted(cdict.items(),
                            key=lambda tp: sum(tp[1].values()),
                            reverse=True):
            lines.append('-----------%s-----------' % w)
            for p, c in ss.items():
                lines.append('%s %d' % (p, c))

        write_file(
            os.path.join(RESOURCE_DIR, 'tmp', 'sbracelet',
                         'sbracelet.pos.3.txt'), lines)
Example #28
    def run(self, pinglun_file, O_seeds):
        """
        Extract feature words and opinion words.
        :param pinglun_file: comment text file
        :param O_seeds: seed opinion words
        :return:
        """
        logger.info('pipeline run...')

        if not os.path.exists(self._clean_file):
            logger.info('Cleaning text')
            clean.clean_file(pinglun_file, self._clean_file)

        if not os.path.exists(self._relation_file):
            logger.info('Dependency parsing')
            relation_parse.parse(self._clean_file, self._relation_file)

        logger.info('Extracting feature/opinion words (double propagation)')
        S = self._iter_sentences_relations(self._relation_file)
        F, O, fcounter, ocounter, rcount = double_propagation.extract(
            O_seeds, S)

        utils.write_file(self._dp_f_file, F)
        utils.write_file(self._dp_o_file, O)
        utils.save_obj(fcounter, self._dp_f_counter)
        utils.save_obj(ocounter, self._dp_o_counter)

        logger.info('Pruning feature/opinion words')
        F, O = prune.prune(F, O, fcounter, ocounter, rcount, self._threshold)

        utils.write_file(self._prune_f_file, F)
        utils.write_file(self._prune_o_file, O)

        if not os.path.exists(self._word2vec_file):
            logger.info('Training word2vec model')
            T = self._iter_sentences_tokens(self._relation_file)
            w2c.train(T, self._word2vec_file)

        model = w2c.get(self._word2vec_file)

        logger.info('Clustering feature words')
        cf = cluster.create(F, model, preference=-30)
        features = ['%s %s' % (cls, ' '.join(cf[cls])) for cls in cf]
        utils.write_file(self._feature_file, features)

        logger.info('Clustering opinion words')
        O = utils.read_file(self._prune_o_file)
        of = cluster.create(O, model, preference=None)
        opinions = ['%s %s' % (cls, ' '.join(of[cls])) for cls in of]
        utils.write_file(self._opinion_file, opinions)

        logger.info('pipeline done.')

        return cf, of, F, O
Example #29
def telegramWebHook():
    update = Update.de_json(request.get_json(force=True))
    text = None
    if getattr(update.message, 'document'):
        gallery = Gallery().search(tgid = update.message.chat.id)
        if gallery:
            newfile = bot.getFile(update.message.document.file_id)
            file_name = update.message.document.file_id
            newfile.download(file_name)
            writed = False
            if os.path.exists(file_name):
                writed = write_file(file_name, read_file(file_name, storage = 'local', append_path = False), acl = 'public-read', mime_type = update.message.document.mime_type)
                thumbnail(file_name)
                os.remove(file_name)
                write_file('%s.json' % file_name, update.to_json())
            if writed:
                file_id = File(gallery_eid = gallery.eid, file_id = update.message.document.file_id)
                file_id.save()
                sendLink = getattr(gallery, 'sendLink', None)
                if sendLink == 'True':
                    text = 'File URL: %s' % url_for('image', file_id = file_id.eid, _external = True, disable_web_page_preview = True)
            else:
                text = 'Failed to download file'
        else:
            text = 'Gallery does not exist, please create first'
        pass
    if getattr(update.message, 'text'):
        args = update.message.text.split(' ', 2)
        if args[0] == '/register':
            text = 'Username:'******'Complete register: https://telegram.me/ACSGalleryBot?start=%s' % update.message.from_user.id
            else:
                text = 'User added to gallery'
            # set gallery permission at this point because i have chat id
        elif args[0] == '/start':
            if len(args) > 1 and int(args[1]) == int(update.message.chat.id):
                text = 'Username:'******'force_reply' : True })
            else:
                text = update.to_json()

        elif getattr(update.message, 'reply_to_message'):
            if update.message.reply_to_message.text == 'Username:'******'Password:'******'force_reply' : True })
                return 'ok'
            elif update.message.reply_to_message.text == 'Password:'******'User successfully registered'
        elif args[0] == '/create':
            if hasattr(update.message.chat, 'title'):
                gallery = Gallery().search(tgid = update.message.chat.id)
                if not gallery:
                    gallery = Gallery(tgid = update.message.chat.id, title = update.message.chat.title).save()
                text = 'Gallery URL: %s' % url_for('gallery', id = gallery.eid, _external = True, _scheme = 'https')
            else:
                text = 'Bot only works in groups'
        elif args[0] == '/remove':
            gallery = Gallery().search(tgid = update.message.chat.id)
            if gallery:
                gallery.delete()
                text = 'Gallery deleted'
            else:
                text = 'Gallery is not registered'
            # TODO: Confirm
        elif args[0] == '/config':
            args.pop(0)
            gallery = Gallery.search(tgid = update.message.chat.id)
            if gallery:
                if len(args) == 0:
                    text = g.config(update.message.chat.id)
                elif len(args) == 1:
                    text = 'get one'
                    text = g.config(update.message.chat.id, args[0])
                else:
                    text = g.config(update.message.chat.id, args[0], args[1])
            else:
                text = 'Gallery is not registered'
        #else:
        #    text = update.to_json()
    if text:
        bot.sendMessage(update.message.chat.id, text, disable_web_page_preview=True)
    return ""