Example no. 1
    def predict(self):
        f = open("model/data_map.pkl", "rb")
        maps = cPickle.load(f)
        f.close()
        self.batch_size = 1
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(FLAGS.spm)
        self.train_length = 10

        self.tag_map = maps.get("tag_map", {})
        self.nums_tags = len(self.tag_map.values())
        self.__creat_model()
        with tf.Session() as sess:
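            # restore the latest checkpoint if one exists, otherwise initialize all variables from scratch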
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                print("[->] restore model")
                self.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print("[->] no model, initializing")
                sess.run(tf.global_variables_initializer())

            trans = self.trans.eval()
            dataset = []

            with open('data/test_raw_big.txt', 'r', encoding="utf-8") as f:
                for data in f.readlines():
                    ent, raw_con, info = data.replace('\n', '').split('\t')
                    dataset.append([ent, raw_con, info])
            for ele in dataset:
                info = ele[2]
                text = info

                feed = self.prepare_xlnet_pred_data(text)

                paths, length = sess.run([self.pred_ids, self.length],
                                         feed_dict=feed)

                print(format_tags(paths[0], self.tag_map))
                org = get_tags(paths[0], "ORG", self.tag_map)
                org_entity = format_result(org, text, "ORG")
                per = get_tags(paths[0], "PER", self.tag_map)
                per_entity = format_result(per, text, "PER")
                loc = get_tags(paths[0], "LOC", self.tag_map)
                loc_entity = format_result(loc, text, "LOC")

                resp = org_entity["entities"] + per_entity[
                    "entities"] + loc_entity["entities"]
                ele.append(str(resp))

            with open('data/test_result.txt', 'w', encoding="utf-8") as f1:
                for ele in dataset:
                    f1.write(ele[2])
                    f1.write('\t')
                    f1.write(ele[1])
                    f1.write('\t')
                    f1.write(ele[3])
                    f1.write('\n')
Example no. 2
    def predict(self):
        f = open("data/data_map.pkl", "rb")
        maps = cPickle.load(f)
        f.close()
        self.vocab = maps.get("vocab", {})
        self.tag_map = maps.get("tag_map", {})
        self.nums_tags = len(self.tag_map.values())
        self.input_size = maps.get("input_size", 10000) + 1
        self.__creat_model()
        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                print("[->] restore model")
                self.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print("[->] no model, initializing")
                sess.run(tf.global_variables_initializer())

            trans = self.trans.eval()
            while True:
                text = input(" > ")
                feed = self.prepare_pred_data(text)

                logits, length = sess.run([self.logits, self.length],
                                          feed_dict=feed)
                paths = self.decode(logits, length, trans)
                org = get_tags(paths[0], "ORG", self.tag_map)
                org_entity = format_result(org, text, "ORG")
                per = get_tags(paths[0], "PER", self.tag_map)
                per_entity = format_result(per, text, "PER")

                resp = org_entity["entities"] + per_entity["entities"]
                print(json.dumps(resp, indent=2, ensure_ascii=False))
Example no. 3
def image_search(request):
    print(request)
    if request.method == 'GET':

        utils.get_tags(request.GET.get('image_url'))
        classification = utils.run_nmf()
        response = {"id": classification}
        return HttpResponse(json.dumps(response), content_type="application/json")
    return None
Example no. 4
    def predict(self, input_str="", input_path=None):
        if input_path is not None:
            tests = pd.read_csv(input_path)
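            # batch mode: tag every review in the CSV and write one "<id>,<entity type>,<entity word>" line per extracted entity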
            with open('output.txt', 'w', encoding='utf-8') as o:
                #o.write('id,aspect,opinion\n')
                for ids in range(1, 2235):
                    input_str = self.get_string(
                        str(tests.loc[ids - 1:ids - 1, ['Review']]))
                    index = int(
                        self.get_string(str(tests.loc[ids - 1:ids - 1,
                                                      ['id']])))
                    input_vec = [self.vocab.get(i, 0) for i in input_str]
                    # convert to tensor
                    if self.use_gpu:  # GPU acceleration
                        sentences = torch.tensor(input_vec).view(1, -1).cuda()
                    else:
                        sentences = torch.tensor(input_vec).view(1, -1)
                    _, paths = self.model(sentences)

                    entities = []
                    for tag in self.tags:
                        tags = get_tags(paths[0], tag, self.tag_map)
                        entities += format_result(tags, input_str, tag)
                    entities = sorted(entities, key=lambda x: x['start'])
                    #print(str(index) + "  " + input_str + " " +str(len(entities)))
                    for entity in entities:
                        #print(entity)
                        o.write(
                            str(index) + ',' + entity['type'] + ',' +
                            entity['word'] + '\n')
        else:
            if not input_str:
                input_str = input("请输入文本: ")
            input_vec = [self.vocab.get(i, 0) for i in input_str]
            # convert to tensor
            if self.use_gpu:  # GPU acceleration
                sentences = torch.tensor(input_vec).view(1, -1).cuda()
            else:
                sentences = torch.tensor(input_vec).view(1, -1)
            _, paths = self.model(sentences)

            entities = []
            for tag in self.tags:
                tags = get_tags(paths[0], tag, self.tag_map)
                entities += format_result(tags, input_str, tag)
            return entities
    def test(self):
        with torch.no_grad():
            id2vocab = {self.vocab[i]: i for i in self.vocab}
            print(len(id2vocab))
            f = open('./result/test_tag.json', 'w')
            total_matrix = np.zeros(
                [len(self.tags), 3]
            )  # rows correspond to component, disease&symptom, people; columns correspond to recall, precision, f1
            count = 0
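            # accumulate per-tag recall / precision / F1 over every dev batch; averaged by batch count below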
            for batch in self.dev_manager.get_batch():
                count += 1
                print(count)
                #                 print(type(items))
                sentences, labels, length = zip(*batch)
                #             sentences, labels, length = zip(*self.dev_batch.__next__())
                #                 print('I am in')
                strs = [[id2vocab[w] for w in s] for s in sentences]
                #                 print(strs)
                #                 print(len(sentences),len(sentences[0]),len(sentences[5]))
                _, paths = self.model(sentences)
                #                 print("\teval")
                #                 print('path',len(paths),len(paths[0]),len(paths[1]))
                for i in range(len(self.tags)):
                    recall, precision, f1 = f1_score(labels, paths,
                                                     self.tags[i],
                                                     self.model.tag_map)
                    total_matrix[i][0] += recall
                    total_matrix[i][1] += precision
                    total_matrix[i][2] += f1
                entities = []
                for i in range(len(paths)):
                    tmp = []

                    for tag in self.tags:
                        tags = get_tags(paths[i], tag, self.tag_map)
                        tmp += format_result(tags, strs[i], tag)
                    entities.append(tmp)

    #             print(entities)
                for i in range(len(entities)):
                    dic = {
                        'sentense': ''.join(strs[i]),
                        'entities': entities[i]
                    }
                    json.dump(dic, f, ensure_ascii=False)


#                     f.write(''.join(strs[i])+'#####找到的实体为#####'+'&'.join(entities[i])+'\n')
            total_matrix /= count
            #             print(total_matrix)
            for i in range(len(self.tags)):
                print(
                    "{}\tcount\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}"
                    .format(count, self.tags[i], total_matrix[i][0],
                            total_matrix[i][1], total_matrix[i][2]))
            f.close()
Example no. 6
    def predict(self):
        f = open("model/data_map.pkl", "rb")
        maps = cPickle.load(f)
        f.close()
        self.batch_size = 1
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(FLAGS.spm)
        self.train_length = 10

        self.tag_map = maps.get("tag_map", {})
        self.nums_tags = len(self.tag_map.values())
        self.__creat_model()
        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                print("[->] restore model")
                self.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print("[->] no model, initializing")
                sess.run(tf.global_variables_initializer())

            trans = self.trans.eval()
            while True:
                text = input(" > ")

                feed = self.prepare_xlnet_pred_data(text)

                paths, length = sess.run([self.pred_ids, self.length],
                                         feed_dict=feed)

                print(format_tags(paths[0], self.tag_map))
                org = get_tags(paths[0], "ORG", self.tag_map)
                org_entity = format_result(org, text, "ORG")
                per = get_tags(paths[0], "PER", self.tag_map)
                per_entity = format_result(per, text, "PER")
                loc = get_tags(paths[0], "LOC", self.tag_map)
                loc_entity = format_result(loc, text, "LOC")

                resp = org_entity["entities"] + per_entity[
                    "entities"] + loc_entity["entities"]
                print(json.dumps(resp, indent=2, ensure_ascii=False))
Example no. 7
    def predict(self, input_str=""):
        if not input_str:
            input_str = input("请输入文本: ")
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # convert to tensor
        sentences = torch.tensor(input_vec).view(1, -1)
        _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        return entities
Example no. 8
    def predict(self, tag, input_str=""):
        model.load_state_dict(torch.load("./model/params.pkl"))
        if not input_str:
            input_str = input("请输入文本: ")
        input_vec = [word2id.get(i, 0) for i in input_str]
        # convert to tensor
        sentences = torch.tensor(input_vec).view(1, -1)
        paths = model(sentences)

        entities = []
        tags = get_tags(paths[0], tag, tag2id)
        entities += format_result(tags, input_str, tag)
        print(entities)
Example no. 9
def run_model(model_name, vocab, tf):
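    # "tf" here is the per-document term-frequency structure, not TensorFlow; weight each document
    # with BM25 or TF-IDF and keep its top-scoring words as generated tags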
    #(vocab,tf) = utils.read_corpus()
    N = len(tf) # number of docs in corpus
    weights = []
    if model_name == "bm25":
        weights = bm25(tf)
    elif model_name == "tfidf":
        weights = tfidf(tf)

    top_words = []
    for i in range(N):
        gen_tags = utils.get_tags(vocab, weights, i)
        top_words += gen_tags
    return top_words
Example no. 10
def run_model(model_name, vocab, tf):
    speech_info = utils.read_speech_info()
    N = len(speech_info) # number of docs in corpus
    weights = []
    if model_name == "bm25":
        weights = bm25(tf)
    elif model_name == "tfidf":
        weights = tfidf(tf)

    top_words = {}
    for i in range(N):
        gen_tags = utils.get_tags(vocab, weights, i)
        top_words[speech_info[i]] = gen_tags
    return top_words
Example no. 11
def run_model(model_name, vocab, tf):
    #(vocab,tf) = utils.read_corpus()
    N = len(tf)  # number of docs in corpus
    weights = []
    if model_name == "bm25":
        weights = bm25(tf)
    elif model_name == "tfidf":
        weights = tfidf(tf)

    top_words = []
    for i in range(N):
        gen_tags = utils.get_tags(vocab, weights, i)
        top_words += gen_tags
    return top_words
def get_lastest_tag(repo_name, today):
    logger.debug('Init config')

    config = init_config(repo_name)

    environment = config['environments'][env_name]

    env_key = environment['env_key'][repo_name]

    tag_re_patten = config['tag_re_patten']

    logger.debug('Searched tag patten is {}'.format(tag_re_patten))

    git_path = git_folder_path + repo_name

    logger.debug('Git path is {}'.format(git_path))

    logger.debug('Init repo')

    repo = git.Repo.init(path=git_path)

    repo.git.fetch()

    logger.debug('Fetch remote tags')

    all_tag = repo.git.ls_remote('--tags')

    tags = get_tags(all_tag)

    logger.debug('Start to find latest tag')

    tag_re_patten = tag_re_patten.format(env_key, today)

    tag_name = find_latest_tag(tag_re_patten, tags)

    if tag_name is None:
        logger.warning(
            'Can\'t find matched tag of {}. Maybe no tag today, or check tag patten.'
            .format(repo_name))
        return None

    logger.info('The latest tag of {} is {}'.format(repo_name, tag_name))

    return tag_name
Example no. 13
    def predict(self, input_str=""):
        if not input_str:
            input_str = input("请输入文本: ")
        # look up the vocab index of every character in the input sentence
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # convert to tensor
        sentences = torch.tensor(input_vec, dtype=torch.long).view(1, -1)
        sentences = sentences.cuda()
        # paths: the predicted tag indices, shape [1, 1]
        _, paths = self.model(sentences)

        entities = []
        # "tags": ["ORG", "PER"]
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        print(entities)
        print(json.dumps(entities, indent=4, ensure_ascii=False))
        return entities
    def predict(self, path):  #, input_str=""):
        #         if not input_str:
        #             input_str = input("请输入文本: ")
        sentences = []
        with open('./data/' + path + '.txt', 'r', encoding='utf-8') as f:
            for i in f:
                sentences += i.strip().split('。')
        f = open('./result/tag_' + path + '.json', 'w')
        for input_str in sentences:
            input_vec = [self.vocab.get(i, 0) for i in input_str]
            # convert to tensor
            # bind the tensor to a new name so the outer `sentences` list being iterated is not shadowed
            sentence_tensor = torch.tensor(input_vec).view(1, -1)
            _, paths = self.model(sentence_tensor)

            entities = []
            for tag in self.tags:
                tags = get_tags(paths[0], tag, self.tag_map)
                entities += format_result(tags, input_str, tag)
            dic = {'sentense': input_str, 'entities': entities}
            json.dump(dic, f, ensure_ascii=False)
        f.close()
Example no. 15
def retrieve_products_for_interest(interest):
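    # scrape the interest's listing page, then fetch up to 100 product pages and persist each Product
    # (title, image, price, and tags generated from the description)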
    list_url = "{}{}/{}-gifts{}".format(BASE_URL, LIST_URL, interest,
                                        QUERY_STR)
    html = retrieve_data("uncommon-goods.{}.html".format(interest), list_url)
    soup = bs(html, "html.parser")
    prod_links = [link["href"] for link in soup.select("article.product a")]

    for link in prod_links[:100]:
        prod_link = "{}{}".format(BASE_URL, link)
        fname = "{}.{}.html".format("uncommon-goods", link_to_fname(link))

        print("Fetching {}...".format(prod_link))
        html = retrieve_data(fname, prod_link)
        soup = bs(html, "html.parser")

        try:
            title = soup.find("h1", {"itemprop": "name"}).get_text()
            title = clean_whitespace(title)
            description = soup.select_one(".theStoryCopy p").get_text()
            description = clean_whitespace(description)
            image = soup.select_one("a#mainImage img")["src"]
            if not image.startswith("http"):
                image = "{}{}".format(BASE_URL, image)
            price = soup.find("span", {"itemprop": "price"}).get_text()
            price = float(clean_whitespace(price))
            tags = get_tags(description)
            product = Product(title,
                              "{}{}".format(BASE_URL, link),
                              image,
                              interest,
                              tags,
                              description,
                              price=price)
            product.dump()
        except Exception as e:
            print("ERROR:", e)
        print("")
Example no. 16
# -*- coding:utf-8 -*-
from .api_base import JsonHandler
from utils import get_tags, get_tags_v2, get_tags_parents, get_tags_v3
# get_tags_v2_by_name
from db import Tag, Share, User
import tornado
import time
import copy
d_tags = get_tags()
d_tags_v2 = get_tags_v2()
d_tags_v3 = get_tags_v3()

d_tags_parents = get_tags_parents()


# get_tags_v2_by_name
class TagsV2Handler(JsonHandler):
    def get(self):
        ver = self.get_argument("ver", 3)
        name = self.get_argument("name", '')
        sid = self.get_argument("id", 0)
        ver = int(ver)
        sid = int(sid)
        # parents [0]
        if name or sid:
            # a specific tag
            if not name and sid:
                tag = Tag.by_sid(sid)
                name = tag['name']
            self.res = d_tags_v3.get(name, {})
Example no. 17
    def get(self):
        page = self.get_argument("page", 1)
        per_page = self.get_argument("per_page", 10)
        meta_info = self.get_argument("meta_info", None)
        my_tags = self.get_argument("my_tags", None)
        tag = self.get_argument('tag', '')

        per_page = int(per_page)
        page = int(page)
        user = None
        tags = None

        token = self.request.headers.get('Authorization', '')
        if token:
            key, token = token.split()
            if key == 'token' and token:
                user_json = self.get_secure_cookie('user', token)
                if user_json:
                    user = json_decode(user_json)
        else:
            user_json = self.get_secure_cookie("user")
            if user_json:
                user = json_decode(user_json)

        print(user)
        print(my_tags)
        if user and my_tags:
            d_user = User.by_sid(user['user_id'])
            print(d_user, 1111)
            if d_user:
                print(d_user['user_tags'])
                tags = d_user['user_tags']

        vote_open = self.get_argument("vote_open", None)
        has_vote = self.get_argument("has_vote", None)
        cond = {}
        if tags:
            print('1111111111111', tags)
            cond['tags'] = {"$in": tags}

        elif tag:
            cond['tags'] = tag
        if user:
            logger.info('user_id: {}'.format(user['user_id']))
        if user and user['user_id'] in wx_admin_ids:
            cond['status'] = {'$gte': 1}
        else:
            cond['status'] = {'$gte': 1}
        if vote_open:
            if not vote_open.isdigit():
                return self.write_error(422)
            cond['vote_open'] = int(vote_open)
        if has_vote:
            cond['vote_title'] = {'$ne': ''}

        number = Share.find(cond, {'_id': 0}).count()
        shares = Share.find(cond, {'_id': 0}).sort(
            '_id', -1).limit(per_page).skip((page - 1) * per_page)
        shares = [fix_share(share) for share in shares]
        # if tag:
        #     shares = [share for share in shares if tag in share['tags']]
        meta = {}
        if meta_info and tag:
            d_tags = get_tags()
            # d_tags_parent = get_tags_parent()
            d_tags_parents = get_tags_parents()

            if tag in d_tags:
                sub_tags = []
                print(d_tags[tag])
                for name in d_tags[tag]:
                    num = Share.find({'tags': name}, {'_id': 0}).count()
                    num_recent = Share.find(
                        {'tags': name, 'published': {'$gt': time.time()-86400*30}}, {'_id': 0}).count()
                    info = {}
                    info['name'] = name
                    info['num'] = num
                    info['num_recent'] = num_recent
                    sub_tags.append(info)
                meta['sub_tags'] = sub_tags
            meta['parent_tags'] = []
            if tag in d_tags_parents:
                # hypernym
                # meta['parent_tags'].append(d_tags_parent[tag])
                meta['parent_tags'] = d_tags_parents[tag]

        self.res = list(shares)
        self.meta = meta
        print(meta)
        # number=len(self.res)
        return self.write_json(number=number)
Example no. 18
def main(repo_name):
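    # cut a new release branch from today's latest matching tag, push it, and open a pull request toward the configured UAT branch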
    logger.debug('init config')

    config = init_config(repo_name)

    environment = config['environments'][env_name]

    env_key = environment['env_key'][repo_name]

    today = datetime.date.today().strftime('%Y%m%d')

    logger.debug('get today is ' + today)

    git_path = git_folder_path + repo_name

    logger.info('git path is ' + git_path)

    logger.debug('init repo')

    repo = git.Repo.init(path=git_path)

    # Need prune for branch is deleted then created with same name.
    repo.git.fetch('--prune')

    logger.debug('get remote tags')

    all_tag = repo.git.ls_remote('--tags')

    tags = get_tags(all_tag)

    logger.debug('find latest tag')

    tag_re_patten = config['tag_re_patten']

    tag_re_patten = tag_re_patten.format(env_key, today)

    tag_name = find_latest_tag(tag_re_patten, tags)

    if tag_name is None:
        raise TagNotFoundException()

    logger.info('latest tag is ' + tag_name)

    logger.debug('find latest branch')

    branches = repo.git.branch('-r').split('\n')

    merged_branch_name_re_patten = config[
        'merged_branch_name_re_patten'].format(env_key, today)

    latest_branch = find_latest_branch(merged_branch_name_re_patten, branches)

    branch_index = get_branch_index(latest_branch)

    new_branch_name = config['new_branch_name_patten'].format(
        env_key, today, branch_index)

    logger.info('new branch is ' + new_branch_name)

    logger.debug('check branch is exists or not')

    is_exists = some(branches, lambda b: new_branch_name in b)

    if is_exists:
        raise BranchIsExistException(new_branch_name)
    else:
        logger.debug('branch is not exists')

    logger.debug('create branch')

    repo.git.checkout(tag_name, '-b', new_branch_name)

    logger.debug('push branch')

    repo.git.push('origin', new_branch_name)

    logger.debug('checkout to dev')

    repo.git.checkout(source_branch)

    logger.debug('get branch diff commit')

    all_log = repo.git.log(
        'origin/{}..origin/{}'.format(config['uat_branch'], new_branch_name),
        '--oneline', '--no-merges')

    logger.debug('build pull request desc')

    pr_desc = build_pr_desc(all_log)

    logger.debug('create request service')

    request_service = RequestService(config['host'], config['headers'], auth)

    logger.debug('get reviewers')

    uat_branch = config['uat_branch']

    default_reviewers_api = config['default_reviewers_api'].format(repo_name)

    reviewers = get_reviewers(request_service, default_reviewers_api,
                              uat_branch, new_branch_name)

    logger.debug('build pull request obj')

    pr_obj = build_pr_obj(new_branch_name, uat_branch, pr_desc, reviewers)

    logger.debug('post to create pull request')

    pull_requests_api = config['pull_requests_api'].format(repo_name)

    rs = post_pr(request_service, pull_requests_api, pr_obj)

    if rs.status_code != 201:
        logger.error('{} {} create pull request failed.'.format(
            repo_name, new_branch_name))
        status_code = rs.status_code
        result = json.loads(rs.text, encoding='utf-8')
        message = result['errors'][0]['message']
        raise BitbucketException(status_code, message, new_branch_name)

    logger.info('create pull request success.')
    logger.info('finish')
Example no. 19
    def get(self):
        token = self.request.headers.get('Authorization', '')
        page = self.get_argument("page", 1)
        per_page = self.get_argument("per_page", 10)
        tag = self.get_argument('tag', '')
        filter_type = self.get_argument("filter_type", '')  # my_tags my_likes
        last_suggested = self.get_argument("last_suggested", 0)
        read_status = self.get_argument('read_status', 1)
        meta_info = self.get_argument("meta_info", 1)

        read_status = int(read_status)
        per_page = int(per_page)
        page = int(page)
        if not last_suggested:
            last_suggested = 0
        last_suggested = float(last_suggested) / 1000 + 1

        user = self.get_user_dict(token)

        cond = {}
        tags = None
        if user and filter_type == 'my_tags':
            d_user = User.by_sid(user['user_id'])
            if d_user:
                tags = d_user['user_tags']
        # filter by tag
        if tags:
            cond['tags'] = {"$in": tags}
        elif tag:
            cond['tags'] = tag

        # different users see different levels of recommendations
        # if user and user['user_id'] in wx_admin_ids:
        if user and user['user_id'] == 1:
            cond['status'] = {'$gte': 1}
        else:
            cond['status'] = {'$gte': 1}

        # list of already-read shares, ~20ms
        l_hitted_share_id = []
        if user and read_status:
            hits = Hit.find({'user_id': user['user_id']}, {
                '_id': 0,
                'share_id': 1
            })
            l_hitted_share_id = [i['share_id'] for i in hits]

        filter_d = {}
        filter_d['_id'] = 0
        # only whitelisted fields are returned
        filter_d['id'] = 1
        filter_d['images'] = 1
        filter_d['title'] = 1
        filter_d['user_id'] = 1
        filter_d['tags'] = 1
        filter_d['published'] = 1
        filter_d['post_img'] = 1
        shares = Share.find(cond, filter_d).sort('suggested',
                                                 -1).limit(per_page).skip(
                                                     (page - 1) * per_page)
        # filter
        new_shares = []
        for share in shares:
            user = User.by_sid(share.user_id)
            # share = dict(share)
            share['type'] = 1
            # if share.post_img:
            # if hasattr(share, 'post_img'):
            if share.get('post_img'):
                share['type'] = 2
                share['images'] = [
                    IMG_BASE +
                    share['post_img'].replace('_1200.jpg', '_260.jpg')
                ]
                share.pop('post_img')
            else:
                share['images'] = []
            share['author'] = user.user_name
            share['published'] = int(share['published'] *
                                     1000)  # share.published
            if read_status:
                share['read'] = bool(share['id'] in l_hitted_share_id)

            if 0:  # do not show the author avatar
                if user.user_email.endswith('@wechat'):
                    share['user_img'] = options.site_url + \
                        get_avatar_by_wechat(user._id)
                if user.user_email.endswith('@anwensf.com'):
                    share['user_img'] = options.site_url + \
                        get_avatar_by_feed(user.id)
                else:
                    share['user_img'] = options.site_url + \
                        get_avatar(user.user_email, 100)
            new_shares.append(share)

        if meta_info:
            meta = {}
            if last_suggested:
                cond_update = copy.deepcopy(cond)
                cond_update['suggested'] = {'$gt': last_suggested}
                number_of_update = Share.find(cond_update, {
                    '_id': 0,
                    'id': 1
                }).count()
                meta['number_of_update'] = number_of_update
            if tag:  # article counts for sub-tags
                d_tags = get_tags()
                d_tags_parents = get_tags_parents()  # get_tags_parent
                if tag in d_tags:
                    sub_tags = []
                    for name in d_tags[tag]:
                        info = {}
                        info['name'] = name
                        # num = Share.find({'tags': name}, {'_id': 0}).count()
                        # num_recent = Share.find(
                        #     {'tags': name, 'published': {'$gt': time.time() - 86400 * 30}}, {'_id': 0}).count()
                        # info['num'] = num
                        # info['num_recent'] = num_recent
                        sub_tags.append(info)
                    meta['sub_tags'] = sub_tags
                meta['parent_tags'] = []
                if tag in d_tags_parents:
                    # meta['parent_tags'].append(d_tags_parent[tag])
                    meta['parent_tags'] = d_tags_parents[tag]  # hypernym
            number = Share.find(cond, {'_id': 0}).count()  # 'id': 1
            meta['number'] = number
            # if filter_type == 'my_tags':
            #     meta['tags'] = tags

        self.res = {'articles': new_shares}
        self.meta = meta
        return self.write_json()
Example no. 20
    def get(self):
        page = self.get_argument("page", 1)
        per_page = self.get_argument("per_page", 10)
        filter_type = self.get_argument("filter_type", '')  # my_tags
        tag = self.get_argument('tag', '')
        meta_info = self.get_argument("meta_info", 1)
        last_suggested = self.get_argument("last_suggested", 0)
        read_status = self.get_argument('read_status', 1)
        token = self.request.headers.get('Authorization', '')
        # has_vote = self.get_argument("has_vote", None)
        # vote_open = self.get_argument("vote_open", None)

        read_status = int(read_status)
        per_page = int(per_page)
        page = int(page)
        last_suggested = float(last_suggested) / 1000 + 1
        user = self.get_user_dict(token)

        cond = {}
        # filter by tags
        tags = None
        if user and filter_type == 'my_tags':
            d_user = User.by_sid(user['user_id'])
            if d_user:
                tags = d_user['user_tags']
        # filter by tag
        if tags:
            cond['tags'] = {"$in": tags}
        elif tag:
            cond['tags'] = tag

        # different users see different levels of recommendations
        if user and user['user_id'] in wx_admin_ids:
            cond['status'] = {'$gte': 1}
        else:
            cond['status'] = {'$gte': 1}

        l_hitted_share_id = []
        if user and read_status:
            hits = Hit.find({'user_id': user['user_id']})
            l_hitted_share_id = [i['share_id'] for i in hits]

        # if vote_open:
        #     if not vote_open.isdigit():
        #         return self.write_error(422)
        #     cond['vote_open'] = int(vote_open)
        # if has_vote:
        #     cond['vote_title'] = {'$ne': ''}

        number = Share.find(cond, {'_id': 0}).count()
        # sort: _id
        if last_suggested:
            cond_update = copy.deepcopy(cond)
            cond_update['suggested'] = {'$gt': last_suggested}
            number_of_update = Share.find(cond_update, {'_id': 0}).sort(
                'suggested', -1).count()
            logger.info('number_of_update 1: {}'.format(number_of_update))

        num_shares = Share.find(cond, {'_id': 0, 'id': 1}).count()

        shares = Share.find(cond, {'_id': 0}).sort(
            'suggested', -1).limit(per_page).skip((page - 1) * per_page)
        # shares = [fix_share(share) for share in shares]
        new_shares = []
        for share in shares:
            share = fix_share(share)
            user = User.by_sid(share.user_id)
            share = dict(share)
            share['user_name'] = user.user_name
            share['markdown'] = ''
            if read_status:
                share['read'] = bool(share['id'] in l_hitted_share_id)

            soup = BeautifulSoup(share['content'], "lxml")
            # kill all script and style elements
            for script in soup(["script", "style"]):
                script.extract()    # rip it out

            # get text
            text = soup.get_text()

            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip()
                      for line in lines for phrase in line.split("  "))
            # drop blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)
            # print(text)
            share['summary'] = text[:150]
            share['content'] = ''

            if user.user_email.endswith('@wechat'):
                share['user_img'] = options.site_url + \
                    get_avatar_by_wechat(user._id)
            if user.user_email.endswith('@anwensf.com'):
                share['user_img'] = options.site_url + \
                    get_avatar_by_feed(user.id)
            else:
                share['user_img'] = options.site_url + \
                    get_avatar(user.user_email, 100)
            new_shares.append(share)

        # if tag:
        #     shares = [share for share in shares if tag in share['tags']]
        meta = {}
        meta['page'] = page
        meta['articleNumber'] = num_shares
        if meta_info and last_suggested:
            meta['number_of_update'] = number_of_update
        if meta_info and tag:
            d_tags = get_tags()
            # d_tags_parent = get_tags_parent()
            d_tags_parents = get_tags_parents()
            if tag in d_tags:
                sub_tags = []
                for name in d_tags[tag]:
                    num = Share.find({'tags': name}, {'_id': 0}).count()
                    num_recent = Share.find(
                        {'tags': name, 'published': {'$gt': time.time() - 86400 * 30}}, {'_id': 0}).count()
                    info = {}
                    info['name'] = name
                    info['num'] = num
                    info['num_recent'] = num_recent
                    sub_tags.append(info)
                meta['sub_tags'] = sub_tags
            meta['parent_tags'] = []
            if tag in d_tags_parents:
                # hypernym
                # meta['parent_tags'].append(d_tags_parent[tag])
                meta['parent_tags'] = d_tags_parents[tag]

        logger.info('last_suggested time: {}'.format(last_suggested))

        if new_shares:
            logger.info('new_shares[0] time: {}'.format(new_shares[0]['title']))
            logger.info('new_shares[0] published time: {}'.format(
                new_shares[0]['published']))
            logger.info('new_shares[0] suggested time: {}'.format(
                new_shares[0]['suggested']))

        self.res = {'articles': list(new_shares)}
        self.meta = meta
        # number=len(self.res)
        # number=number
        return self.write_json()
Example no. 21
def main():
    stc.html(HTML_BANNER)
    menu = ["Home", "MultiVerse", "About"]

    df = load_bible("data/KJV_Bible.csv")

    choice = st.sidebar.selectbox("Menu", menu)
    if choice == "Home":
        st.subheader("Single Verse Search")
        book_list = df["book"].unique().tolist()
        book_name = st.sidebar.selectbox("Book", book_list)
        chapter = st.sidebar.number_input("Chapter", 1)
        verse = st.sidebar.number_input("Verse", 1)
        bible_df = df[df["book"] == book_name]

        # Layout
        c1, c2 = st.beta_columns([2, 1])

        # Single Verse Layout
        with c1:

            try:
                selected_passage = bible_df[(bible_df["chapter"] == chapter)
                                            & (bible_df["verse"] == verse)]
                passage_details = "{} Chapter::{} Verse::{}".format(
                    book_name, chapter, verse)
                st.info(passage_details)
                passage = "{}".format(selected_passage["text"].values[0])
                st.write(passage)

            except:
                st.warning("Book out of Range")

        with c2:
            # st.success("Verse of the Day")
            chapter_list = range(10)
            verse_list = range(20)
            ch_choice = random.choice(chapter_list)
            vs_choice = random.choice(verse_list)
            random_book_name = random.choice(book_list)

            # st.write("Book:{},Ch:{},Vs:{}".format(random_book_name,ch_choice,vs_choice))
            rand_bible_df = df[df["book"] == random_book_name]

            try:
                randomly_selected_passage = rand_bible_df[
                    (rand_bible_df["chapter"] == ch_choice)
                    & (rand_bible_df["verse"] == vs_choice)]
                mytext = randomly_selected_passage["text"].values[0]
            except:
                mytext = rand_bible_df[(rand_bible_df["chapter"] == 1) & (
                    rand_bible_df["verse"] == 1)]["text"].values[0]

            stc.html(HTML_RANDOM_TEMPLATE.format(mytext), height=300)

        # Search Topic/Term
        search_term = st.text_input("Term/Topic")
        with st.beta_expander("View Results"):
            retrieved_df = df[df["text"].str.contains(search_term)]
            st.dataframe(retrieved_df[["book", "chapter", "verse", "text"]])

    elif choice == "MultiVerse":
        st.subheader("MultiVerse Retrieval")
        book_list = df["book"].unique().tolist()
        book_name = st.sidebar.selectbox("Book", book_list)
        chapter = st.sidebar.number_input("Chapter", 1)
        bible_df = df[df["book"] == book_name]
        all_verse = bible_df["verse"].unique().tolist()
        verse = st.sidebar.multiselect("Verse", all_verse, default=1)
        selected_passage = bible_df.iloc[verse]
        st.dataframe(selected_passage)
        passage_details = "{} Chapter::{} Verse::{}".format(
            book_name, chapter, verse)
        st.info(passage_details)

        # Layout
        col1, col2 = st.beta_columns(2)
        # Join all text as a sentence
        docx = " ".join(selected_passage["text"].tolist())

        with col1:
            st.info("Details")
            for i, row in selected_passage.iterrows():
                st.write(row["text"])

        with col2:
            st.success("StudyMode")
            with st.beta_expander("Visualize Entities"):
                # st.write(docx)
                render_entities(docx)

            with st.beta_expander("Visualize Pos Tags"):
                tagged_docx = get_tags(docx)
                processed_tags = mytag_visualizer(tagged_docx)
                # st.write(processed_tags)# Raw
                stc.html(processed_tags, height=1000, scrolling=True)

            with st.beta_expander("Keywords"):
                processed_docx = nfx.remove_stopwords(docx)
                keywords_tokens = get_most_common_tokens(processed_docx, 5)
                st.write(keywords_tokens)

            with st.beta_expander("Pos Tags Plot"):
                tagged_docx = get_tags(docx)
                tagged_df = pd.DataFrame(tagged_docx,
                                         columns=["Tokens", "Tags"])
                # st.dataframe(tagged_df)
                df_tag_count = tagged_df["Tags"].value_counts().to_frame(
                    "counts")
                df_tag_count["tag_type"] = df_tag_count.index
                # st.dataframe(df_tag_count)

                c = alt.Chart(df_tag_count).mark_bar().encode(x="tag_type",
                                                              y="counts")
                st.altair_chart(c, use_container_width=True)

        with st.beta_expander("Verse Curve"):
            plot_mendelhall_curve(docx)

        with st.beta_expander("Word Freq Plot"):
            plot_word_freq_with_altair(docx)

    else:
        st.subheader("About")
        st.text("Build with Streamlit")
Example no. 22
    def get(self):
        d_tags = get_tags()
        self.res = d_tags
        self.write_json()