Code Example #1
def update_author_networking_from_meta():
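    # Page through the `essays` table in fixed-size chunks and insert the author
    # relationships derived from each Essay via author_insert().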
    start = 0
    db = Database()
    total = int(
        data_fetch("COUNT(*)", "essays", start=start, limit=None)[0][0])
    scan_count = start
    insert_count = 0
    limit = 10000

    while scan_count < total:

        if limit > total - scan_count:
            limit = total - scan_count

        data = data_fetch("*", "essays", start=start, limit=limit)

        for raw_item in data:
            obj = Essay(raw_item)
            response = obj.author_insert(db.conn)
            scan_count += 1
            if response:
                insert_count += response

            if scan_count % 1000 == 0:
                print(
                    "Scanned {} out of {} essays \n Inserted {} relation \n \n"
                    .format(scan_count, total, insert_count))
        start += limit
Code Example #2
def get_word_freq():
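    # Build per-essay word-frequency dicts: load pickled stop words, register the
    # spreadsheet keywords as a jieba user dictionary, then segment each essay.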
    with open("../../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)

    df = pd.read_excel("../../../data/output/关键词.xlsx")
    keywords = []

    for ind in df.keys():
        for word in df[ind]:
            if not pd.isna(word):
                keywords.append(str(word))

    with open("../../../data/output/keywords.txt", "w",
              encoding="utf8") as file:
        for words in keywords:
            file.write("{} 3 nt\n".format(words))

    jieba.load_userdict("../../../data/output/keywords.txt")

    frequency = []
    num = row_count("wechat_essays",
                    host_IP="192.168.164.11",
                    database="wechat_v1")
    n = 0
    limit = 1000
    while n < num:
        if num - n < limit:
            limit = num - n

        data = data_fetch("content",
                          "wechat_essays",
                          limit=limit,
                          start=n,
                          host_IP="192.168.164.11",
                          database="wechat_v1")
        for item in data:
            fre = {}
            cleaned = html_cleanup(item[0])
            seg = jieba.cut(cleaned)
            for word in seg:
                # Count non-empty tokens that are not stop words
                if word.replace(" ", "") == "" or word in stopwords:
                    continue
                fre[word] = fre.get(word, 0) + 1
            frequency.append(fre)
        n += limit
        print("=== Done {} rows".format(n))

    with open("../../../data/output/word_freq.pickle", "wb") as file:
        pickle.dump(frequency, file)

    return frequency
Code Example #3
def worker(w_id, start, end):
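    # Each worker segments its [start, end) slice of wechat_essays_v2 with jieba
    # and writes the space-separated tokens to its own train{w_id}.dat file.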
    with open("../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)

    df = pd.read_excel("../../data/output/关键词.xlsx")
    keywords = []

    for ind in df.keys():
        for word in df[ind]:
            if not pd.isna(word):
                keywords.append(str(word))

    with open("../../data/output/keywords.txt", "w", encoding="utf8") as file:
        for words in keywords:
            file.write("{} 3 nt\n".format(words))

    jieba.load_userdict("../../data/output/keywords.txt")

    n = start
    limit = 1000
    with open("../../data/output/train{}.dat".format(w_id),
              "w",
              encoding="utf-8") as f:
        while n < end:
            if end - n < limit:
                limit = end - n

            data = data_fetch("content",
                              "wechat_essays_v2",
                              limit=limit,
                              start=n,
                              host_IP="192.168.164.11",
                              database="wechat")
            for item in data:
                cleaned = html_cleanup(item[0])
                seg = jieba.cut(cleaned)
                output = ""
                for word in seg:
                    # Keep non-empty tokens that are not stop words
                    if word.replace(" ", "") != "" and word not in stopwords:
                        output += word + " "
                f.write(output + "\n")
            n += limit
            print("id: {} === Done {} rows".format(id, n))
Code Example #4
File: main.py Project: Askwitionary/processing
    def __init__(self):
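        # Load media-to-media relationships and build the graph; rows whose two
        # media ids are identical are only counted, not added as edges.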
        g = Networks()
        count = 0
        relationships = data_fetch("`id`, `media_id`, `src_media_id`",
                                   "media_media",
                                   limit=99999,
                                   host_IP="10.0.0.101",
                                   database="processing")
        for item in relationships:
            if item[1] == item[2]:
                count += 1
            else:

                media1 = Media(item[1])
                media2 = Media(item[2])
                g.add_media(media1)
                g.add_media(media2)
                g.add_relationship(Relationship(media1, media2, item[0]))
Code Example #5
File: main.py Project: Askwitionary/processing
def ntwk_rating():
    """
    
    :return: 
    """

    try:
        data = json.loads(request.data)
        mids = data["medias"]

    # On error, return a failure response immediately with the error message attached in data
    except Exception as e:
        return json.dumps({
            'code': 0,
            'msg': 'FAILURE',
            'data': {
                'error_msg': str(e)
            }
        })

    relations = data_fetch("")
Code Example #6
def worker(w_id, start, end):
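    # Scan this worker's [start, end) slice of `essays` in chunks and insert the
    # extractor and meta-author relations derived from each Essay.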
    print("===============Process {} has started================".format(w_id))

    db = Database()
    total = end
    scan_count = 0
    limit = 10000
    chunk_size = end - start

    media_inserted = 0
    author_inserted = 0

    while scan_count < chunk_size:

        # `start` advances every pass, so measure the remaining rows against the
        # fixed chunk_size rather than the moving (end - start).
        if limit > chunk_size - scan_count:
            limit = chunk_size - scan_count

        data = data_fetch("*",
                          "essays",
                          start=start,
                          limit=limit,
                          tail_condition="ORDER BY `insert_time` DESC")

        for raw_item in data:
            obj = Essay(raw_item)
            media_count, author_count = obj.extractor_info_insert(db.conn)
            author_count += obj.meta_author_insert(db.conn)
            scan_count += 1
            if media_count + author_count:
                media_inserted += media_count
                author_inserted += author_count

            if scan_count % 1000 == 0:
                print(
                    "Process {} has scanned {} out of {} essays \n Inserted {} author relations \n Inserted {} media relations\n"
                    .format(w_id, scan_count, chunk_size, author_inserted,
                            media_inserted))
        start += limit
    print("===============Process {} has ended================".format(w_id))
Code Example #7
    '2462acae8ed692945720e6acf6825c56', 'c0b0b4d041856d52c0c4c97adf8a5985',
    'f365ff75ae3c3b39d7dd5f0b9335ed3a', 'd4fa139b651f9efc5b5298a58e89f0ee',
    '54bf7dea08ac923ad6cb9bf894d50013', '036519b7296d1e5b0273e354cd310c06',
    '138df8b28925617a759e425c63e7f642', '6d4c13d26a41d6fc6c57d5c2aa652616',
    '9a6079cfa40e89c923754d677d2a586e', '26eb934f3637b266c44f2b2f46f5df2a'
]

nick_list = [
    '北京女主', '马铃薯精英网', '毕节市银行卡协会', '一诺法鼎财税', '财经野史', '独区企业服务平台', '每日股市秘闻',
    '融邦投资', '营销案例分析', '大安热线'
]

data_from_media = data_fetch(
    "`media_id`, `media_nick`, `src_media_id`, `src_media_nick`, `type`, `essay_id`",
    "media_media",
    condition=build_condition("media_id", id_list, "OR"),
    limit=None,
    host_IP="10.0.0.101",
    database="processing")

data_from_src = data_fetch(
    "`media_id`, `media_nick`, `src_media_id`, `src_media_nick`, `type`, `essay_id`",
    "media_media",
    condition=build_condition("src_media_id", id_list, "OR"),
    limit=None,
    host_IP="10.0.0.101",
    database="processing")

if __name__ == "__main__":
    _ = 1
Code Example #8
from data_structure.graph.graph import Vertex, Edge, Graph
from utils.mysql import data_fetch


class Author(Vertex):
    def __init__(self, name, hobby):
        Vertex.__init__(self, name)
        self.hobby = hobby


dic = {}
data = data_fetch("wechat_name`, `meta_content",
                  "wechat_essays_v1",
                  limit=100000)
for item in data:
    wechat_name = item[0]
    content = item[1]
    entities = content.replace(":", " ").split(" ")

    # Removing items while iterating over the same list skips elements,
    # so build the filtered list instead.
    entities = [obj for obj in entities if "原创" not in obj and "点击" not in obj]
    dic[wechat_name] = entities

# n = 0
# onehot = []
# for key in dic.keys():
#     n += len(dic[key])
#     onehot += dic[key]
Code Example #9
                            pass
                        else:
                            print(e)
                    except Exception as e:
                        print(e)
        return media_count, author_count


if __name__ == "__main__":
    _ = 1

    count = 0
    g = Networks()
    relationships = data_fetch("`id`, `media_id`, `src_media_id`",
                               "media_media",
                               limit=99999,
                               host_IP="10.0.0.101",
                               database="processing")
    for item in relationships:
        if item[1] == item[2]:
            count += 1
        else:

            media1 = Media(item[1])
            media2 = Media(item[2])
            g.add_media(media1)
            g.add_media(media2)
            g.add_relationship(Relationship(media1, media2, item[0]))

    conn_count = {}
    for item in g.vertices:
Code Example #10
File: freq_gen_.py Project: Askwitionary/processing
def worker(w_id, start, end):
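    # Segment titles and contents locally with jieba (plus the account-name user
    # dictionary) and accumulate per-essay word-frequency dicts, pickled in chunks;
    # the seg-service URL is selected but unused in this variant.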
    print(
        "===================Process {} has Started==============".format(w_id))
    if w_id % 2 == 0:
        url = "http://192.168.164.15:49001/seg/s"
    else:
        url = "http://10.0.0.59:49001/seg/s"
    with open("../../../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)

    dic_path = "../../../../data/output/account_name_unique_jieba.txt"
    jieba.load_userdict(dic_path)

    n = start
    limit = min(end - start, 30000)

    count = 0
    tmp = 0
    cou = 0

    while n < end:
        title_whole = []
        content_whole = []
        if end - n < limit:
            limit = end - n

        data = data_fetch("`title`, `content`",
                          "essays",
                          host_IP="192.168.164.15",
                          user_name="raw",
                          password="******",
                          database="raw",
                          limit=limit,
                          start=start,
                          tail_condition="ORDER BY `update_time`")

        for item in data:
            title_dic = {}
            content_dic = {}
            title = item[0]
            content = item[1]
            if title is None:
                t_result = None
            else:
                try:
                    title = replace_punctuation(
                        html_cleanup(title).replace(" ", "").replace("\n", ""))
                    t_result = "/".join(jieba.cut(title))
                except Exception as e:
                    print(e)
                    t_result = None
                    time.sleep(1)

            if content is None:
                c_result = None
            else:
                try:
                    content = replace_punctuation(
                        html_cleanup(content).replace(" ",
                                                      "").replace("\n", ""))

                    c_result = "/".join(jieba.cut(content))

                except KeyError:
                    c_result = None
                    pass

                except Exception as e:
                    print(e)
                    c_result = None
                    time.sleep(1)

            if t_result is not None:
                # Count non-empty title tokens that are neither stop words nor punctuation
                for token in t_result.split("/"):
                    if (len(token) > 0 and token != " "
                            and token not in stopwords and not isPunctuation(token)):
                        title_dic[token] = title_dic.get(token, 0) + 1

            if c_result is not None:
                # Count non-empty content tokens that are not stop words
                for token in c_result.split("/"):
                    if len(token) > 0 and token != " " and token not in stopwords:
                        content_dic[token] = content_dic.get(token, 0) + 1

            title_whole.append(title_dic)
            content_whole.append(content_dic)

            count += 1
            if count % 10000 == 0:
                with open(
                        "../../../../data/output/w_freq0/title/result{}-{}.pickle"
                        .format(w_id, cou), "wb") as f:
                    pickle.dump(title_whole, f)
                with open(
                        "../../../../data/output/w_freq0/content/result{}-{}.pickle"
                        .format(w_id, cou), "wb") as f:
                    pickle.dump(content_whole, f)
                print("Process {} has processed {} essays... \n".format(
                    w_id, count))
        n += limit
        cou += 1
        start += limit
    with open(
            "../../../../data/output/w_freq0/title/result{}[-1].pickle".format(
                w_id), "wb") as f:
        pickle.dump(title_whole, f)
    with open(
            "../../../../data/output/w_freq0/content/result{}[-1].pickle".
            format(w_id), "wb") as f:
        pickle.dump(content_whole, f)

    print("===================Process {} has ended==============".format(w_id))
Code Example #11
    def __init__(self, action, data):
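        # Populate uid / action / time_occurred / description / related_id from the
        # raw `data` tuple; field positions differ per action type.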
        if action == "channel":
            self.uid = data[1]
            self.action = action
            self.time_occurred = data[5]
            channel_details = \
                data_fetch("`title`, `content`, `pid`, `id`, `show`", "biz_channels", "id={}".format(data[0]),
                           host_IP="192.168.164.11", user_name="dwapi", password="******",
                           database="dw_biz")[0]
            self.description = str({
                "id": data[2],
                "data": {
                    "title": channel_details[0],
                    "content": channel_details[1],
                    "pid": channel_details[2],
                    "id": channel_details[3],
                    "show": channel_details[4]
                }
            }).replace("'", '"')
            self.related_id = data[0]

        if action == "search":
            self.uid = data[0]
            self.action = action
            self.time_occurred = data[7]
            self.description = str({
                "content": data[1],
                "count": data[3],
                "type": data[4],
                "status": data[2],
                "id": data[5]
            }).replace("'", '"')
            self.related_id = data[1][:min(len(data[1]), 32)]

        if action == "topic":
            self.uid = data[1]
            self.action = action
            self.time_occurred = data[5]
            topic_details = data_fetch(
                "`title`, `content`, `id`, `sources`, `filters`, `fav_count`, `status`, `only_match_title`, `uid`",
                "biz_topics",
                "id={}".format(data[0]),
                host_IP="192.168.164.11",
                user_name="dwapi",
                password="******",
                database="dw_biz")[0]
            # print(topic_details)
            self.description = str({
                "id": data[3],
                "enable": str(data[2]),
                "data": {
                    # Indices follow the column order of the SELECT above
                    "title": topic_details[0],
                    "content": topic_details[1],
                    "id": topic_details[2],
                    "sources": topic_details[3],
                    "filters": topic_details[4],
                    "fav_count": topic_details[5],
                    "status": topic_details[6],
                    "only_match_title": topic_details[7],
                    "uid": topic_details[8],
                }
            }).replace("'", '"')
            self.related_id = data[0]

        if action == "eclick":
            self.uid = data[0]
            self.action = action
            self.time_occurred = data[2]
            self.description = str({
                "outbound_time": data[3],
                "view_percent": data[4],
                "id": data[5]
            }).replace("'", '"')
            self.related_id = data[1]

        if action == "elike":
            self.uid = data[1]
            self.action = action
            self.time_occurred = data[4]
            self.description = str({"id": data[2]}).replace("'", '"')
            self.related_id = data[0]

        if action == "ecomment":
            self.uid = data[1]
            self.action = action
            self.time_occurred = data[9]
            self.description = str({
                "content": data[2],
                "to_uid": data[3],
                "f_id": data[4],
                "f_comment": data[5],
                "like_count": data[6],
                "status": data[7],
                "id": data[8]
            }).replace("'", '"')
            self.related_id = data[0]

        if action == "efav":
            self.uid = data[1]
            self.action = action
            self.time_occurred = data[5]
            self.description = str({
                "enable": data[2],
                "id": data[3]
            }).replace("'", '"')
            self.related_id = data[0]
Code Example #12
import pickle
from datetime import datetime

from pymysql import IntegrityError

from utils.mysql import data_fetch, Database
from utils.text_utilizer import get_md5

with open("../../../../data/szqj/medias.pickle", "rb") as f:
    known_medias = pickle.load(f)

data = data_fetch("`name`, `media_id`, `media_nick`, `platform_id`, `essay_id`, `essay_pubdate`", "author_media", host_IP="10.0.0.101", database="processing", limit=None)
connection = Database().conn
for item in data:
    platform_id = item[3]
    if platform_id == 1:
        platform = "WX"
        author_name = item[0]
        media_id = item[1]
        media_nick = item[2]
        essay_id = item[4]
        essay_pubdate = item[5]

        if author_name == media_nick:
            pass

        else:
            if author_name in known_medias:
                author_id = get_md5(platform + "-" + author_name)
                sql_cols = """`id`, `media_id`, `media_nick`, `platform_id`, `src_media_id`, `src_media_nick`, `type`, `essay_id`, `essay_pubdate`, `insert_time`"""
                sql_values = """VALUES ('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');""".format(
Code Example #13
File: tfidf_test.py Project: Askwitionary/processing
def worker(w_id, start, end):
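    # Compute TF-IDF keywords for each essay in this worker's slice and insert them
    # into `essay_keywords`, skipping essays that already have an entry.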
    print(
        "===================Process {} has Started==============".format(w_id))

    with open("../../../../data/nlp/idf.pickle", "rb") as file:
        idf = pickle.load(file)
    connection = Database().conn
    n = start
    limit = min(end - start, 10000)

    dic_path = "../../../../data/output/account_name_unique_jieba.txt"
    jieba.load_userdict(dic_path)

    count = 1
    dup = 0
    print(end)
    while n < end:
        if end - n < limit:
            limit = end - n

        data = data_fetch("`id`, `content`, `pubdate`",
                          "essays",
                          host_IP="192.168.164.15",
                          user_name="raw",
                          password="******",
                          database="raw",
                          limit=limit,
                          start=start,
                          tail_condition="ORDER BY `insert_time`")

        for item in data:
            if item is None:
                pass
            else:
                essay_id = item[0]
                content = item[1]
                pubdate = item[2]

                if content is None:
                    pass
                else:
                    duplicated = 1
                    try:
                        duplicated = row_count(
                            "essay_keywords",
                            condition="`essay_id` = '{}'".format(essay_id),
                            host_IP="10.0.0.101",
                            database="processing") > 0
                    except Exception as e:
                        print(e)
                    if duplicated:
                        dup += 1
                    else:
                        result = tfidf(content, idf, method=1)[0]
                        sql_cols = """`essay_id`, `content`, `pubdate`, `insert_time`"""
                        sql_values = """VALUES ('{}', '{}', '{}', '{}');""".format(
                            essay_id, json.dumps(result, ensure_ascii=False),
                            pubdate,
                            datetime.now().replace(microsecond=0))
                        sql = """INSERT INTO `essay_keywords` ({}) {}""".format(
                            sql_cols, sql_values)
                        try:
                            with connection.cursor() as cur:
                                # print(sql)
                                cur.execute(sql)
                                connection.commit()
                                count += 1
                        except Exception as e:
                            print(e)
            if (count + dup) % 1000 == 0:
                print(
                    "Process {} has inserted {} essays, duplicated skipped {} \n"
                    .format(w_id, count, dup))
        n += limit
        start += limit
    print("===================Process {} has ended==============".format(w_id))
Code Example #14
File: main.py Project: Askwitionary/processing
def essay_tfidf():
    """
    
    :return: 
    """

    try:
        data = json.loads(request.data)
        eid = data["essay_id"]
        limit = int(data["limit"])
        method = int(data["method"])
    except Exception as e:
        return json.dumps({
            'code': 0,
            'msg': 'FAILURE',
            'data': {
                'error_msg': str(e)
            }
        })

    if limit <= 50:
        try:
            data = data_fetch("`content`",
                              "essay_keywords",
                              database="processing",
                              host_IP="10.0.0.101",
                              condition="`essay_id`='{}'".format(eid),
                              user_name="lduan",
                              password="******")
            if len(data) == 0:
                pass
            else:
                content_dic = json.loads(data[0][0])  # the encoding kwarg was removed from json.loads in Python 3.9
                ll = list(content_dic.items())
                ll.sort(key=operator.itemgetter(1), reverse=True)
                tops = ll[:min(limit, len(ll))]
                output = {}
                for item in tops:
                    output[item[0]] = item[1]
                return json.dumps({
                    'code': 1,
                    'msg': 'SUCCESS',
                    'data': output
                },
                                  ensure_ascii=False)
        except Exception as e:
            return json.dumps({
                'code': 0,
                'msg': 'FAILURE',
                'data': {
                    'error_msg': str(e)
                }
            })

    try:
        essay, pubdate = data_fetch("`content`, `pubdate`",
                                    "essays",
                                    condition="`id` = '{}'".format(eid),
                                    host_IP="192.168.164.15",
                                    user_name="raw",
                                    password="******",
                                    database="raw")[0]

        result, fifty = tfidf(essay, idf, limit, method=method)
    except Exception as e:
        return json.dumps({
            'code': 0,
            'msg': 'FAILURE',
            'data': {
                'error_msg': str(e)
            }
        })

    sql_cols = """`essay_id`, `content`, `pubdate`, `insert_time`"""
    sql_values = """VALUES ('{}', '{}', '{}', '{}');""".format(
        eid, json.dumps(fifty, ensure_ascii=False), pubdate,
        datetime.now().replace(microsecond=0))

    sql = """INSERT INTO `essay_keywords` ({}) {}""".format(
        sql_cols, sql_values)
    try:
        with connection.cursor() as cur:
            cur.execute(sql)
            connection.commit()

        return json.dumps({'code': 1, 'msg': 'SUCCESS', 'data': result})
    except IntegrityError as e:
        if e.args[0] == 1062:
            return json.dumps({'code': 1, 'msg': 'SUCCESS', 'data': result})
        else:
            raise
    except Exception as e:
        return json.dumps({
            'code': 0,
            'msg': 'FAILURE',
            'data': {
                'error_msg': str(e)
            }
        })
Code Example #15
File: freq_gen.py Project: Askwitionary/processing
def worker(w_id, start, end):
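    # Same word-frequency generation as freq_gen_.py, but segmentation is done by
    # POSTing the cleaned text to the remote /seg/s service instead of local jieba.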
    print(
        "===================Process {} has Started==============".format(w_id))
    if w_id % 2 == 0:
        url = "http://192.168.164.15:49001/seg/s"
    else:
        url = "http://10.0.0.59:49001/seg/s"
    with open("../../../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)

    n = start
    limit = min(end - start, 10000)

    title_whole = []
    content_whole = []
    count = 0
    tmp = 0

    while n < end:
        if end - n < limit:
            limit = end - n

        data = data_fetch("`title`, `content`",
                          "essays",
                          host_IP="192.168.164.15",
                          user_name="raw",
                          password="******",
                          database="raw",
                          limit=limit,
                          start=start,
                          tail_condition="ORDER BY `update_time`")

        for item in data:
            title_dic = {}
            content_dic = {}
            title = item[0]
            content = item[1]
            if title is None:
                t_result = None
            else:
                try:
                    title = html_cleanup(title).replace(" ",
                                                        "").replace("\n", "")
                    t_result = requests.post(url, data={
                        "_q": title
                    }).json()["data"]
                except Exception as e:
                    print(e)
                    t_result = None
                    time.sleep(1)

            if content is None:
                c_result = None
            else:
                try:
                    content = html_cleanup(content).replace(" ", "").replace(
                        "\n", "")
                    # if len(content) > tmp:
                    #     tmp = len(content)
                    #     print(len(content))
                    #     print(content)

                    if len(content) < 10000:
                        c_result = requests.post(url, data={
                            "_q": content
                        }).json()["data"]
                    else:
                        content_list = text_spliter(content)

                        reqtoolong = [
                            requests.post(url, data={
                                "_q": item
                            }).json()["data"] for item in content_list
                        ]

                        c_result = reqtoolong[0]
                        for evenmore in reqtoolong[1:]:
                            c_result = c_result + " " + evenmore

                except KeyError:
                    c_result = None
                    pass

                except Exception as e:
                    print(e)
                    c_result = None
                    time.sleep(1)

            if t_result is None:
                pass
            else:
                t_wordlist = t_result.split(" ")
                for item in t_wordlist:
                    if len(item) > 0:
                        # item_l = item.split("/")
                        # word = item_l[0]
                        # pos = item_l[1]
                        # if pos == "w":
                        #     pass
                        # else:
                        if item in stopwords:
                            pass
                        elif isPunctuation(item):
                            pass
                        else:
                            if item in title_dic.keys():
                                title_dic[item] += 1
                            else:
                                title_dic[item] = 1

            if c_result is None:
                pass
            else:
                c_wordlist = c_result[1:-1].split(" ")
                for item in c_wordlist:
                    if len(item) > 0:
                        # item_l = item.split("/")
                        # word = item_l[0]
                        # pos = item_l[1]
                        # if pos == "w":
                        #     pass
                        # else:
                        if item in stopwords:
                            pass
                        else:
                            if item in content_dic.keys():
                                content_dic[item] += 1
                            else:
                                content_dic[item] = 1

            title_whole.append(title_dic)
            content_whole.append(content_dic)

            count += 1
            if count % 1000 == 0:
                with open(
                        "../../../../data/output/w_freq/title/result{}.pickle".
                        format(w_id), "wb") as f:
                    pickle.dump(title_whole, f)
                with open(
                        "../../../../data/output/w_freq/content/result{}.pickle"
                        .format(w_id), "wb") as f:
                    pickle.dump(content_whole, f)
                print("Process {} has processed {} essays... \n".format(
                    w_id, count))
        n += limit
        start += limit
    with open(
            "../../../../data/output/w_freq/title/result{}.pickle".format(
                w_id), "wb") as f:
        pickle.dump(title_whole, f)
    with open(
            "../../../../data/output/w_freq/content/result{}.pickle".format(
                w_id), "wb") as f:
        pickle.dump(content_whole, f)

    print("===================Process {} has ended==============".format(w_id))
Code Example #16
import pickle
import re

from utils.mysql import data_fetch
from utils.read_txt import read_txt
from utils.text_cleaner import html_cleanup


content = ""
essay_id = ""

start = 0
limit = 5000

data = data_fetch("`title`, `meta_content`, `content`", "essays", limit=limit, start=start)

with open("../../../data/temp/essays_tmp.pickle", "wb") as f:
    pickle.dump(data, f)

with open("../../../data/temp/essays_tmp.pickle", "rb") as f:
    data = pickle.load(f)

keywords = read_txt("../../../data/nlp/essay_author/author_keywords.txt")

black_list = ["图片来源", "配图来源", "来源为网络", "数据来源", "请勿转载", "转载以及向"]

count = 0
for content in data:
    title = content[0]
    meta_data = content[1]
    if content[2] is not None:
Code Example #17
                cur.execute(sql)
                connection.commit()
        except Exception as e:
            print(sql)
            print(e)


host_IP = "192.168.164.11"
db_name = "dw_biz"
uname = "dwapi"
pword = "api@szqj"

channel_data = data_fetch(
    "`channel_id`, `uid`,`id`, `insert_time`, `update_time`",
    "biz_user_channels",
    limit=None,
    host_IP=host_IP,
    user_name=uname,
    password=pword,
    database=db_name)
search_data = data_fetch(
    "`uid`, `content`, `status`, `result_count`, `type`, `id`, `insert_time`, `update_time`",
    "biz_search_records",
    limit=None,
    host_IP=host_IP,
    user_name=uname,
    password=pword,
    database=db_name)
topic_data = data_fetch(
    "`topic_id`, `uid`, `enable`, `id`, `insert_time`, `update_time`",
    "biz_topic_subscribers",
    limit=None,