Beispiel #1
0
def merge_from_others():
    """Copy policy rows from the flow-enabled reader connection into the
    `links` table through the writer connection.

    Rows with an empty `gov` fall back to the generic '人民政府'; rows
    with an empty `date` are stored with a NULL date.
    """
    writer = database.Mysql('root',
                            '990211',
                            'dbpolicy',
                            host='121.36.22.40')
    reader = database.Mysql('root',
                            '990211',
                            'dbpolicy',
                            host='121.36.22.40',
                            use_flow=True)
    for count, row in enumerate(reader.select('data')):
        # Progress marker every 10 rows.
        if count % 10 == 0:
            print(count)
        gov = row['gov'] if row['gov'] else '人民政府'
        date = row['date'] if row['date'] else None
        writer.insert_one('links', {
            'code': row['code'],
            'province': row['province'],
            'city': row['city'],
            'title': row['title'],
            'gov': gov,
            'date': date,
            'sub_url': row['sub_url'],
        })
Beispiel #2
0
def spider(url):
    """Scrape video metadata for *url*: append (title, real_url, play_count)
    triples to the module-level `video` list, insert one record per video
    into the `video` table, and store the results in the module-level
    `log` dict.

    NOTE(review): calls `unicode()`, so this is Python 2 code.
    """
    info=[]
    dic={}
    dbinfo =[]
    mysql_s = database.Mysql()
    global video
    video_info=get_video_info(url)
    for i in video_info:
        # Resolve the playable URL; a failed resolution aborts the whole
        # loop (break, not continue) — presumably deliberate, confirm.
        real_url=get_video_true_url(i['display_url'])
        if real_url==None:
            break
        title=unicode(i['title'])
        dic['title']=title
        image_url=i['pc_image_url']
        dic['image_url']=image_url
        video_play_count=i['video_play_count']
        dic['play_count']=str(video_play_count)
        video_duration_format=i['video_duration_format']
        dic['ti']=video_duration_format
        dic['video_url']=real_url
        info.append(title)
        info.append(real_url)
        info.append(str(video_play_count))
        video.append(info)
        # Re-bind a fresh list for the next iteration; `dic` however is
        # the SAME object every pass — insertData must copy/consume it
        # immediately for this to be safe.
        info=[]
        dbinfo.append(mysql_s.insertData("video", dic))
    log['video']=video
    log['dbinfo']=dbinfo
Beispiel #3
0
def update_from_local():
    """Re-parse every record listed in local_result.csv and insert the
    parsed main text / attachments / images into the `data` table.

    Rows whose page cannot be fetched or parsed are skipped silently
    (best effort).
    """
    db = database.Mysql('root', '990211', 'dbpolicy')
    with open('local_result.csv', 'r', encoding='utf-8-sig') as fr:
        for row in csv.reader(fr):
            record = {
                'code': row[0],
                'province': row[1],
                'city': row[2],
                'gov': row[3],
                'title': row[4],
                'date': row[5],
                'sub_url': row[6],
            }
            response = Request(record['sub_url']).text
            # Unreachable page: skip the row entirely.
            if not response:
                continue
            parser = parse_context.MAIN_TEXT(record['sub_url'], response)
            try:
                parsed = parser.main()
                record['main_text'] = parsed['content']
                record['attachment'] = ','.join(parsed['attachment'])
                record['img'] = ','.join(parsed['img'])
            except Exception:
                # Parse failure: best-effort, drop the row.
                continue
            db.insert_one('data', record)
Beispiel #4
0
def migrations():
    """Migrate the scraper configuration CSV into the `api_config` table.

    Assigns sequential primary-key ids; when an insert fails the counter
    is rolled back so the ids stay dense.

    Fix: the row counter was named `id`, shadowing the builtin — renamed
    to `row_id` (behavior unchanged).
    """
    db = database.Mysql('root', '990211', 'dbpolicy_web', host='121.36.22.40')
    with open('./data/configure_gov_v2.csv', 'r', encoding='utf-8') as fr:
        csv_r = csv.DictReader(fr)
        row_id = 0
        for line in csv_r:
            row_id += 1
            # NOTE: the first CSV header carries a UTF-8 BOM, hence the
            # '\ufeffcode' key.
            loc_id = db.select('api_location',
                               "id",
                               "code={}".format(line['\ufeffcode']),
                               fetch_one=True)['id']
            insert_config = {
                "id": row_id,
                'gov': line["gov"],
                'target_url': line['target_url'],
                'is_active': test_url(line['target_url']),
                "item_pattern": line["item_pattern"],
                "main_text_pattern": line["main_text_pattern"],
                "date_pattern": line['date_type'],
                "zupei_pattern": line['zupei_type'],
                "source_pattern": line['source'],
                'title_pattern': line['title'],
                'next_pattern': line['next_button'],
                'action_pattern': line['action'],
                'loc_id': loc_id,
                'file_count': 0,
                'author_id': 1,
            }
            res = db.insert_one('api_config', insert_config)
            if not res:
                # Insert failed: reuse this id for the next row.
                row_id -= 1
def insertImagedb(images):
    """Persist scraped image triples into the `tb_images` table.

    Each element of *images* is a sequence laid out as
    (title, image_url, url).
    """
    mysql_s = database.Mysql()
    for image in images:
        print(image)
        record = {
            'title': image[0],
            'image_url': image[1],
            'url': image[2],
        }
        print(record)
        mysql_s.insertData("tb_images", record)
Beispiel #6
0
def insertNewstoDB():
    """Fetch the top news items, email the raw payload, then store each
    item in the `tb_news` table.

    NOTE: relies on `str.decode("unicode-escape")`, i.e. Python 2.
    """
    mysql_s = database.Mysql()
    data = getTopNewsimg()
    # Mail the raw scrape result (strip the u'' repr escaping first).
    send_email.send(str(data).replace('u\'', '\'').decode("unicode-escape"))
    for news in data:
        print(news)
        record = {
            'title': news[1],
            'content': news[4],
            'url': news[2],
            'image_url': news[3],
        }
        print(record)
        mysql_s.insertData("tb_news", record)
Beispiel #7
0
def get_from_local():
    """Join every `data` row with its province/city from `map_location`
    (matched on the row's code) and append the combined rows to
    local_result.csv.

    Fix: the output file was re-opened in append mode once per row; it
    is now opened once for the whole run — the written content is
    identical.
    """
    db = database.Mysql('root', '990211', 'data_policy')
    matrix = db.select('data')
    with open('local_result.csv', 'a', encoding='utf-8-sig',
              newline='') as fa:
        csv_a = csv.writer(fa)
        for line in matrix:
            code = line[0]
            result = db.select('map_location',
                               condition='code="{}"'.format(code),
                               fetch_one=True)
            province = result[1]
            city = result[2]
            csv_a.writerow([code, province, city] + list(line)[1:])
Beispiel #8
0
def get_video_url_range(begin, end):
    """Crawl budejie pages *begin*..*end*-1, insert a record per video
    and return the accumulated DB results.

    Fix: the original `return text` was indented inside the page loop,
    so only the first page was ever crawled; the return now happens
    after the loop completes. Inner loops also no longer shadow the
    page index.
    """
    text = []
    table = "budejie_copy"
    mysql_s = database.Mysql()
    sortcontent = sort_content_tags.SortContentByTags()
    for page in range(begin, end):
        url = 'http://www.budejie.com/' + str(page)
        try:
            # Spoof a browser UA to get past the site's anti-crawl check.
            req = urllib2.Request(url)
            req.add_header(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
            )
            html = urllib2.urlopen(req).read()
            soup = BeautifulSoup(html, 'html.parser')
            video_content = soup.find_all('div',
                                          class_=re.compile('j-video-c'))
            video_url = soup.find_all('div', class_=re.compile('^j-video$'))
            print(len(video_content))
            dic = {}
            if len(video_content) != 0:
                # First pass: insert a stub row per playable video.
                for tag in video_url:
                    dic['video_url'] = str(tag['data-mp4']).strip()
                    dic['image_url'] = str(tag['data-poster']).strip()
                    dic['video_id'] = str(tag['data-id']).strip()
                    dic['title'] = " "
                    dic['ti'] = " "
                    dic['public_date'] = " "
                    dbinfo = mysql_s.insertData(table, dic)
                    text.append(dbinfo)
                    print(dbinfo)
                dic = {}
                # Second pass: fill in title/date/tag via update.
                for tag in video_content:
                    dic['video_id'] = str(tag['data-id']).strip()
                    dic['public_date'] = str(tag['data-date']).strip()
                    dic['title'] = str(tag['data-title']).strip()
                    print(dic['title'])
                    dic['ti'] = str(tag['data-time']).strip()
                    # NOTE(review): this appends the PREVIOUS dbinfo
                    # before the update runs — kept as the original had
                    # it, but it looks like it belongs after upData1;
                    # confirm intent.
                    text.append(dbinfo)
                    dic['tag_id'] = sortcontent.gettagbytitle(dic['title'])
                    dbinfo = mysql_s.upData1(table, dic)
        except Exception:
            print("No Web Pages")
    return text
Beispiel #9
0
def search():
    """Label every policy whose title satisfies `is_satisfied` and dump
    the labelled rows to data_with_label.csv.

    Fix: when no title matched, the original crashed with IndexError on
    `res_dict[0]`; it now returns without writing the file.
    """
    res_dict = list()
    db = database.Mysql('root',
                        '990211',
                        'dbpolicy',
                        host='121.36.22.40',
                        use_flow=True)
    for policy in db.select('data'):
        avi_label = is_satisfied(policy['title'])
        if avi_label:
            # Labels are space-joined into a single column.
            policy['label'] = ' '.join(avi_label)
            res_dict.append(policy)
    if not res_dict:
        return
    with open('data_with_label.csv', 'w', encoding='utf-8-sig',
              newline='') as fw:
        # Header comes from the first labelled row's keys.
        c_w = csv.DictWriter(fw, list(res_dict[0].keys()))
        c_w.writeheader()
        c_w.writerows(res_dict)
Beispiel #10
0
# Python 2 only: sys.setdefaultencoding does not exist in Python 3 and in
# Py2 normally requires reload(sys) first — NOTE(review): confirm a
# reload(sys) precedes this line in the full file.
sys.setdefaultencoding('utf-8')
import re
import ast
import requests
from bs4 import BeautifulSoup
import database
import time
import multiprocessing

# Browser-like User-Agent so the target site serves normal pages.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"}
# Shared module-level connection. This database.Mysql variant apparently
# returns a sequence whose first element is the handle — TODO confirm
# against the database module.
db = database.Mysql(
    '127.0.0.1',
    'root',
    'Mroot@123',
    3306,
    'mayi_news',
    'utf8')[0]


def get_soup(url):
    """Fetch *url* with the module-level headers and return the parsed
    BeautifulSoup (lxml) document."""
    response = requests.get(url, headers=head)
    return BeautifulSoup(response.content, 'lxml')


def verify_url(news_url):
    sql = "select url from news_tb where url='%s';" % news_url
    result = db.select_data(sql)
    if result == ():
class SortContentByTags:
    """Match titles/comments against keyword lists to assign tag ids.

    Each `tag_*` list holds the keywords for one tag id (noted in the
    trailing comment); '1001' is the default/fallback tag.
    """
    # NOTE(review): class-level connection — created once at class
    # definition time and shared by every instance.
    mysql_s=database.Mysql()
    tag_videos = [u"电影", u'视频', u'片', u'电视']  # 1002
    tag_society = [u'社会', u'交通', u'国家', u'人']  # 1003
    tag_entertain = [u'娱乐', u'休闲', u'乐']  # 1004
    tag_technology = [u'科技', u'数据', u'智能', u'科']  # 1005
    tag_car = [u'车', u'摩托', u'高铁']  # 1006
    tag_sports = [u'运动', u'体育', u'跑', u'跳', u'走']  # 1007
    tag_finance = [u'金融', u'钱', u'经济']  # 1008
    tag_military = [u'军', u'装备', u'机']  # 1009
    tag_global = [u'全球', u'世界', u'组织']  # 1010
    tag_episode = [u'片段', u'短片', u'集']  # 1011
    tag_funny_news = [u'笑', u'逗', u'哈']  # 1012
    tag_health = [u'健康', u'养生', u'保健', u'食物', u'医', u'药', u'病']  # 1013
    tag_shortvideo = [u'短视频', u'抖音', u'片段']  # 1014
    tag_animal = [ u'动物',u'狗', u'猫', u'宠物']  # 1015
    tag_education = [u'教育', u'学', u'生']  # 1016
    tag_shopping = [u'购物', u'天猫', u'京东', u'网购', u'淘宝', u'支付宝']  # 1017
    tag_music = [u'音', u'唱片', u'歌', u'声']  # 1018
    # Maps tag id -> keyword list; iteration order drives match priority
    # in gettagbytitle.
    tags_mapping = {'1002': tag_videos, '1003': tag_society, '1004': tag_entertain, '1005': tag_technology
        , '1006': tag_car, '1007': tag_sports, '1008': tag_finance, '1009': tag_military, '1010': tag_global
        , '1011': tag_episode, '1012': tag_funny_news, '1013': tag_health, '1014': tag_shortvideo,
                    '1015': tag_animal, '1016': tag_education, '1017': tag_shopping, '1018':tag_music}
    def __init__(self):
        """No per-instance state; only announces initialization."""

        print("初始化......")
    def gettagbytitle(self,title):
        """Return the id of the FIRST tag whose keyword occurs in
        *title*, or '1001' when nothing matches."""
        for key in self.tags_mapping:
            for tag in self.tags_mapping[key]:
                if title.__contains__(tag):
                    print(title+"匹配成功"+self.tags_mapping[key].__str__()+"tagid: "+key )
                    return key
        print(title + "匹配默认" +"tagid: 1001" )
        return '1001'
    def printf(self,title_map_list):
        """Debug print: each title with its (deduplicated) tag set."""
        for key in title_map_list:
            print("标题为"+key)
            print ("该标题所匹配的标签为"+set(title_map_list[key]).__str__())
    def querybycontent_tagset(self,content):
        """Return the set of ALL tag ids whose keywords occur in
        *content*; always includes the default '1001'."""
        tag_set=set()
        for key in self.tags_mapping:
            for tag in self.tags_mapping[key]:
                if content.__contains__(tag):
                    print(content + "匹配成功" + self.tags_mapping[key].__str__() + "tagid: " + key)
                    tag_set.add(key)
        tag_set.add('1001')
        return tag_set
    def addtags_2users_by_analyse_comment(self):
        """Tag every user in tb_matching from the keywords found in
        their tb_review_content comments (at most 6 tag columns)."""
        sql="select user_id,content from tb_review_content"
        db_list=self.mysql_s.getData(sql)
        table="tb_matching"
        for i in db_list:
            print(i[1])
            print(self.querybycontent_tagset(i[1]))

            # Pad to at least 6 entries so tag_id1..tag_id6 all bind.
            my_dict=list(self.querybycontent_tagset(i[1]))
            if len(my_dict)<6:
                for l in range(0,6):
                    my_dict.append('')
            # NOTE(review): SQL built by string interpolation — safe only
            # because tag ids are internal constants; do not extend this
            # pattern to user-supplied values (injection risk).
            sql = "update %s set tag_id1 = %s ,tag_id2=%s,tag_id3=%s ,tag_id4=%s,tag_id5=%s,tag_id6=%s where user_id=%s" % (
            table, '"' + my_dict[0] + '"', '"' + my_dict[1] + '"', '"' + my_dict[2] + '"',
            '"'+ my_dict[3] + '"', '"' + my_dict[4] + '"', '"' + my_dict[5] + '"', '"'+i[0] + '"')
            self.mysql_s.upData(my_dict,sql)
Beispiel #12
0
def init_mysql(machine_name="dbpolicy", use_flow=False):
    """Build a Mysql connection from the module-level MYSQL config dict.

    *machine_name* selects which host entry of MYSQL to connect to;
    *use_flow* is forwarded to the driver unchanged.
    """
    cfg = MYSQL
    return database.Mysql(cfg["user"],
                          cfg["password"],
                          cfg["database"],
                          host=cfg[machine_name],
                          use_flow=use_flow)
Beispiel #13
0
                "source_pattern": line['source'],
                'title_pattern': line['title'],
                'next_pattern': line['next_button'],
                'action_pattern': line['action'],
                'loc_id': loc_id,
                'file_count': 0,
                'author_id': 1,
            }
            res = db.insert_one('api_config', insert_config)
            if not res:
                id -= 1


if __name__ == '__main__':
    # Re-check every inactive config row: reactivate configs whose
    # target URL responds, delete the rest.
    # Earlier one-off jobs, kept for reference: search(),
    # merge_from_others(), migrations(), get_from_local(),
    # update_from_local(), migrant().
    db = database.Mysql('root', '990211', 'dbpolicy_web', host='121.36.22.40')
    for row in db.select('api_config', condition="is_active=0"):
        response = Request(row['target_url'], timeout=10).text
        if not response:
            # Dead target: drop the config entirely.
            db.delete('api_config', 'id="{}"'.format(row['id']))
            continue
        print(row)
        db.update(
            'update api_config set is_active=1 where id = "{}"'.format(
                row['id']))