Example #1
def starts():
    url = "http://www.zhilianfm.com/zlfmCms/"
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    if reponse.status_code == 200:
        # Get the article number
        number = getUrl(reponse)
        while True:
            if rechecking(number, come_from="zhilianfm_alerts"):
                break
            data = download(number)
            if data:
                break
            number -= 1
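Every example here funnels through a shared rechecking(number, come_from=...) helper that is not shown in these snippets; per the comments, it reports whether an article number from a given source has already been downloaded into the database. A minimal sketch of what such a helper could look like, assuming a local SQLite table keyed on (come_from, number); the table, file name, and record-on-first-check behavior are illustrative guesses, not the project's actual code:

import sqlite3

# Hypothetical dedup store: the real project's rechecking() is not shown.
_conn = sqlite3.connect("seen.db")
_conn.execute("CREATE TABLE IF NOT EXISTS seen ("
              "come_from TEXT, number TEXT, PRIMARY KEY (come_from, number))")

def rechecking(number, come_from):
    # True if this (source, number) pair was already seen; otherwise record it
    cur = _conn.execute("SELECT 1 FROM seen WHERE come_from=? AND number=?",
                        (come_from, str(number)))
    if cur.fetchone():
        return True
    _conn.execute("INSERT INTO seen VALUES (?, ?)", (come_from, str(number)))
    _conn.commit()
    return False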
Example #2
def getUrl(reponse, url):
    url_news = url
    html = reponse.text
    # Filter out the article links
    pattern = re.compile(r'/[a-z]*?/\d+\.html')
    urls = re.findall(pattern, html)
    urls = list(set(urls))
    for i in urls:
        pattern = re.compile(r'\d+')
        number = re.findall(pattern, i)[0]
        if rechecking(number, come_from="bibaodao") or int(number) < 1000:
            break
        url = url_news + i
        download(number, url)
        break
Example #3
def getUrl(html):
    # Extract the news URLs from the page
    urls = html.xpath('//a/@href')
    for url in urls:
        # Check whether this is a valid article URL
        pattern = re.compile(r'(/[\s\S]*?/)(\d+)(#commentBox)')
        url_num = re.findall(pattern, url)
        if url_num:
            number = url_num[0][1]
            if rechecking(number, come_from="lianshijie7234"):
                return True
            url_data = url_num[0][0] + url_num[0][1]
            data = connect(url_data, number)
            if data == "end":
                return True
Example #4
def getUrl(reponse):
    html = reponse.text
    pattern = re.compile(r'/[^\s]*\.html')
    urls = re.findall(pattern, html)
    urls = list(set(urls))
    for i in urls:
        url = "http://shilian.com" + i
        pattern_num = re.compile(r'\d+')
        # Join every digit group in the URL into a single identifier
        number = "".join(re.findall(pattern_num, url))
        if rechecking(number, come_from="shilian"):
            break
        download(url, number)
Example #5
def download(reponse):
    print("coingogo_alerts")
    html = reponse.text
    data = json.loads(html)
    texts = data["list"]
    for text in texts:
        number = text["id"]
        if rechecking(number, come_from="coingogo_alerts"):
            return True
        createtime = text["createtime"]
        timeout = "%s年%s月%s日  %s:%s  %s" % (
            createtime["year"], createtime["mon"], createtime["mday"],
            createtime["hours"], createtime["minutes"], createtime["weekday"])
        storage(number, timeout, text)
Example #6
def download(reponse):
    print("bishequ_alerts")
    html = reponse.text
    texts = json.loads(html)
    data = texts["newsList"]
    for text in data:
        number = text["id"]
        if rechecking(number, come_from="bishequ_alerts"):
            return True
        content_text = etree.HTML(text["content"])
        content = content_text.xpath('//p/text()')
        if not content:
            content = content_text.xpath('//p/span/text()')
        # createTime is a millisecond epoch; convert to a readable local time
        timeout = time.asctime(time.localtime(int(text["createTime"]) / 1000))
        storage(text, content, timeout)
Example #7
def findNumber(html):
    html = json.loads(html)
    data = html["data"]
    reload = 0
    for text in data:
        number = text["fcNewsId"]
        if rechecking(number, come_from="huolian"):
            if reload == 3:
                return True
            else:
                reload += 1
                continue
        # Use a fresh name so the list bound to `data` is not re-bound mid-loop
        result = connent(number)
        if result:
            return True
Example #8
def getUrl(reponse):
    # Get the news URLs
    pattern = re.compile(r'[a-zA-Z]+://[^\s]*\.html')
    urls = re.findall(pattern, reponse.text)
    urls = list(set(urls))
    for url in urls:
        pattern_live = re.compile("live")
        num = re.findall(pattern_live, url)
        if not len(num):
            # Extract the news item's number
            pattern_number = re.compile(r'\d+')
            number = re.findall(pattern_number, url)[0]
            # Check whether it has already been downloaded into the database
            if rechecking(number, come_from="bitrating"):
                break
            download(number, url)
Example #9
def getUrl(reponse):
    print("polo321")
    # Get the news listing information
    html = reponse.text
    texts = json.loads(html)
    data = texts["data"]
    # Pull out the item list
    texts = data["list"]
    # If nothing came back, we have paged past the end; stop paging
    if not texts:
        return True
    for text in texts:
        # Extract the number, then fetch the full article by number
        number = text["id"]
        if rechecking(number, come_from="polo321"):
            return True
        download(number)
Example #10
def getUrl(reponse):
    html = reponse.text
    pattern = re.compile(r'/Content/[^\s]*?data=[^\s]*?__2C__2C')
    urls = re.findall(pattern, html)
    urls = list(set(urls))
    for i in urls:
        # Skip video pages
        pattern = re.compile('video')
        video = re.findall(pattern, i)
        if video:
            continue
        url = "https://ihuoqiu.com" + i
        pattern = re.compile(r'(/Content/[^\s]*?data=)([^\s]*?__2C__2C)')
        num = re.findall(pattern, url)[0]
        number = num[1]
        if rechecking(number, come_from="ihuoqiu"):
            continue
        download(url, number)
Example #11
def download(reponse, url):
    try:
        print("btc123_alerts")
        html = reponse.text
        # Parse the JSON document into a dict
        text = json.loads(html)
        data = text["data"]
        for findOne in data:
            number = findOne["id"]
            if rechecking(number, "btc123_alerts"):
                return True
            # Get a more precise timestamp
            timeout = findOne["createText"]
            release_time = UTCTime(timeout)
            storage(findOne, release_time)
    except Exception as err:
        mistake(url, err)
Example #12
def download(text):
    try:
        print("coinvoice_alerts")
        text = etree.HTML(text)
        number = text.xpath('//div[@class="date"]/@data-time')[0]
        if rechecking(number, come_from="coinvoice_alerts"):
            return True
        title = text.xpath('//div[@class="title"]/text()')[0]
        # The displayed time is only approximate
        timeout = text.xpath('//div[@class="date"]/text()')[0]
        timeout_new = UTCTime(timeout)
        timeout = timeout_new + " --- " + timeout + "前左右"
        # Get the body text
        main_text = text.xpath('//div[@class="summary"]/text()')[0]
        storage(number, title, timeout, main_text)
    except Exception as err:
        mistake(url="http://www.coinvoice.cn/category/kuaixun", err=err)
Example #13
def getUrl(news):
    for new in news:
        url = "http://youjiatuanjian.com" + new
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        if reponse.status_code == 200:
            html = etree.HTML(reponse.text)
            # Get the article number
            pattern_num = re.compile(r'\d+')
            number = re.findall(pattern_num, url)[0]
            # Check whether it is already in the database
            if rechecking(number, come_from="youjiatuanjian"):
                return
            download(html, number, url)
        else:
            err = reponse.status_code
            mistake(url, err)
Example #14
def starts():
    url = "http://longkuai.com/"
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    if reponse.status_code == 200:
        # Get the article number
        number = getUrl(reponse)
        while True:
            if rechecking(number, come_from="longkuai"):
                break
            data = connent(number)
            if data:
                break
            number -= 1
    else:
        err = reponse.status_code
        mistake(url, err)
Example #15
def connect(number):
    url = "http://www.bikuai.org/news/%s.html" % number
    while True:
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        if reponse.status_code == 200:
            html = etree.HTML(reponse.text)
            download(html, number)
            data = downURL(html)
            url = data
            pattern_num = re.compile(r'\d+')
            number = int(re.findall(pattern_num, url)[0])
            if rechecking(number, come_from="bikuai"):
                break
        else:
            err = reponse.status_code
            mistake(url, err)
            break
Example #16
def download(html, url):
    try:
        print("huoxing24")
        # Get the article number
        pattern_num = re.compile(r'\d+')
        number = re.findall(pattern_num, url)[1]
        # Check whether it is already in the database
        if rechecking(number, come_from="huoxing24"):
            return
        # Match the publication date (use search(): findall would return group tuples here)
        pattern_time = re.compile('([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})-(((0[13578]|1[02])-(0[1-9]|[12][0-9]|3[01]))|((0[469]|11)-(0[1-9]|[12][0-9]|30))|(02-(0[1-9]|[1][0-9]|2[0-8])))')
        time = re.search(pattern_time, html).group(0)
        # Match the title
        pattern_title = re.compile(r'<h1 style[\s\S]*?</h1>')
        titles = re.findall(pattern_title, html)[0]
        title = titles.split()[-2]
        # Match the subtitle
        pattern_subhead = re.compile(r'<h2>[\s\S]*?</h2>')
        fu_title = re.findall(pattern_subhead, html)[0]
        subhead = fu_title[4: -5]
        # Match the source of the article
        pattern_source = re.compile(r'本文来源: <span>[\s\S]*?</span>')
        sources = re.findall(pattern_source, html)[0]
        pattern = re.compile(r'>[\s\S]*?<')
        source = re.findall(pattern, sources)[0][1: -1] + "--" + url
        # Match the author (first run of Chinese characters)
        pattern_authors = re.compile(r'<p class="author">[\s\S]*?</p>')
        authors = re.findall(pattern_authors, html)[0]
        pattern_author = re.compile(r'[\u4e00-\u9fa5]+')
        author = re.findall(pattern_author, authors)[0]
        # Extract the article body
        down_page = etree.HTML(html)
        texts = down_page.xpath('//div[@class=""]')[0]
        text = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()
        # Store the information
        storage(title, author, subhead, time, source, text, number)
    except Exception as err:
        mistake(url, err)
Example #17
def connect(url, reload):
    # Make the request
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf8"
    if reponse.status_code == 200:
        pattern_num = re.compile("\d+")
        number = re.findall(pattern_num, url)[0]
        if rechecking(number, come_from="zaoping"):
            # Items arrive in no reliable order: tolerate up to 3 consecutive already-seen hits before ending
            if reload == 3:
                return "end"
            else:
                reload += 1
                return "continue"
        print("zaoping")
        download(reponse, number)
    else:
        err = reponse.status_code
        mistake(url, err)
        return "end"
Example #18
def getUrl(html):
    urls = html.xpath(
        '/html/body/section/div[1]/div/article/header/h2/a/@href')
    # Get the author for each article
    authors = html.xpath(
        '/html/body/section/div[1]/div/article/p[1]/span[1]/text()')
    for url, author in zip(urls, authors):
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        # The article's number
        pattern = re.compile(r'\d+')
        number = re.findall(pattern, url)[0]
        if rechecking(number, come_from="epcnn"):
            return True
        if reponse.status_code == 200:
            html = etree.HTML(reponse.text)
            download(html, author, number, url)
        else:
            err = reponse.status_code
            mistake(url, err)
Example #19
def download(url, html):
    try:
        # Match the alerts' publication time
        time = html.xpath(
            '//*[@id="kuaixun-wrap"]/div/div[1]/div[1]/text()')[0]
        texts = html.xpath('//*[@id="view"]/li')
        n = 1
        for text in texts:
            # Get the unique number of each alert
            num = html.xpath(
                '//*[@id="view"]/li[%s]/a/div[1]/div[2]/@onclick' % n)
            pattern = re.compile(r'\d+')
            number = re.findall(pattern, num[0])[0]
            # Check whether it is already in the database
            if rechecking(number, come_from="youjiatuanjian_alerts"):
                return
            downloadOneMessage(text, time, url, number)
            n += 1
    except Exception as err:
        mistake(url, err)
Example #20
def download(reponse, url):
    try:
        print("weilaicaijing_alerts")
        html = reponse.text
        # Parse the JSON document into a dict
        text = json.loads(html)
        data = text["data"][0]
        timeout = data["time"]
        kuaixun_list = data["list"]
        for findOne in kuaixun_list:
            number = findOne["id"]
            if rechecking(number, "weilaicaijing_alerts"):
                return True
            time_hour = findOne["hour"]
            release_time = timeout + "  " + time_hour
            pattern = re.compile("【[\s\S]*?】")
            title = re.findall(pattern, findOne["text"])[0]
            storage(findOne, release_time, number, title)
    except Exception as err:
        mistake(url, err)
Example #21
def connect(url):
    # Fetch pages in a loop
    while True:
        pattern = re.compile(r'\d+')
        number = re.findall(pattern, url)[0]
        if rechecking(number, come_from="daoqm"):
            break
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        if reponse.status_code == 200:
            html = etree.HTML(reponse.text)
            data = download(html, number)
            if data:
                break
            data = downURL(html)
            url = data
        else:
            err = reponse.status_code
            mistake(url, err)
            break
Example #22
def connent(number):
    # Connect to the URL and fetch data in a loop
    url = "http://www.haitunbc.com/page68.html?article_id=%s" % number
    while True:
        pattern = re.compile(r'\d+')
        number = re.findall(pattern, url)[1]
        if rechecking(number, come_from="haitunbc"):
            break
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        if reponse.status_code == 200:
            download(reponse, number, url)
            data = gainBelowUrl(reponse)
            if data == "end":
                break
            url = data
        else:
            err = reponse.status_code
            mistake(url, err)
            break
Example #23
def connect(number):
    while True:
        # Fetch items in a loop
        try:
            url = "http://www.btc798.com/articles/%s.html" % number
            if rechecking(number, come_from="btc798"):
                break
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
        except TimeoutError:
            time.sleep(10)
            continue
        if reponse.status_code == 200:
            html = etree.HTML(reponse.text)
            download(html, number)
            number = downURL(html)
        else:
            err = reponse.status_code
            mistake(url, err)
            break
Example #24
def getUrl(html):
    # Split the page into per-item blocks
    pattern = re.compile(r'<div class="desc">[\s\S]*?</time>')
    texts = re.findall(pattern, html)
    for text in texts:
        # Get the URL and publication time
        pattern_url = re.compile(r'[a-zA-Z]+://[^\s]*\.html')
        url = re.findall(pattern_url, text)[0]
        pattern_num = re.compile(r'\d+')
        number = int(re.findall(pattern_num, url)[0])
        if rechecking(number, come_from="budkr"):
            break
        pattern_time = re.compile(r'\d+-\d+-\d+ \d+:\d+')
        timeout = re.findall(pattern_time, text)[0]
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        if reponse.status_code == 200:
            download(reponse, timeout, number)
        else:
            err = reponse.status_code
            mistake(url, err)
Example #25
def starts():
    reload = 0
    url = "http://www.leilook.com/"
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    if reponse.status_code == 200:
        # Get the article number
        number = getUrl(reponse)
        while True:
            if rechecking(number, come_from="leilook"):
                break
            data = connent(number, reload)
            if data == "pictrue":
                number -= 1
                continue
            elif data == "over":
                break
            number -= 1
    else:
        err = reponse.status_code
        mistake(url, err)
Example #26
def getUrl(reponse):
    html = reponse.text
    pattern = re.compile('/news/\d+')
    urls = re.findall(pattern, html)
    urls = list(set(urls))
    for i in urls:
        url = "http://www.coingogo.com" + i
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        # Get the item's number
        pattern = re.compile(r'\d+')
        number = re.findall(pattern, url)[0]
        if rechecking(number, come_from="coingogo"):
            continue
        if reponse.status_code == 200:
            data = download(reponse, number, url)
            if data:
                continue
        else:
            err = reponse.status_code
            mistake(url, err)
Example #27
def download(reponse_branch, url, branch):
    try:
        print("hecaijiing")
        # Get the number
        pattern = re.compile(r'\d+')
        number = re.findall(pattern, url)[0]
        # Check whether it is already in the database
        if rechecking(number, come_from="hecaijing"):
            return
        html = etree.HTML(reponse_branch.text)
        # Get the title, time, author, source, and body
        title = html.xpath('/html/body/div[5]/div[1]/h1/text()')[0]
        author_compile = html.xpath('/html/body/div[5]/div[1]/p[2]/span[1]/text()')[0]
        author_name = html.xpath('/html/body/div[5]/div[1]/p[2]/span[2]/text()')[0].split()[0]
        author = author_compile + author_name
        times = html.xpath('/html/body/div[5]/div[1]/p[1]/text()')[0].split()
        time = times[1]
        source = ("核财经-%s:" % branch) + url + "--" + times[0]
        main_text = html.xpath('/html/body/div[5]/div[1]/div[3]')[0]
        text = etree.tostring(main_text, method="text", encoding="utf8").decode("utf8").split()
        storage(number, title, author, time, source, text)
    except Exception as err:
        mistake(url, err)
Example #28
def download(html, time, n, text):
    try:
        print("fn_alerts")
        # Get the number
        source_url = html.xpath(
            '//*[@id="wrap"]/div/div/div/div[2]/div[%s]/div[1]/h2/a/@href' %
            n)[0]
        pattern_num = re.compile(r'\d+')
        number = re.findall(pattern_num, source_url)[0]
        # Check whether it is already in the database
        if rechecking(number, come_from="fn_alerts"):
            return
        # Get the text
        main_text = etree.tostring(text, method="text",
                                   encoding="utf8").decode("utf8").split()
        n += 1
        # Get the title, time, source, and body
        title = main_text[1]
        timeout = time + " " + main_text[0]
        mains = main_text[2:-8]
        source = "FN资讯:" + source_url
        storage(number, title, timeout, source, mains)
    except Exception as err:
        mistake(text, err)
Example #29
def download(reponse):
    try:
        print("babifinance")
        html = reponse.text
        texts = json.loads(html)
        for text in texts:
            number = text["id"]
            if rechecking(number, come_from="babifinance"):
                return True
            # Separate out the needed fields
            data = text["content"].split()
            pattern = re.compile("来源")
            exist = re.findall(pattern, data[0])
            if exist:
                source = data[0]
                reload = -1
                while True:
                    statement = data[reload]
                    if statement != "&nbsp;":
                        statement = data[reload]
                        pattern = re.compile("作者")
                        exist = re.findall(pattern, data[0])
                        if exist:
                            author = data[0]
                            main = data[1: reload]
                            storage(author, source, statement, main, text)
                            break
                        else:
                            reload -= 1
                            author = data[reload]
                            while True:
                                if author != "&nbsp":
                                    pattern = re.compile('编辑|作者')
                                    exist = re.findall(pattern, author)
                                    if exist:
                                        if len(author) < 30:
                                            main = data[1: reload]
                                            storage(author, source, statement, main, text)
                                            break
                                        else:
                                            reload += 1
                                            author = "BABI财经"
                                            main = data[1: reload]
                                            storage(author, source, statement, main, text)
                                            break
                                    else:
                                        author = "BABI财经"
                                        reload += 1
                                        main = data[1: reload]
                                        storage(author, source, statement, main, text)
                                        break
                                else:
                                    reload -= 1
                            break
                    else:
                        reload -= 1
            else:
                source = "BABI财经"
                reload = -1
                while True:
                    statement = data[reload]
                    if statement != "&nbsp;":
                        statement = data[reload]
                        pattern = re.compile("作者")
                        exist = re.findall(pattern, data[0])
                        if exist:
                            author = exist[0]
                            main = data[1: reload]
                            storage(author, source, statement, main, text)
                            break
                        else:
                            reload -= 1
                            author = data[reload]
                            while True:
                                if author != "&nbsp":
                                    pattern = re.compile('编辑|作者')
                                    exist = re.findall(pattern, author)
                                    if exist:
                                        if len(author) < 30:
                                            main = data[1: reload]
                                            storage(author, source, statement, main, text)
                                            break
                                        else:
                                            reload += 1
                                            author = "BABI财经"
                                            main = data[1: reload]
                                            storage(author, source, statement, main, text)
                                            break
                                    else:
                                        author = "BABI财经"
                                        reload += 1
                                        main = data[: reload]
                                        storage(author, source, statement, main, text)
                                        break
                                else:
                                    reload -= 1
                            break
                    else:
                        reload -= 1
    except Exception as err:
        mistake(url="http://www.babifinance.com/", err=err)