Example no. 1
def index():
    # Call get_url() once and reuse the result instead of fetching three times
    urls = get_url()
    return render_template(
        'index.html',
        variable1=urls[0],
        variable2=urls[1],
        variable3=urls[2],
    )
def get_page(url):
    html = gu.get_url(url).decode('utf-8')

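    # The +23 offset presumably skips the 'current-comment-page' marker itself
    # (20 characters) plus the few characters that sit between it and the page
    # number, so html[a:b] below is just the number up to the closing ']'.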
    a = html.find('current-comment-page') + 23
    b = html.find(']', a)

    return html[a:b]
Example no. 3
def main(user_input):
    # Imports are kept local to the function so they are only loaded when main() runs
    from model import model
    from nltk.tokenize import sent_tokenize
    from get_article_encodings import get_article_encodings
    from cluster_article_encodings import cluster_article_encodings
    from get_url import get_url
    from news_summarizer import news_summarizer
    # from plot_clusters import plot_clusters

    url = get_url(user_input)
    article = news_summarizer(url)
    article_sentences = sent_tokenize(article)

    encoding_model, encoding_tokenizer = model()
    article_encodings = get_article_encodings(article_sentences,
                                              encoding_tokenizer,
                                              encoding_model)
    kmeans, ymeans, summary = cluster_article_encodings(article_sentences,
                                                        article_encodings,
                                                        n=3)

    # TODO: plot the clusters (plot_clusters) once the pipeline is stable

    return summary
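
A minimal usage sketch for the pipeline above, assuming user_input is whatever get_url expects here (the query below is purely hypothetical):

if __name__ == '__main__':
    # Hypothetical query; get_url() is assumed to resolve it to an article URL
    print(main("renewable energy adoption"))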
Example no. 4
    def __init__(self, cookie_file_name, proxy_pool, get_batch_size,
                 sleep_seconds_between_batch):
        self.client = ZhihuClient(cookie_file_name)
        self.session = self.client._session
        self.proxy_pool = proxy_pool
        self.get_url = get_url.get_url(self.proxy_pool, self.session)
        self.get_batch_size = get_batch_size
        self.sleep_seconds_between_batch = sleep_seconds_between_batch
Example no. 5
def method_name(url):
    html = get_url(url)
    selector = etree.HTML(html)  # renamed from `list` to avoid shadowing the built-in
    dishes_system_list = selector.xpath(
        '//div[@class="list_s2"]/h1[@class="list_title"]/text()')[0]
    # regex (.+?)菜 -- match the cuisine name ending in 菜 (e.g. 川菜)
    dishes_system_match = re.match(r'(.+?)菜', str(dishes_system_list))
    dishes_system = dishes_system_match.group()
    return dishes_system
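
To make the (.+?)菜 pattern above concrete, here is a tiny self-contained check (the list title is made up); note that .group() keeps the trailing 菜 while .group(1) would drop it:

import re

m = re.match(r'(.+?)菜', '鲁菜的家常做法')  # hypothetical list_title text
print(m.group())    # '鲁菜'  -- full match, as method_name() returns above
print(m.group(1))   # '鲁'    -- only the lazily captured prefix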
def save_image(folder, img_addrs):
    # Note: files are written to the current directory; `folder` is presumably
    # chdir'ed into by the caller before this runs.
    for each in img_addrs:
        filename = each.split('/')[-1]
        try:
            img = gu.get_url(each)
        except Exception:
            # Skip addresses that fail to download instead of leaving empty files behind
            continue
        with open(filename, 'wb') as f:
            f.write(img)
Example no. 7
    def website(self) -> str:
        if self._website:
            return self._website
        else:
            try:
                issues_url = get_url(self.name, self.date.year,
                                     self.date.month)
                self._website = issues_url
            except ValueError:
                self._website = self.journal_website

        return self._website
    def __init__(self, seeds, logger, thread_num=5, max_depth=9, ranks=None, index=None):
        self.init_seeds_num = len(seeds)
        self.tocrawl = {}
        for seed in seeds:
            self.tocrawl[seed] = 0  # {url: current_depth, ...}
        self.crawled = {}           # {url1: None, url2: None, ...}
        self.max_depth = max_depth  # traversal depth
        self.logger = logger
        self.ranks = ranks
        self.down_url = get_url.get_url(logger)
        self.indexing = indexing.indexing()
        if index:
            self.indexing.index.update(index)
        self.threadpool = thread_pool.thread_pool(thread_num)
        self.lock = threading.Lock()
Example no. 9
def analyze(url):
    # Decide whether the address is a redirect,
    # then pull the data from the detail page
    html = get_url(url)
    selector = etree.HTML(html)
    # dish name
    dish_name = \
        selector.xpath('//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/h1/text()')[0]
    # view count
    browse_the_number_list = selector.xpath(
        '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/span[@class="info1"]/text()'
    )[1]
    # regex matching
    # first strip whitespace and the '·' separator
    browse_the_number_sub = re.sub(r'\s+|·', '', browse_the_number_list).strip()
    # pull out the view count (the text between 藏 and 浏)
    browse_the_number_match = re.findall(r'(?<=藏).+?(?=浏)',
                                         str(browse_the_number_sub))
    browse_the_number = browse_the_number_match[0]
    # cooking method
    # craft = selector.xpath(
    #     '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/div[@class="info2"]/div[1]/strong/text()')[
    #     0]
    # flavor
    # taste = selector.xpath(
    #     '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/div[@class="info2"]/div[2]/strong/text()')[
    #     0]
    # time
    # time = selector.xpath(
    #     '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/div[@class="info2"]/div[3]/strong/text()')[
    #     0]
    # difficulty
    # difficulty = selector.xpath(
    #     '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/div[@class="info2"]/div[4]/strong/text()')[
    #     0]
    # main ingredients
    main_ingredient_list = selector.xpath(
        '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/div[@class="recipe_ingredientsw"]/div[1]/div[2]/strong/a/text()'
    )
    main_ingredient = ','.join(main_ingredient_list)
    # secondary ingredients
    ingredient_list = selector.xpath(
        '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/div[@class="recipe_ingredientsw"]/div[2]/div[2]/strong/a/text()'
    )
    ingredient = ','.join(ingredient_list)
    return dish_name, browse_the_number, main_ingredient, ingredient
def find_image(url):
    html = gu.get_url(url).decode('utf-8')
    img_addrs = []

    # Locate each 'img src=' attribute and slice out the address up to '.jpg'
    a = html.find('img src=')

    while a != -1:
        b = html.find('.jpg', a, a + 255)

        if b != -1:
            # a + 9 skips past 'img src="'; b + 4 keeps the '.jpg' extension
            htmln = 'http:' + html[a + 9:b + 4]
            img_addrs.append(htmln)
        else:
            b = a + 9
        a = html.find('img src=', b)

    return img_addrs
Example no. 11
def get_band_url(letter='A', start=0, length=500):
    """Gets the listings displayed as alphabetical tables on M-A for input
    `letter`, starting at `start` and ending at `start` + `length`.
    Returns a `Response` object. Data can be accessed by calling the `json()`
    method of the returned `Response` object."""

    band_url = 'http://www.metal-archives.com/browse/ajax-letter/json/1/l/' + letter

    band_payload = {
        'sEcho': 0,  # if not set, response text is not valid JSON
        'iDisplayStart': start,  # set start index of band names returned
        'iDisplayLength': length  # only response lengths of 500 work
    }
    if start == 0:
        print('Current letter = ', letter)

    r = gu.get_url(band_url, payload=band_payload)
    return r
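
A short usage sketch based on the docstring above; only the call and the json() decode are shown, since the layout of the decoded payload depends on the Metal-Archives response:

# Fetch the first 500 band entries for the letter 'B' and decode the JSON body
r = get_band_url(letter='B', start=0, length=500)
data = r.json()   # works because get_band_url returns a Response object
print(type(data))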
Example no. 12
def main():
    from selenium import webdriver
    pdf = get_pdf()
    print(pdf['URL'])
    download_pdf(pdf['URL'])
    sleep(3)
    url = get_url(pdf['file_name'])
    # url = get_url('AGEN_23_SAS_31-07_2serie.pdf')
    zoom = url['zoom']
    meeting_id = url['id']  # renamed from `id` to avoid shadowing the built-in
    password = url['password']
    # Open Zoom for myself
    driver = webdriver.Chrome(
        executable_path=r'E:\coding\python\chromedriver.exe')
    driver.get(zoom)
    # Run the JavaScript helper that sends the message
    os.system(
        rf"node E:\coding\other\class-url-automation\src\send_text_wpp\index.js {zoom} {pdf['URL']} {meeting_id} {password}"
    )
Example no. 13
def get_review_url(date="2019-01", start=0, length=200):
    """Gets the review listings displayed as alphabetical tables on M-A for
    input `letter`, starting at `start` and ending at `start` + `length`.
    Returns a `Response` object. Data can be accessed by calling the `json()`
    method of the returned `Response` object."""

    review_url = '/review/ajax-list-browse/by/date/selection/' + date + '/json/1'

    review_payload = {
        'sEcho': 1,
        'iColumns': 7,
        'iDisplayStart': start,
        'iDisplayLength': length
    }
    if start == 0:
        print('Current month = ', date)

    r = gu.get_url(review_url, payload=review_payload)

    return r
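
Because start and length act as paging parameters, a whole month can be walked in fixed-size steps; a hedged sketch (the stopping condition depends on the actual payload, so only the paging arithmetic is shown):

month = '2019-01'
for start in range(0, 1000, 200):              # upper bound chosen arbitrarily for the sketch
    r = get_review_url(date=month, start=start, length=200)
    data = r.json()                            # Response.json(), as the docstring notes
    # ... stop once the returned listing is empty or shorter than 200 entries ...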
Example no. 14
    def on_data(self, status):
        # Parse the incoming status and pull out the fields we need
        tweet = json.loads(status)
        tweet_username = tweet["user"]["screen_name"]
        parent_tweet_id = tweet["in_reply_to_status_id"]

        # Determine whether the tweet is a reply to a news article or a stand-alone mention
        if parent_tweet_id is not None:
            # Grabs the url from the parent tweet and summarizes it
            url = get_url(tc.BEARER_TOKEN, parent_tweet_id)
            summary = summry(url)

            # Split the summary into multiple tweets (in case it's longer than 280 characters)
            # A tweet can be at most 280 characters. We have to consider the length of the mention too.
            # (+ 2 to account for the @ symbol and the space after the username)
            summary_split = []
            split_size = 280 - (len(tweet_username) + 2)
            for index in range(0, len(summary), split_size):
                summary_split.append(
                    summary[index:min(index + split_size, len(summary))])

            # For each split, tweet it as a reply to the previous tweet, creating a thread
            bot_twitter_id = "1329945307996594177"
            tweet_reply_to = tweet["id"]
            for summary_section in summary_split:
                api.update_status(f"@{tweet_username} {summary_section}",
                                  in_reply_to_status_id=tweet_reply_to)
                # Grabs the id of the tweet just tweeted and assigns it to be the next reply_id
                tweet = api.user_timeline(id=bot_twitter_id, count=1)[0]
                tweet = tweet._json
                tweet_reply_to = tweet["id"]

        else:
            # Replies to the tweet with help instructions
            api.update_status(
                f"Hi @{tweet_username} ! I see you've mentioned me but without a news article for me to summarize. UwU. "
                f"Mention me when replying to a news article, yeah? <3 OwO",
                in_reply_to_status_id=tweet["id"])
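
The 280-character arithmetic in the splitting loop above can be checked in isolation; the username below is made up:

username = "daily_news_bot"                # 14 characters (hypothetical)
split_size = 280 - (len(username) + 2)     # 280 - 16 = 264 summary characters per tweet
summary = "x" * 600                        # stand-in for a long summary
chunks = [summary[i:i + split_size] for i in range(0, len(summary), split_size)]
print([len(c) for c in chunks])            # [264, 264, 72] -- every reply fits the limit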
Example no. 15
# -*-coding:utf-8-*-
'''
Created on 2015-08-30

@author: yx
'''
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
import multiprocessing
from get_url import get_url
import time

if __name__ == '__main__':
    get_url()
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('comment_scrapy')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
    print "get once!!"

    print "update url!!" + ' at ' + time.strftime("%Y-%m-%d %H:%M",
                                                  time.localtime())
Example no. 16
from get_url import get_url
from get_web import get_web
import os

get_web('http://image.baidu.com/', '/tmp/baidu.html')
img_url = r'http://[-.\w/]+\.(jpg|png|jpeg|gif)'
urls = get_url(img_url, '/tmp/baidu.html')
img_dir = '/tmp/baidu/'
if not os.path.exists(img_dir):
    os.mkdir(img_dir)

for url in urls:
    fname = os.path.join(img_dir, url.split('/')[-1])
    try:
        get_web(url, fname)
    except Exception:
        # Skip images that fail to download
        pass
Example no. 17
# -*-coding:utf-8-*-
'''
Created on 2015-08-30

@author: yx
'''
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
import multiprocessing
from get_url import get_url
import time

if __name__ == '__main__':
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('comment_scrapy')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
    print "get once!!"
    get_url()
    print "update url!!" + ' at ' + time.strftime("%Y-%m-%d %H:%M", time.localtime())
Example no. 18
def test_get_url_1():
    test_url = "https://academic.oup.com/qje/issue/130/2"
    assert get_url('QJE', 2015, 5) == test_url
Example no. 19
import os
from get_url import get_url
from getweb import get_web

get_web('http://www.tedu.cn/', '/tmp/tedu.html')
img_url = r'http://[.\w/-]+\.(jpg|png|jpeg|gif)'
urls = get_url(img_url, '/tmp/tedu.html')
img_dir = '/tmp/images'

if not os.path.exists(img_dir):
    os.mkdir(img_dir)

for url in urls:
    fname = os.path.join(img_dir, url.split('/')[-1])
    try:
        get_web(url, fname)
    except Exception:
        # Skip images that fail to download
        pass
Example no. 20
def test_get_url_4():
    test_url = ("https://www.sciencedirect.com/journal/"
                "journal-of-urban-economics/vol/88/issue/0")
    assert get_url('JUE', 2015, 7) == test_url
Example no. 21
def test_get_url_5():
    with pytest.raises(ValueError):
        get_url('JEEM', 2015, 2)
Example no. 22
def test_get_url_3():
    with pytest.raises(ValueError):
        get_url('JPE', 2015, 3)
Example no. 23
def test_get_url_2():
    with pytest.raises(ValueError):
        get_url('QJE', 1800, 5)
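
Taken together, these tests pin down the calling convention: get_url(journal_code, year, month) returns an issue URL for supported combinations and raises ValueError otherwise. A small defensive-use sketch (safe_get_url is a hypothetical wrapper):

def safe_get_url(journal, year, month):
    """Return the issue URL, or None when get_url rejects the combination."""
    try:
        return get_url(journal, year, month)
    except ValueError:
        return None

print(safe_get_url('QJE', 2015, 5))   # the OUP issue URL asserted in test_get_url_1
print(safe_get_url('QJE', 1800, 5))   # None, since test_get_url_2 expects ValueError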
Example no. 24
        pat = re.compile(r"^#EXT-X-STREAM-INF.+BANDWIDTH=(?P<bandwidth>\d+).*(?:\n|\r\n?)(?P<stream>.+)", re.MULTILINE)
        dst_fname = {
            "m3u8": "vlc.m3u8",
            "html": "tv_bl.html",
            "xspf": "vlc.xspf"
        }[fmt]

        req_clns = ["ts_port"]
        for cnxt in rewrite_channels(dst_fname, req_clns, fmt=fmt):
            # :TRICKY: no dedicated column of its own; use the slot right after ts_port
            hls_idx = cnxt.clns["ts_port"] + 1
            url = cnxt.row[hls_idx]
            
            if url.startswith("http://"):
                print(name, url)
                try:
                    with contextlib.closing(get_url.get_url(url)) as pf:
                        txt = pf.read()
                except get_url.URLError:
                    pass
                else:
                    max_bw, max_url = 0, None
                    for m in pat.finditer(txt):
                        bw = int(m.group('bandwidth'))
                        if not max_bw or max_bw < bw:
                            max_bw = bw
                            max_url = m.group('stream')
                    assert max_url
                    max_url = o_p.join(os.path.dirname(url), max_url)
                    write_channel(cnxt, max_url)
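
The variant-selection loop above hinges on the #EXT-X-STREAM-INF regex; here is a self-contained check against a hand-written master playlist (the sample content is made up):

import re

pat = re.compile(r"^#EXT-X-STREAM-INF.+BANDWIDTH=(?P<bandwidth>\d+).*(?:\n|\r\n?)(?P<stream>.+)",
                 re.MULTILINE)
sample = ("#EXTM3U\n"
          "#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1280000\n"
          "low/index.m3u8\n"
          "#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2560000\n"
          "high/index.m3u8\n")
for m in pat.finditer(sample):
    print(m.group('bandwidth'), m.group('stream'))
# prints: 1280000 low/index.m3u8
#         2560000 high/index.m3u8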
Example no. 25
            # If the response fails, r.json() will raise an exception, so retry
            except JSONDecodeError:
                print('JSONDecodeError on attempt ', attempt, ' of 10.')
                print('Retrying...')
                continue
            break

        # Fetch review title and content
        review_titles = []
        reviews = []
        print('Fetching review content...')
        for n, link in enumerate(df['ReviewLink']):
            time.sleep(3)
            print('Review #', n + 1)
            linksoup = BeautifulSoup(link, 'html.parser')
            review_page = gu.get_url(linksoup.a['href'])
            review_page.encoding = encoding
            review_soup = BeautifulSoup(review_page.text, 'html.parser')
            review_title = review_soup.find_all('h3')[0].text.strip()[:-6]
            review_titles.append(review_title)
            review = review_soup.find_all('div',
                                          {'class': 'reviewContent'})[0].text
            reviews.append(review)

        # Store review data & save to disk
        df['ReviewTitle'] = review_titles
        df['ReviewContent'] = reviews
        df['DateScraped'] = today_date
        f_name = 'data/MA-reviews_' + date + '_' + '%03d' % i + '.csv'
        print('Writing chunk to csv file:', f_name)
        df.to_csv(f_name)
Example no. 26
        pat = re.compile(
            r"^#EXT-X-STREAM-INF.+BANDWIDTH=(?P<bandwidth>\d+).*(?:\n|\r\n?)(?P<stream>.+)",
            re.MULTILINE)
        dst_fname = {
            "m3u8": "vlc.m3u8",
            "html": "tv_bl.html",
            "xspf": "vlc.xspf"
        }[fmt]

        req_clns = ["ts_port"]
        for cnxt in rewrite_channels(dst_fname, req_clns, fmt=fmt):
            # :TRICKY: no dedicated column of its own; use the slot right after ts_port
            hls_idx = cnxt.clns["ts_port"] + 1
            url = cnxt.row[hls_idx]

            if url.startswith("http://"):
                print(name, url)
                try:
                    with contextlib.closing(get_url.get_url(url)) as pf:
                        txt = pf.read()
                except get_url.URLError:
                    pass
                else:
                    max_bw, max_url = 0, None
                    for m in pat.finditer(txt):
                        bw = int(m.group('bandwidth'))
                        if not max_bw or max_bw < bw:
                            max_bw = bw
                            max_url = m.group('stream')
                    assert max_url
                    max_url = o_p.join(os.path.dirname(url), max_url)
                    write_channel(cnxt, max_url)