def index():
    # Fetch once and unpack instead of calling get_url() three times.
    urls = get_url()
    return render_template(
        'index.html',
        variable1=urls[0],
        variable2=urls[1],
        variable3=urls[2],
    )
def get_page(url):
    html = gu.get_url(url).decode('utf-8')
    # 'current-comment-page' is 20 chars; +23 also skips the '">[' that
    # follows it, so the slice starts at the page number itself and runs
    # up to the closing ']'.
    a = html.find('current-comment-page') + 23
    b = html.find(']', a)
    return html[a:b]
def main(user_input):
    from model import model
    from nltk.tokenize import sent_tokenize
    from get_article_encodings import get_article_encodings
    from cluster_article_encodings import cluster_article_encodings
    from get_url import get_url
    from news_summarizer import news_summarizer
    # from plot_clusters import plot_clusters

    url = get_url(user_input)
    article = news_summarizer(url)
    article_sentences = sent_tokenize(article)
    encoding_model, encoding_tokenizer = model()
    article_encodings = get_article_encodings(article_sentences,
                                              encoding_tokenizer,
                                              encoding_model)
    kmeans, ymeans, summary = cluster_article_encodings(article_sentences,
                                                        article_encodings,
                                                        n=3)
    # Will maybe do something with the neat cluster plot later once we get
    # this all working.
    return summary
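# The pipeline above treats cluster_article_encodings as a black box. A
# minimal sketch of one plausible implementation, assuming scikit-learn
# k-means over the sentence encodings; the distance metric and the
# nearest-to-centroid selection rule are our assumptions, not the module's
# confirmed behavior.
import numpy as np
from sklearn.cluster import KMeans

def cluster_article_encodings(sentences, encodings, n=3):
    encodings = np.asarray(encodings)
    kmeans = KMeans(n_clusters=n, random_state=0).fit(encodings)
    ymeans = kmeans.labels_
    # For each centroid, keep the sentence whose encoding lies closest to it.
    picked = {
        int(np.argmin(np.linalg.norm(encodings - center, axis=1)))
        for center in kmeans.cluster_centers_
    }
    # Preserve original sentence order so the summary reads naturally.
    summary = ' '.join(sentences[i] for i in sorted(picked))
    return kmeans, ymeans, summary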
def __init__(self, cookie_file_name, proxy_pool, get_batch_size,
             sleep_seconds_between_batch):
    self.client = ZhihuClient(cookie_file_name)
    self.session = self.client._session
    self.proxy_pool = proxy_pool
    self.get_url = get_url.get_url(self.proxy_pool, self.session)
    self.get_batch_size = get_batch_size
    self.sleep_seconds_between_batch = sleep_seconds_between_batch
def method_name(url):
    html = get_url(url)
    root = etree.HTML(html)  # don't shadow the builtin `list`
    dishes_system_list = root.xpath(
        '//div[@class="list_s2"]/h1[@class="list_title"]/text()')[0]
    # Regex (.+?)菜: match up to and including the character 菜 ("cuisine").
    dishes_system_match = re.match(r'(.+?)菜', str(dishes_system_list))
    dishes_system = dishes_system_match.group()
    return dishes_system
def save_image(folder, img_addrs):
    for each in img_addrs:
        # Name the file after the last path segment and save it under `folder`
        # (the original ignored the `folder` argument).
        filename = os.path.join(folder, each.split('/')[-1])
        try:
            img = gu.get_url(each)
        except Exception:
            continue  # skip images that fail to download
        with open(filename, 'wb') as f:
            f.write(img)
def website(self) -> str:
    if self._website:
        return self._website
    try:
        self._website = get_url(self.name, self.date.year, self.date.month)
    except ValueError:
        # Fall back to the journal's landing page when no issue URL exists.
        self._website = self.journal_website
    return self._website
def __init__(self, seeds, logger, thread_num=5, max_depth=9, ranks=None,
             index=None):
    self.init_seeds_num = len(seeds)
    self.tocrawl = {seed: 0 for seed in seeds}  # {url: current_depth, ...}
    self.crawled = {}  # {url1: None, url2: None, ...}
    self.max_depth = max_depth  # traversal depth
    self.logger = logger
    self.ranks = ranks
    self.down_url = get_url.get_url(logger)
    self.indexing = indexing.indexing()
    if index:
        self.indexing.index.update(index)
    self.threadpool = thread_pool.thread_pool(thread_num)
    self.lock = threading.Lock()
def analyze(url):
    # Decide whether the address redirects, then pull the data we need
    # from the detail page.
    html = get_url(url)
    root = etree.HTML(html)  # don't shadow the builtin `list`
    # Dish name
    dish_name = root.xpath(
        '//*[@id="app"]/div[@class="recipe_header"]/div[1]'
        '/div[@class="recipe_header_info"]/h1/text()')[0]
    # View count
    browse_the_number_list = root.xpath(
        '//*[@id="app"]/div[@class="recipe_header"]/div[1]'
        '/div[@class="recipe_header_info"]/span[@class="info1"]/text()')[1]
    # Strip whitespace and the '·' separator first...
    browse_the_number_sub = re.sub(r'\s+|·', '', browse_the_number_list).strip()
    # ...then grab the count between 藏 ("favorites") and 浏 ("views").
    browse_the_number_match = re.findall(r'(?<=藏).+?(?=浏)',
                                         str(browse_the_number_sub))
    browse_the_number = browse_the_number_match[0]
    # Cooking method
    # craft = root.xpath(
    #     '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/div[@class="info2"]/div[1]/strong/text()')[0]
    # Flavor
    # taste = root.xpath(
    #     '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/div[@class="info2"]/div[2]/strong/text()')[0]
    # Time
    # time = root.xpath(
    #     '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/div[@class="info2"]/div[3]/strong/text()')[0]
    # Difficulty
    # difficulty = root.xpath(
    #     '//*[@id="app"]/div[@class="recipe_header"]/div[1]/div[@class="recipe_header_info"]/div[@class="info2"]/div[4]/strong/text()')[0]
    # Main ingredients
    main_ingredient_list = root.xpath(
        '//*[@id="app"]/div[@class="recipe_header"]/div[1]'
        '/div[@class="recipe_header_info"]/div[@class="recipe_ingredientsw"]'
        '/div[1]/div[2]/strong/a/text()')
    main_ingredient = ','.join(main_ingredient_list)
    # Secondary ingredients
    ingredient_list = root.xpath(
        '//*[@id="app"]/div[@class="recipe_header"]/div[1]'
        '/div[@class="recipe_header_info"]/div[@class="recipe_ingredientsw"]'
        '/div[2]/div[2]/strong/a/text()')
    ingredient = ','.join(ingredient_list)
    return dish_name, browse_the_number, main_ingredient, ingredient
def find_image(url):
    html = gu.get_url(url).decode('utf-8')
    img_addrs = []
    a = html.find('img src=')
    while a != -1:
        # Look for a '.jpg' within 255 chars of the tag start.
        b = html.find('.jpg', a, a + 255)
        if b != -1:
            # a + 9 skips 'img src="'; b + 4 includes the '.jpg' suffix.
            img_addrs.append('http:' + html[a + 9:b + 4])
        else:
            b = a + 9
        a = html.find('img src=', b)
    return img_addrs
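# A minimal driver sketch tying get_page, find_image, and save_image
# together; the 'page-N' URL scheme and the page count are assumptions for
# illustration, not part of the original snippets.
def download_images(folder, base_url, pages=5):
    current = int(get_page(base_url))  # newest comment-page number
    for n in range(current, current - pages, -1):
        page_url = base_url + 'page-' + str(n)  # hypothetical URL scheme
        save_image(folder, find_image(page_url))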
def get_band_url(letter='A', start=0, length=500):
    """Gets the listings displayed as alphabetical tables on M-A for input
    `letter`, starting at `start` and ending at `start` + `length`.

    Returns a `Response` object. Data can be accessed by calling the `json()`
    method of the returned `Response` object.
    """
    band_url = 'http://www.metal-archives.com/browse/ajax-letter/json/1/l/' + letter
    band_payload = {
        'sEcho': 0,                # if not set, response text is not valid JSON
        'iDisplayStart': start,    # set start index of band names returned
        'iDisplayLength': length,  # only response lengths of 500 work
    }
    if start == 0:
        print('Current letter = ', letter)
    r = gu.get_url(band_url, payload=band_payload)
    return r
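# A pagination sketch for the helper above. 'aaData' and 'iTotalRecords' are
# the standard DataTables response fields; treat their presence in this
# endpoint's JSON as an assumption.
def fetch_letter(letter='A', length=500):
    records, start = [], 0
    while True:
        data = get_band_url(letter=letter, start=start, length=length).json()
        records.extend(data['aaData'])  # assumed DataTables rows key
        start += length
        if start >= int(data['iTotalRecords']):
            return records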
def main():
    from selenium import webdriver

    pdf = get_pdf()
    print(pdf['URL'])
    download_pdf(pdf['URL'])
    sleep(3)
    url = get_url(pdf['file_name'])
    # url = get_url('AGEN_23_SAS_31-07_2serie.pdf')
    zoom = url['zoom']
    meeting_id = url['id']  # avoid shadowing the builtin `id`
    password = url['password']
    # Open Zoom for myself.
    driver = webdriver.Chrome(
        executable_path=r'E:\coding\python\chromedriver.exe')
    driver.get(zoom)
    # Run the JavaScript helper to send the message (raw f-string so the
    # backslashes in the Windows path survive).
    os.system(
        rf"node E:\coding\other\class-url-automation\src\send_text_wpp\index.js "
        rf"{zoom} {pdf['URL']} {meeting_id} {password}"
    )
def get_review_url(date="2019-01", start=0, length=200):
    """Gets the review listings displayed as date-ordered tables on M-A for
    input `date`, starting at `start` and ending at `start` + `length`.

    Returns a `Response` object. Data can be accessed by calling the `json()`
    method of the returned `Response` object.
    """
    review_url = '/review/ajax-list-browse/by/date/selection/' + date + '/json/1'
    review_payload = {
        'sEcho': 1,
        'iColumns': 7,
        'iDisplayStart': start,
        'iDisplayLength': length
    }
    if start == 0:
        print('Current month = ', date)
    r = gu.get_url(review_url, payload=review_payload)
    return r
def on_data(self, status):
    # Define the variables we need.
    tweet = json.loads(status)
    tweet_username = tweet["user"]["screen_name"]
    parent_tweet_id = tweet["in_reply_to_status_id"]
    # Determine whether the tweet was a reply to a news article or a
    # standalone mention.
    if parent_tweet_id is not None:
        # Grab the URL from the parent tweet and summarize it.
        url = get_url(tc.BEARER_TOKEN, parent_tweet_id)
        summary = summry(url)
        # Split the summary into multiple tweets in case it's longer than
        # 280 characters. A tweet can be at most 280 characters, and we have
        # to leave room for the mention too (+ 2 accounts for the @ symbol
        # and the space after the username).
        summary_split = []
        split_size = 280 - (len(tweet_username) + 2)
        for index in range(0, len(summary), split_size):
            summary_split.append(summary[index:index + split_size])
        # Tweet each piece as a reply to the previous tweet, creating a thread.
        bot_twitter_id = "1329945307996594177"
        tweet_reply_to = tweet["id"]
        for summary_section in summary_split:
            api.update_status(f"@{tweet_username} {summary_section}",
                              in_reply_to_status_id=tweet_reply_to)
            # Grab the id of the tweet just posted and reply to it next.
            tweet = api.user_timeline(id=bot_twitter_id, count=1)[0]._json
            tweet_reply_to = tweet["id"]
    else:
        # Reply to the tweet with help instructions.
        api.update_status(
            f"Hi @{tweet_username} ! I see you've mentioned me but without a "
            f"news article for me to summarize. UwU. Mention me when replying "
            f"to a news article, yeah? <3 OwO",
            in_reply_to_status_id=tweet["id"])
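# The chunking logic above can be pulled out into a small helper for reuse
# and testing; the name split_for_thread is ours, not the bot's.
def split_for_thread(summary, username, limit=280):
    """Split `summary` into pieces that still fit after the '@username ' prefix."""
    size = limit - (len(username) + 2)  # '@' plus the trailing space
    return [summary[i:i + size] for i in range(0, len(summary), size)]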
# -*- coding: utf-8 -*-
'''
Created on 2015-08-30

@author: yx
'''
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from get_url import get_url
import time

if __name__ == '__main__':
    get_url()
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('comment_scrapy')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
    print("get once!!")
    print("update url!!" + ' at ' +
          time.strftime("%Y-%m-%d %H:%M", time.localtime()))
from get_url import get_url
from get_web import get_web
import os

get_web('http://image.baidu.com/', '/tmp/baidu.html')
# Escape the dot before the extension; a non-capturing group keeps
# findall-style matchers returning the whole URL rather than just 'jpg'.
img_url = r'http://[-.\w/]+\.(?:jpg|png|jpeg|gif)'
urls = get_url(img_url, '/tmp/baidu.html')
img_dir = '/tmp/baidu/'
if not os.path.exists(img_dir):
    os.mkdir(img_dir)
for url in urls:
    fname = os.path.join(img_dir, url.split('/')[-1])
    try:
        get_web(url, fname)
    except Exception:
        pass  # skip downloads that fail
# -*- coding: utf-8 -*-
'''
Created on 2015-08-30

@author: yx
'''
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from get_url import get_url
import time

if __name__ == '__main__':
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('comment_scrapy')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
    print("get once!!")
    get_url()
    print("update url!!" + ' at ' +
          time.strftime("%Y-%m-%d %H:%M", time.localtime()))
def test_get_url_1():
    test_url = "https://academic.oup.com/qje/issue/130/2"
    assert get_url('QJE', 2015, 5) == test_url
import os
from get_url import get_url
from getweb import get_web

get_web('http://www.tedu.cn/', '/tmp/tedu.html')
# Non-capturing group so findall-style matchers return the whole URL.
img_url = r'http://[.\w/-]+\.(?:jpg|png|jpeg|gif)'
urls = get_url(img_url, '/tmp/tedu.html')
img_dir = '/tmp/images'
if not os.path.exists(img_dir):
    os.mkdir(img_dir)
for url in urls:
    fname = os.path.join(img_dir, url.split('/')[-1])
    try:
        get_web(url, fname)
    except Exception:
        pass  # skip downloads that fail
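# Why the non-capturing group matters in the two image scrapers above: with a
# capturing group, re.findall returns only the group's text, not the whole match.
import re

html = '<img src="http://x.example/a.jpg">'
print(re.findall(r'http://[.\w/-]+\.(jpg|png)', html))    # ['jpg']
print(re.findall(r'http://[.\w/-]+\.(?:jpg|png)', html))  # ['http://x.example/a.jpg']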
def test_get_url_4():
    test_url = ("https://www.sciencedirect.com/journal/"
                "journal-of-urban-economics/vol/88/issue/0")
    assert get_url('JUE', 2015, 7) == test_url
def test_get_url_5():
    with pytest.raises(ValueError):
        get_url('JEEM', 2015, 2)
def test_get_url_3():
    with pytest.raises(ValueError):
        get_url('JPE', 2015, 3)
def test_get_url_2():
    with pytest.raises(ValueError):
        get_url('QJE', 1800, 5)
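# Taken together, the tests pin down get_url's contract: map a journal code
# and (year, month) to that issue's URL, raising ValueError when no issue
# exists for that month or year. A sketch of the QJE branch only; the
# quarterly schedule and volume arithmetic are assumptions read off
# test_get_url_1 and test_get_url_2, not the real implementation.
def get_url(journal, year, month):
    if journal == 'QJE':
        if year < 1886:  # assumed first volume year; 1800 must raise
            raise ValueError('year out of range')
        if month not in (2, 5, 8, 11):  # assumed quarterly issue months
            raise ValueError('no QJE issue that month')
        volume = year - 1885      # 2015 -> 130, matching the test
        issue = (month + 1) // 3  # May -> 2, matching the test
        return f'https://academic.oup.com/qje/issue/{volume}/{issue}'
    raise ValueError('unsupported journal: ' + journal)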
pat = re.compile(
    r"^#EXT-X-STREAM-INF.+BANDWIDTH=(?P<bandwidth>\d+)"
    r".*(?:\n|\r\n?)(?P<stream>.+)",
    re.MULTILINE)
dst_fname = {
    "m3u8": "vlc.m3u8",
    "html": "tv_bl.html",
    "xspf": "vlc.xspf"
}[fmt]
req_clns = ["ts_port"]
for cnxt in rewrite_channels(dst_fname, req_clns, fmt=fmt):
    # :TRICKY: no dedicated column of its own
    hls_idx = cnxt.clns["ts_port"] + 1
    url = cnxt.row[hls_idx]
    if url.startswith("http://"):
        print(name, url)
        try:
            with contextlib.closing(get_url.get_url(url)) as pf:
                txt = pf.read()
        except get_url.URLError:
            pass
        else:
            # Pick the variant stream with the highest bandwidth.
            max_bw, max_url = 0, None
            for m in pat.finditer(txt):
                bw = int(m.group('bandwidth'))
                if not max_bw or max_bw < bw:
                    max_bw = bw
                    max_url = m.group('stream')
            assert max_url
            max_url = o_p.join(os.path.dirname(url), max_url)
            write_channel(cnxt, max_url)
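# A quick check of the pattern above against a minimal HLS master playlist:
sample = (
    "#EXTM3U\n"
    "#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1280000\n"
    "low/index.m3u8\n"
    "#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2560000\n"
    "high/index.m3u8\n"
)
for m in pat.finditer(sample):
    print(m.group('bandwidth'), m.group('stream'))
# 1280000 low/index.m3u8
# 2560000 high/index.m3u8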
    # If the response fails, r.json() will raise an exception, so retry.
    except JSONDecodeError:
        print('JSONDecodeError on attempt ', attempt, ' of 10.')
        print('Retrying...')
        continue
    break

# Fetch review title and content.
review_titles = []
reviews = []
print('Fetching review content...')
for n, link in enumerate(df['ReviewLink']):
    time.sleep(3)
    print('Review #', n + 1)
    linksoup = BeautifulSoup(link, 'html.parser')
    review_page = gu.get_url(linksoup.a['href'])
    review_page.encoding = encoding
    review_soup = BeautifulSoup(review_page.text, 'html.parser')
    review_title = review_soup.find_all('h3')[0].text.strip()[:-6]
    review_titles.append(review_title)
    review = review_soup.find_all('div', {'class': 'reviewContent'})[0].text
    reviews.append(review)

# Store review data & save to disk.
df['ReviewTitle'] = review_titles
df['ReviewContent'] = reviews
df['DateScraped'] = today_date
f_name = 'data/MA-reviews_' + date + '_' + '%03d' % i + '.csv'
print('Writing chunk to csv file:', f_name)
df.to_csv(f_name)