def run(md_file, name, email, no_contents, template_dir):
    title, summary, body = get_content.get_content(md_file)
    r = False
    # remove the title from the body string
    filename = body.replace(title, '')
    if md_file[-3:].lower() == 'rmd':
        warning('RMD file detected. Processing...')
        r = True
        filename = process_r_markdown(md_file)
    # autoescape should really be True, but enabling it stops the HTML from rendering
    env = Environment(loader=FileSystemLoader(template_dir), autoescape=False)
    template = env.get_template('report_template.html')
    output_from_parsed_template = template.render(
        content=get_html.render_markdown_content(filename),
        title=title,
        summary=summary,
        author_name=name,
        author_email=email,
        no_contents=no_contents,
        r=r)
    return output_from_parsed_template
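# Hedged usage sketch: how run() might be invoked. The markdown file name,
# author details, and output path below are assumptions for illustration,
# not part of the original module.
if __name__ == '__main__':
    html = run('report.md', 'Jane Doe', 'jane@example.com',
               no_contents=False, template_dir='templates')
    with open('report.html', 'w') as out:
        out.write(html)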
def crawl_tamsu(num=randint(6621, 12128), path='data/tamsu/', header_name='tamsu_'):
    if not os.path.exists(path):
        os.makedirs(path)
    urls = get_urls_new(VNEXPRESS_TAMSU, num)
    for url in urls:
        new = get_content(url)
        if new != '':
            print("Write url: {}".format(url))
            save_content(path, header_name, new)
def crawl_thegioi(num=randint(6621, 12128), path='data/thegioi/', header_name='thegioi_'):
    urls = get_urls_new(VNEXPRESS_THEGIOI, num)
    if not os.path.exists(path):
        os.makedirs(path)
    for url in urls:
        new = get_content(url)
        if new != '':
            print("Write url: {}".format(url))
            save_content(path, header_name, new)
def crawl_giaoduc(num=randint(6621, 12128), path='data/giaoduc/', header_name='giaoduc_'):
    if not os.path.exists(path):
        os.makedirs(path)
    urls = get_urls_new(VNEXPRESS_GIAODUC, num)
    for url in urls:
        new = get_content(url)
        if new != '':
            print("Write url: {}".format(url))
            save_content(path, header_name, new)
def crawl_phapluat(num=randint(6621, 12128), path='data/phapluat/', header_name='phapluat_'):
    if not os.path.exists(path):
        os.makedirs(path)
    urls = get_urls_new(VNEXPRESS_PHAPLUAT, num)
    for url in urls:
        new = get_content(url)
        if new != '':
            print("Write url: {}".format(url))
            save_content(path, header_name, new)
def crawl_giaitri(num=randint(6621, 12128), path='data/giaitri/', header_name='giaitri_'):
    urls = get_urls_new(VNEXPRESS_GIAITRI, num)
    if not os.path.exists(path):
        os.makedirs(path)
    for url in urls:
        new = get_content(url)
        if new != '':
            print("Write url: {}".format(url))
            save_content(path, header_name, new)
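# The crawl_* functions above rely on get_urls_new() and save_content(), which
# are defined elsewhere in the project. A minimal sketch of what they might look
# like follows; the requests/BeautifulSoup approach, the article-link selector,
# and the numbered-file naming scheme are assumptions, not the project's actual
# implementation.
import os

import requests
from bs4 import BeautifulSoup


def get_urls_new_sketch(category_url, num):
    # Collect up to `num` article links from a vnexpress category page (assumed markup).
    html = requests.get(category_url).text
    soup = BeautifulSoup(html, 'lxml')
    links = [a['href'] for a in soup.select('h3.title-news a[href]')]
    return links[:num]


def save_content_sketch(path, header_name, content):
    # Write one article to a sequentially numbered text file under `path`.
    index = len(os.listdir(path))
    file_name = os.path.join(path, '{}{}.txt'.format(header_name, index))
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(content)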
def main():
    print("""
    Welcome, you are using the ContentGetter.
    Although ContentGetter uses Python Cryptography to encrypt your saved information,
    please do not use it to store important passwords for sites such as your bank,
    Amazon, Facebook, or Instagram, where hackers could be a potential threat.
    It is fine to use it to store information such as your CV details; if you are
    copying and pasting the same text all the time to fill out online applications,
    this could make your life easier!
    """)
    print("Press Y/y to continue; press any other key to quit")
    user_input = input("Are you going to use the ContentGetter?> ")
    if user_input.lower() == 'y':
        path = get_path()
        file_name = get_file_name()
        while True:
            user_want = input("Would you like to (add), (get), or (del) content? (Press any other key to quit):> ")
            if user_want.lower() == "add":
                # add an account and its content to the file
                account = get_account()
                content = enter_content()
                content_key_writer(path, file_name, account, content)
            elif user_want.lower() == "get":
                # get the content for the account from the file
                account = get_account()
                get_content(path, file_name, account)
            elif user_want.lower() == 'del':
                # delete the account from the file
                account = get_account()
                del_account(path, file_name, account)
            else:
                break
    else:
        print("Thank you for using ContentGetter.")
def get_table_droppingodds():
    referer = "https://www.google.com"
    url = "https://www.arbworld.net/en/droppingodds"
    r = get_content.get_content(url, referer)
    soup = BeautifulSoup(r.text, 'lxml')
    events_file = filein2
    table1 = soup.find("table", {'class': 'grid'})
    for cof_list1 in table1.select("tr.heading"):
        header_cof = []
        for cof1 in cof_list1.select("td"):
            header_cof.append(cof1.get_text(separator=' '))
        with open(events_file, 'w') as csvfile:
            writercol = csv.DictWriter(csvfile, fieldnames=header_cof[3:14])
            writercol.writeheader()
    table = soup.find('table', {'id': 'matches'})
    for cof_list in table.select("tr.belowHeader"):
        cof_list_list = []
        for cof in cof_list.select("td"):
            cof_list_list.append(cof.get_text(separator=' '))
        if len(cof_list_list[2:13]) == 0:
            continue
        # odds in column 6 dropped by at least `drop`
        if float(cof_list_list[6].split()[0]) - float(cof_list_list[6].split()[1]) >= drop:
            with open(events_file, 'a') as csvfile:
                writerrow = csv.writer(csvfile)
                row = (cof_list_list[2], cof_list_list[3], cof_list_list[4], cof_list_list[5],
                       cof_list_list[6], "", "", "", "", cof_list_list[11], cof_list_list[12])
                writerrow.writerow(row)
        # odds in column 8 dropped by at least `drop`
        if float(cof_list_list[8].split()[0]) - float(cof_list_list[8].split()[1]) >= drop:
            with open(events_file, 'a') as csvfile:
                writerrow = csv.writer(csvfile)
                row = (cof_list_list[2], cof_list_list[3], cof_list_list[4], cof_list_list[5],
                       "", "", cof_list_list[8], "", "", cof_list_list[11], cof_list_list[12])
                writerrow.writerow(row)
        # odds in column 10 dropped by at least `drop`
        if float(cof_list_list[10].split()[0]) - float(cof_list_list[10].split()[1]) >= drop:
            with open(events_file, 'a') as csvfile:
                writerrow = csv.writer(csvfile)
                row = (cof_list_list[2], cof_list_list[3], cof_list_list[4], cof_list_list[5],
                       "", "", "", "", cof_list_list[10], cof_list_list[11], cof_list_list[12])
                writerrow.writerow(row)
def get_table_1x2():
    referer = "https://www.google.com"
    url = "https://www.arbworld.net/en/moneyway/mw-1-x-2"
    r = get_content.get_content(url, referer)
    soup = BeautifulSoup(r.text, 'lxml')
    events_file = filein1
    table1 = soup.find("table", {'class': 'grid'})
    for cof_list1 in table1.select("tr.heading"):
        header_cof = []
        for cof1 in cof_list1.select("td"):
            header_cof.append(cof1.get_text(separator=' '))
        with open(events_file, 'w') as csvfile:
            writercol = csv.DictWriter(csvfile, fieldnames=header_cof[3:14])
            writercol.writeheader()
    table = soup.find('table', {'id': 'matches'})
    for cof_list in table.select("tr.belowHeader"):
        cof_list_list = []
        for cof in cof_list.select("td"):
            cof_list_list.append(cof.get_text(separator=' '))
        if len(cof_list_list[2:13]) == 0:
            continue
        # only events with total volume above `vol`
        if int(cof_list_list[12].replace('€', '').replace(' ', '')) > vol:
            if float(cof_list_list[9].replace('€', '').replace('%', '').split()[0]) > percent:
                if float(cof_list_list[6]) > cof_cof:
                    with open(events_file, 'a') as csvfile:
                        writerrow = csv.writer(csvfile)
                        writerrow.writerow(cof_list_list[2:13])
            if float(cof_list_list[10].replace('€', '').replace('%', '').split()[0]) > percent:
                if float(cof_list_list[7]) > cof_cof:
                    with open(events_file, 'a') as csvfile:
                        writerrow = csv.writer(csvfile)
                        writerrow.writerow(cof_list_list[2:13])
            if float(cof_list_list[11].replace('€', '').replace('%', '').split()[0]) > percent:
                if float(cof_list_list[8]) > cof_cof:
                    with open(events_file, 'a') as csvfile:
                        writerrow = csv.writer(csvfile)
                        writerrow.writerow(cof_list_list[2:13])
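# get_content.get_content(url, referer) is imported from another module of this
# project and is not shown here. A plausible minimal sketch, assuming it simply
# wraps requests and sets a Referer header before returning the response; the
# User-Agent value is also an assumption.
import requests


def get_content_sketch(url, referer):
    headers = {
        'Referer': referer,
        'User-Agent': 'Mozilla/5.0',  # assumed; some sites reject the default requests agent
    }
    return requests.get(url, headers=headers)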
def amin(start_url='http://pstu.ru'):
    content = get_content(start_url)
    tokens = get_tokens(content)
    for token in tokens:
        if is_xlsx_link(token):
            xlsx_content = xlsx_get_content(link_from_token(token))
            xlsx_tokens = tokenize_xlsx(xlsx_content)
            for cell in xlsx_content:
                time, lecture, teacher, room = parse(cell)
                rezult.append((link_from_token(token), xlsx_tokens, time, lecture, teacher, room))
        elif is_normal_link(link_from_token(token)):
            print(2, token)
            # amin(link_from_token(token))
    # rezult.append(start_url, tokens)
    rezult.append(start_url)
    rezult.append(tokens)
    return rezult
from flask import Flask, render_template, request

import get_content
import url_renderer
import extract_content
import datetime
import extract_form

content = get_content.get_content(url_renderer.get_current_url())
app = Flask(__name__)


@app.route('/')
def presidency():
    start_time = datetime.datetime(2000, 1, 1, 19)
    header = extract_content.header(content)
    sections = [extract_content.section1(content),
                extract_content.section2(content),
                extract_content.section3(content),
                extract_content.section4(content)]
    for section in sections:
        for item in section.items:
            item.time = start_time.time()
            start_time += datetime.timedelta(minutes=item.duration)
    return render_template("sections.html", sections=sections, header=header, start_time=start_time)


@app.route('/', methods=['POST'])
def form_post():
    tree, section_title = extract_form.create_tree(request.form)
import get_content
from bs4 import BeautifulSoup
import csv

vol = 700
percent = 90
cof_cof = 1.6

referer = "https://www.google.com"
url = "https://www.arbworld.net/en/moneyway/mw-overunder"
r = get_content.get_content(url, referer)
soup = BeautifulSoup(r.text, 'lxml')
events_file = "arb_over_under" + ".csv"

table1 = soup.find("table", {'class': 'grid'})
for cof_list1 in table1.select("tr.heading"):
    header_cof = []
    for cof1 in cof_list1.select("td"):
        header_cof.append(cof1.get_text(separator=' '))
    with open(events_file, 'w') as csvfile:
        writercol = csv.DictWriter(csvfile, fieldnames=header_cof[3:13])
        writercol.writeheader()

table = soup.find('table', {'id': 'matches'})
for cof_list in table.select("tr.belowHeader"):
    cof_list_list = []
import get_content
from bs4 import BeautifulSoup
import json

url = "https://www.parimatch.com"
r = get_content.get_content(url, url).content
data = {}
soup = BeautifulSoup(r, 'lxml')

menu = soup.find('div', {'id': 'lobbyLeftHolder'})
list_group_of_sports = menu.find('div', {'id': 'lobbySportsHolder'})
for list_group in list_group_of_sports.find_all('ul', {'class': 'groups'}):
    group_name = list_group.findPrevious('a').get_text()
    data[group_name] = []
    for list_group_events in list_group.find_all('li'):
        for item in list_group_events.select("a"):
            name_group_events = item.get_text()
            id_group_events = item['hd']
            href_group_events = item['href']
            data[group_name].append({
                'name': name_group_events,
                'id': id_group_events,
                'href': href_group_events
            })

with open('group_events_list.json', mode='w', encoding='UTF-8') as wfile:
    json.dump(data, wfile, indent=4, ensure_ascii=False)
# -*- coding: utf-8 -*-
import re

import jieba.analyse

import get_content

for content, active_id in get_content.get_content():
    sentence = re.sub("<.*?>", "", content)
    ret = jieba.analyse.extract_tags(sentence, topK=20, withWeight=True, allowPOS=())
    active_id = active_id.split("/")[-1]
    key_words = ""
    for k, v in ret:
        if v > 0.05:
            key_words = key_words + k + ","
    print(key_words, active_id)
    break
from get_links import get_links
from get_content import get_html, get_date, get_content, save
from tqdm import tqdm

for i in tqdm(range(1, 570)):
    url = f'https://vnexpress.net/kinh-doanh/chung-khoan-p{i}'
    try:
        links = get_links(url)
        for link in links:
            try:
                soup = get_html(link)
                content = get_content(soup)
                date, year = get_date(soup)
                save(link.replace("https://vnexpress.net/", "").replace(".html", ".txt"),
                     content, year, date, "vnexpress-data")
            except Exception:
                print("error:", link)
    except Exception:
        print("error:", url)
def find_field(self):
    '''
    Find out which field units are going to be patched inside the EC scope.
    '''
    if VERBOSE:
        print("Into: find_field(): Finding out which unit field is going to be patched inside EC scope.")
    for OR in self.OR_info:
        OR["field_unit"] = []
        OR_path = OR["path"] + '.' + OR["name"]
        content = get_content.get_content(self.dsdt_splited, OR_path)
        for field in content.split("}")[:-1]:
            field = field.split('{')[1]  # Remove the field header
            store_flag = False  # Is there any field larger than 16 bits in this method?
            field_content_splln = field.split('\n')
            offset_bits = 0  # offset in bits
            name = ''
            size = 0
            for item in field_content_splln:
                if ',' not in item:  # Skip empty lines
                    continue
                elif "Offset" not in item:
                    item_spl = item.split(',')
                    name = item_spl[0].strip()
                    size = int(item_spl[1].strip())
                    if size > 8 and name != '':
                        if offset_bits / 8 - int(offset_bits / 8) != 0:
                            print(FIELD_UNIT_OFFSET_ERR)
                            exit(2)
                        OR["field_unit"].append({
                            "name": name,
                            "offset": int(offset_bits / 8),
                            "size": size,
                            "OR_path": OR_path
                        })
                        store_flag = True
                    offset_bits += size
                else:
                    item = item.strip()
                    offset = re.search(r'Offset \((.*)\)', item).group(1)
                    offset_bits = int(offset, 16) * 8
            if store_flag:
                # Store this OperationRegion and its units
                if OR["storage"] not in self.RW_method:
                    while True:
                        # Generate a new R/W method name
                        letter = random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ')
                        OR['RE1B'] = 'R1B' + letter
                        OR['RECB'] = 'REB' + letter
                        OR['ERM2'] = 'MEM' + letter
                        OR['WE1B'] = 'W1B' + letter
                        OR['WECB'] = 'WRB' + letter
                        if (self.dsdt_content.find(OR['RE1B']) == -1 and OR['RE1B'] not in self.RW_method) \
                                and (self.dsdt_content.find(OR['RECB']) == -1 and OR['RECB'] not in self.RW_method) \
                                and (self.dsdt_content.find(OR['ERM2']) == -1 and OR['ERM2'] not in self.RW_method) \
                                and (self.dsdt_content.find(OR['WE1B']) == -1 and OR['WE1B'] not in self.RW_method) \
                                and (self.dsdt_content.find(OR['WECB']) == -1 and OR['WECB'] not in self.RW_method):
                            # Loop until no existing name clashes with the generated one
                            break
                    # Add the content of the R/W method to self.RW_method
                    self.RW_method += RW_METHOD[0] + \
                        OR["path"] + RW_METHOD[1] + \
                        OR['RE1B'] + RW_METHOD[2] + \
                        OR['ERM2'] + RW_METHOD[3] + \
                        OR["storage"] + RW_METHOD[4] + \
                        OR['ERM2'] + RW_METHOD[5] + \
                        OR['RECB'] + RW_METHOD[6] + \
                        OR['RE1B'] + RW_METHOD[7] + \
                        OR['WE1B'] + RW_METHOD[8] + \
                        OR['ERM2'] + RW_METHOD[9] + \
                        OR["storage"] + RW_METHOD[10] + \
                        OR['ERM2'] + RW_METHOD[11] + \
                        OR['WECB'] + RW_METHOD[12] + \
                        OR['WE1B'] + RW_METHOD[13]
                else:
                    # OR['storage'] is already covered in self.RW_method but not in this OR,
                    # so copy the read/write method names from another OR with the same storage
                    for _OR_ in self.OR_info:
                        if _OR_['storage'] == OR['storage']:
                            OR['RE1B'] = _OR_['RE1B']
                            OR['RECB'] = _OR_['RECB']
                            OR['ERM2'] = _OR_['ERM2']
                            OR['WE1B'] = _OR_['WE1B']
                            OR['WECB'] = _OR_['WECB']
                            break
    # Removing items while iterating skips elements, so rebuild the list instead
    self.OR_info = [OR for OR in self.OR_info if len(OR['field_unit']) >= 1]
    if "RECB" not in self.RW_method:
        print(NOT_NEED_TO_PATCH_MSG)
        exit(0)
    if VERBOSE:
        for OR in self.OR_info:
            print(OR['path'] + '.' + OR['name'] + ', ' + OR['storage'])
            for unit in OR['field_unit']:
                print(' -', unit)
import get_content
import init

init.init()
with open(init.DEFAULT_URLS_FILE) as myfile:
    for url in myfile:
        get_content.get_content(url.strip())

with open(init.DEFAULT_HTML_FILE, 'a') as f:
    closed_tag = "\n</html>"
    f.write(closed_tag)
from get_content import get_content
from send_mail import send_mail

if __name__ == '__main__':
    data = get_content()
    send_mail('*****@*****.**', 'title', data)
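# send_mail() is imported from send_mail.py, which is not shown here. A minimal
# sketch using only the standard library, assuming a plain-text message sent
# through a local SMTP relay; the host, port, and sender address are placeholders.
import smtplib
from email.mime.text import MIMEText


def send_mail_sketch(to_addr, subject, body):
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = 'sender@example.com'  # placeholder sender
    msg['To'] = to_addr
    with smtplib.SMTP('localhost', 25) as server:  # assumed local SMTP relay
        server.send_message(msg)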
from get_urls import get_urls
from get_content import get_content
from get_jsons import get_jsons
from get_comments import getComments, write_comments

if __name__ == "__main__":
    commentsData = []
    urls = get_urls()
    for i, url in enumerate(urls):
        print('Getting:', url)
        # get_response(str(i + 1), url)
        html_content = get_content(url)
        json_data = get_jsons(html_content)
        comments = getComments(json_data)
        for comment in comments:
            comment.insert(0, 'No.' + str(i) + ' url')
            commentsData.append(comment)
    write_comments(commentsData)
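# write_comments() comes from get_comments.py and is not shown here. A minimal
# sketch, assuming each comment is a list of fields and that the output file
# name below is a placeholder:
import csv


def write_comments_sketch(comments_data, out_file='comments.csv'):
    with open(out_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(comments_data)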