def download():
    db.connect()
    aufmacher = Aufmacher.select()
    aufmacher_length = len(aufmacher)
    for index, auf in enumerate(aufmacher):
        # derive the local file path from the article's unique_id URL
        path_to_file = auf.unique_id.replace("http://xml.zeit.de/", "")
        xml_file_path = (Path("xml") / path_to_file).with_suffix(".xml")
        if xml_file_path.is_file():
            continue  # already downloaded
        print("{}/{}".format(index, aufmacher_length), xml_file_path)
        # .parent is already a Path, so no extra Path() wrapper is needed
        xml_file_path.parent.mkdir(parents=True, exist_ok=True)
        article_content = download_article(auf.unique_id)
        if article_content:
            print("writing", xml_file_path)
            with open(xml_file_path, "w") as xml_file:
                xml_file.write(article_content)
        else:
            print("error!")
    db.close()
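# download_article() is referenced above but not defined in this snippet. A
# minimal sketch of what such a helper might look like, assuming the article
# XML is fetched over plain HTTP with requests (hypothetical implementation,
# not necessarily the repo's actual one):
import requests

def download_article(unique_id):
    """Fetch the raw article XML; return None on any HTTP error."""
    response = requests.get(unique_id)  # the unique_id doubles as the URL
    if response.status_code == 200:
        return response.text
    return None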
def db_init():
    db.connect()
    try:
        print('Creating tables...')
        db.create_tables([Movie])
    except OperationalError:
        pass  # tables already exist
    db.close()
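# These snippets all assume a module-level peewee database handle and model
# classes defined elsewhere. A minimal sketch of that setup, with names taken
# from the code above (backend, path, and fields are assumptions; the real
# models will have more fields):
from peewee import SqliteDatabase, Model, CharField, OperationalError

db = SqliteDatabase("app.db")

class Movie(Model):
    title = CharField()

    class Meta:
        database = db  # bind the model to the shared handle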
def clean_up():
    db.connect()
    # LEFT OUTER JOIN so Aufmacher rows without a TweetJob still come back,
    # carrying an empty joined instance under the 'tweetjob' alias
    aufmachers = (Aufmacher
                  .select(Aufmacher, TweetJob)
                  .join(TweetJob, JOIN.LEFT_OUTER,
                        on=(Aufmacher.id == TweetJob.aufmacher).alias('tweetjob'))
                  .order_by(Aufmacher.created_at.desc()))
    for aufmacher in aufmachers:
        if not aufmacher.tweetjob.id:
            # no TweetJob yet: saving the empty joined instance creates one
            print(model_to_dict(aufmacher.tweetjob))
            aufmacher.tweetjob.save()
    db.close()
def get_db(dict_cursor=True):
    mysqldb = db.get_db()
    # if not in a request we won't be connected
    if not mysqldb:
        mysqldb = db.connect()
    # return a DictCursor
    if dict_cursor:
        return mysqldb.cursor(MySQLdb.cursors.DictCursor)
    else:
        return mysqldb.cursor()
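# Usage sketch: with dict_cursor=True (the default) rows come back keyed by
# column name instead of as positional tuples. Table and column names here
# are hypothetical.
cursor = get_db()
cursor.execute("SELECT id, title FROM movies LIMIT 1")
row = cursor.fetchone()
if row:
    print(row["title"])  # DictCursor lets you index by column name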
def scrape():
    r = requests.get(DOWNLOAD_URL)
    soup = BeautifulSoup(r.text, "html.parser")
    # select_one returns the first match or None
    teaser = soup.select_one(".main article")
    if teaser is None:
        return
    # normalize the scheme so IDs match the stored http:// form
    unique_id = teaser["data-unique-id"].strip().replace("https", "http")
    db.connect()
    db.create_tables([Image, Author, Aufmacher, TweetJob], safe=True)
    possible_duplicate = Aufmacher.select().where(Aufmacher.unique_id == unique_id)
    if not len(possible_duplicate):
        get_article_data(unique_id)
    db.close()
def stats():
    db.connect()
    aufmacher_count = Aufmacher.select().count()
    author_count = Author.select().count()
    image_count = Image.select().count()

    print("Stats:")
    print("{:>5} Aufmacher".format(aufmacher_count))
    print("{:>5} authors".format(author_count))
    print("{:>5} images".format(image_count))

    print("\nLatest:")
    latest_aufmacher = Aufmacher.select().order_by(Aufmacher.created_at.desc())
    latest_aufmacher_string = """
since {created_at}
{supertitle}: {title}
{subtitle}
by {author_name}
""".format(**model_to_dict(latest_aufmacher[0]),
           author_name=latest_aufmacher[0].author.name)
    print(latest_aufmacher_string.strip())
    db.close()
bot = telebot.TeleBot(token=config.token)

'''
# using proxy in Russia
apihelper.proxy = {
    # 'http': 'http://46.101.149.132:3128',
    # 'https': 'https://46.101.149.132:3128',
    # 'http': 'http://79.138.99.254:8080',
    # 'https': 'https://79.138.99.254:8080',
    'http': 'http://5.148.128.44:80',
    'https': 'https://5.148.128.44:80',
    # 'http': 'http://167.99.242.198:8080',
    # 'https': 'https://167.99.242.198:8080',
}
'''

# create tables in db
db.connect()
db.create_tables([User, Player, Challenge])

# create GOD if not exists
try:
    god = User.get(User.tg_id == config.creatorID)
except DoesNotExist:
    god = User.create(tg_id=config.creatorID,
                      username=config.creatorUsername,
                      name='Yury',
                      role=Role.GOD)


@MWT(timeout=5 * 60)
def get_privilege_ids(role):
    logger.info("Update list of %s", role)
    return [user.tg_id for user in User.select().where(User.role >= role)]
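# MWT is used above but not defined here. It is commonly the "Memoize With
# Timeout" recipe: cache a function's result and recompute only once the
# cached entry is older than `timeout` seconds, which keeps the privilege
# query off the hot path of every incoming message. A minimal sketch (the
# actual decorator in this repo may differ):
import time
import functools

def MWT(timeout):
    def decorator(func):
        cache = {}

        @functools.wraps(func)
        def wrapper(*args):
            now = time.time()
            if args in cache:
                value, stored_at = cache[args]
                if now - stored_at < timeout:
                    return value  # cache still fresh, skip the DB query
            value = func(*args)
            cache[args] = (value, now)
            return value

        return wrapper
    return decorator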
def make_stats():
    start_date = arrow.get("2019-10-14", "YYYY-MM-DD").datetime
    end_date = arrow.get("2020-10-14", "YYYY-MM-DD").datetime
    number_of_days = (end_date - start_date).days

    db.connect()
    aufmacher = (Aufmacher.select().where(
        (Aufmacher.created_at >= start_date)
        & (Aufmacher.created_at <= end_date)).order_by(Aufmacher.created_at))

    csv_data = []
    print("{} Aufmacher an {} Tagen".format(len(aufmacher), number_of_days))
    print(len(aufmacher) / number_of_days, " Aufmacher pro Tag")

    for index, auf in enumerate(aufmacher):
        path_to_file = auf.unique_id.replace("http://xml.zeit.de/", "")
        xml_file_path = (Path("xml") / path_to_file).with_suffix(".xml")
        next_aufmacher = aufmacher[index + 1] if index < len(aufmacher) - 1 else None
        if xml_file_path.is_file():
            with open(xml_file_path, "r") as xml_file:
                parsed_article = untangle.parse(xml_file)
                print(xml_file_path)
                try:
                    article_data = get_article_data(auf, parsed_article.article,
                                                    next_aufmacher)
                    if article_data:
                        csv_data.append(article_data)
                except AttributeError:
                    # article XML is missing expected elements
                    continue

    with open("stats/aufmacher.csv", "w") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=csv_data[0].keys())
        writer.writeheader()
        writer.writerows(csv_data)

    with open("stats/tags.csv", "w") as csv_file:
        writer = csv.DictWriter(csv_file,
                                fieldnames=list(all_tags.values())[0].keys())
        writer.writeheader()
        writer.writerows(sorted(list(all_tags.values()),
                                key=itemgetter("count"), reverse=True))

    with open("stats/tagcloud.txt", "w") as out_file:
        out_file.write("\n".join(dumb_tag_list))

    headlines_with_shitlist_words = []
    teasers_with_shitlist_words = []
    with open("shitlist.txt", "r") as shitlist_file:
        shitlist = [line.rstrip("\n").lower() for line in shitlist_file]
    print(shitlist)

    for auf in csv_data:
        if any(word in auf["teaserTitle"].lower() for word in shitlist):
            headlines_with_shitlist_words.append({
                "teaserTitle": auf["teaserTitle"],
                "url": auf["url"],
            })

    # teaser_texts is expected to be a module-level list populated elsewhere
    # (e.g. by get_article_data)
    for auf in teaser_texts:
        if any(word in auf["teaserText"].lower() for word in shitlist):
            teasers_with_shitlist_words.append({
                "teaserText": auf["teaserText"],
                "url": auf["url"],
                "words": ", ".join(word for word in shitlist
                                   if word in auf["teaserText"].lower()),
            })

    if len(headlines_with_shitlist_words):
        with open("stats/shitlisted_headlines.csv", "w") as out_file:
            writer = csv.DictWriter(
                out_file, fieldnames=headlines_with_shitlist_words[0].keys())
            writer.writeheader()
            writer.writerows(headlines_with_shitlist_words)

    if len(teasers_with_shitlist_words):
        with open("stats/shitlisted_teasers.csv", "w") as out_file:
            writer = csv.DictWriter(
                out_file, fieldnames=teasers_with_shitlist_words[0].keys())
            writer.writeheader()
            writer.writerows(teasers_with_shitlist_words)
def go():
    select_from_datetime = datetime.datetime.now() - datetime.timedelta(days=1)

    db.connect()
    new_aufmachers = (Aufmacher.select()
                      .where(Aufmacher.first_released > select_from_datetime))

    date_string = arrow.now().format('D. MMMM YYYY', locale='de')
    subject = "ZON zum {}".format(date_string)

    mail_html = None
    mail_text = None

    with open("mail_template.jinja2") as mjml_template_file:
        mail_mjml_template = Template(mjml_template_file.read())
        mail_mjml = mail_mjml_template.render({
            "aufmacher": new_aufmachers,
            "subject": subject,
            "date_string": date_string,
        })

    with open("text_template.jinja2") as text_template_file:
        mail_text_template = Template(text_template_file.read())
        mail_text = mail_text_template.render({
            "aufmacher": new_aufmachers,
            "subject": subject,
            "date_string": date_string,
        })

    if mail_mjml:
        # render the MJML markup to HTML via the local mjml CLI
        mjml_filename = os.path.join(current_dir, "tmp/mail.mjml")
        mjml_exec_path = os.path.join(current_dir, "node_modules/.bin/mjml")
        try:
            os.remove(mjml_filename)
        except OSError:
            pass
        with open(mjml_filename, "w") as mjml_output_file:
            mjml_output_file.write(mail_mjml)
        try:
            mail_html = subprocess.check_output([mjml_exec_path, mjml_filename])
            mail_html = mail_html.decode("utf-8")
        except subprocess.CalledProcessError:
            mail_html = None
        try:
            os.remove(mjml_filename)
        except OSError:
            pass

    if mail_html and mail_text and send:
        sg = sendgrid.SendGridAPIClient(sendgrid_key)
        data = {
            "personalizations": [{
                "to": [{"email": "*****@*****.**"}],
                "subject": subject,
            }],
            "from": {"email": "*****@*****.**"},
            "content": [
                {"type": "text/plain", "value": mail_text},
                {"type": "text/html", "value": mail_html},
            ],
        }
        response = sg.client.mail.send.post(request_body=data)
        print(response.status_code)
        print(response.body)
        print(response.headers)
def populate_db(self, path):
    # connect first so the schema check below has a live connection
    db.connect()
    if not Track.table_exists():
        db.create_tables([Track])
    self.recursive_walk(path)
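# recursive_walk() is not shown. A plausible sketch, assuming it is a method
# on the same class that walks the directory tree and inserts one Track row
# per audio file (the `path` field and the extension list are assumptions):
import os

def recursive_walk(self, path):
    for root, _dirs, files in os.walk(path):
        for name in files:
            if name.lower().endswith((".mp3", ".flac", ".ogg")):
                Track.create(path=os.path.join(root, name))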