def parse_products():
    """Merge the latest product listings into a single ``products.json`` dump.

    Products of the categories listed in ``categories_with_specs`` are taken
    from ``{SPECS_DIR}/<latest date>/<category>-list.json``. Every category
    file of the latest products snapshot is then appended, skipping any
    product whose ``source_id`` was already taken from a specs list so that
    such a product appears in the dump only once.

    The merged list is written to ``{DB_DUMPS_DIR}/products.json`` as UTF-8
    JSON with non-ASCII characters left unescaped.
    """
    date_latest = utils.parse_latest_date(PRODUCTS_DIR)
    products_latest = f"{PRODUCTS_DIR}/{date_latest}"
    # Ignore hidden (dot-prefixed) directory entries.
    categories = [
        c for c in os.listdir(products_latest) if not c.startswith(".")
    ]
    categories_with_specs = (
        "desktops",
        "notebooks",
    )

    products = []
    used_products = set()
    for category in categories_with_specs:
        category_products = load_json(
            f"{SPECS_DIR}/{date_latest}/{category}-list.json")
        used_products.update(p["source_id"] for p in category_products)
        products.extend(category_products)

    for category in categories:
        category_products = load_json(f"{products_latest}/{category}")
        # Products already loaded from the specs lists must not be added a
        # second time from the plain category files.
        products.extend(
            p for p in category_products
            if p["source_id"] not in used_products
        )

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 instead of relying on the platform default.
    with open(f"{DB_DUMPS_DIR}/products.json", "w", encoding="utf-8") as f:
        json.dump(products, f, ensure_ascii=False)
def parse_reviews():
    """Flatten the latest scraped reviews into ``{DB_DUMPS_DIR}/reviews.json``.

    Walks ``{REVIEWS_DIR}/<latest date>/<category>/<product>.json``, skipping
    dot-prefixed entries at both directory levels. Every item under a file's
    ``"data"`` key becomes one flat dict; the product id is the file name up
    to its first ``".json"``. The collected list is serialized with orjson.
    """
    latest_date = utils.parse_latest_date(REVIEWS_DIR)
    latest_dir = f"{REVIEWS_DIR}/{latest_date}"

    reviews = []
    for category in os.listdir(latest_dir):
        if category.startswith("."):
            continue
        for file_name in os.listdir(f"{latest_dir}/{category}"):
            if file_name.startswith("."):
                continue
            # Raises ValueError for a file name without ".json", as before.
            product_id = file_name[: file_name.index(".json")]
            payload = load_json(f"{latest_dir}/{category}/{file_name}")
            for review in payload["data"]:
                entry = {
                    "product_id": product_id,
                    "source_id": review["id"],
                    "date": datetime.strptime(review["date"], "%d.%m.%Y"),
                    "rating": review["rating"],
                    "comment_plus": review["comment"]["plus"],
                    "comment_minus": review["comment"]["minus"],
                    "comment_text": review["comment"]["text"],
                }
                approved, rated = _parse_approved_rated(
                    review["feedback"]["reviewsRating"])
                entry["review_approved"] = approved
                entry["review_rated"] = rated
                reviews.append(entry)

    # orjson produces bytes, hence the binary mode.
    with open(f"{DB_DUMPS_DIR}/reviews.json", "wb") as f:
        f.write(orjson.dumps(reviews))
def download_tweet_sets():
    """Show the tweets-set download form and create a set on valid submit.

    A user with an active test session is redirected to ``test_tweets``.
    On a valid submission the set is created through ``create_tweets_set``,
    its description is appended to the JSON list stored at
    ``app.config['TWEETS_SETS_FILE']`` and the page redirects to itself.
    Otherwise the form is rendered with the known sets, most recently
    appended first.
    """
    if "session_id" in session:
        return redirect(url_for("test_tweets"))

    sets_file = app.config['TWEETS_SETS_FILE']
    data = load_json(sets_file) if os.path.exists(sets_file) else []

    form = TweetsSetDownloadForm()
    if not form.validate_on_submit():
        return render_template("tweets_set_download.html",
                               form=form,
                               tweets_sets=data[::-1])

    set_name = form.set_name.data
    search_query = form.search_query.data
    # Set name with spaces replaced, suffixed by the creation time in seconds.
    created_at = int(datetime.now().timestamp())
    new_set_id = f"{set_name.replace(' ', '_')}_{created_at}"
    tweets_number = int(form.tweets_number.data)
    bufale_pages = int(form.bufale_pages.data)

    create_tweets_set(new_set_id, search_query, tweets_number, bufale_pages)
    data.append({
        "id": new_set_id,
        "set_name": set_name,
        "search_query": search_query,
        "tweets_number": tweets_number,
        "bufale_pages": bufale_pages,
    })
    write_json(sets_file, data)
    return redirect(url_for("download_tweet_sets"))
def start_test():
    """Show the user form and open a new test session on valid submit.

    A user with an active session is redirected to ``test_tweets``; if no
    tweets-sets file exists yet the user is sent to ``download_tweet_sets``.
    On a valid submission the new session is appended to the JSON list at
    ``app.config['SESSIONS_FILE']``, the user's answers are stored in the
    Flask session and the user is redirected to ``test_tweets``.
    """
    if "session_id" in session:
        return redirect(url_for("test_tweets"))

    sessions_file = app.config['SESSIONS_FILE']
    data = load_json(sessions_file) if os.path.exists(sessions_file) else []

    sets_file = app.config['TWEETS_SETS_FILE']
    if not os.path.exists(sets_file):
        return redirect(url_for("download_tweet_sets"))

    # Select-field choices: (value, label) pairs in reverse file order.
    choices = [
        (ts["id"], f"{ts['id']} ({ts['search_query']})")
        for ts in load_json(sets_file)[::-1]
    ]

    form = UserForm()
    form.tweets_set_to_use.choices = choices
    if not form.validate_on_submit():
        return render_template("start_test.html", form=form)

    username = form.username.data
    # User name with spaces replaced, suffixed by the creation time in seconds.
    created_at = int(datetime.now().timestamp())
    new_session_id = f"{username.replace(' ', '_')}_{created_at}"

    data.append({
        "id": new_session_id,
        "username": username,
        "age": form.age.data,
        "gender": form.gender.data,
    })
    write_json(sessions_file, data)

    session["session_id"] = new_session_id
    session["username"] = username
    session["age"] = form.age.data
    session["gender"] = form.gender.data
    session["tweets_set_id"] = form.tweets_set_to_use.data
    session["start_timestamp"] = int(datetime.now().timestamp())
    return redirect(url_for("test_tweets"))
def parse_categories():
    """Collect the distinct product categories into ``categories.json``.

    Categories are first gathered from the ``<category>-list.json`` specs
    files of ``categories_with_specs``. The latest products snapshot is then
    scanned; products whose ``category_id`` was already seen in a specs list
    are skipped, and ``"%20"`` in the remaining category names is replaced
    by a space. The unique ``(name, id)`` pairs are sorted and written to
    ``{DB_DUMPS_DIR}/categories.json`` as ``{"name", "source_id"}`` objects.
    """
    latest_date = utils.parse_latest_date(f"{PRODUCTS_DIR}")
    latest_dir = f"{PRODUCTS_DIR}/{latest_date}"
    # Ignore hidden (dot-prefixed) directory entries.
    category_files = [
        name for name in os.listdir(latest_dir) if not name.startswith(".")
    ]
    categories_with_specs = ("desktops", "notebooks")

    pairs = set()  # unique (category name, category id) tuples
    spec_category_ids = set()
    for spec_category in categories_with_specs:
        spec_products = load_json(
            f"{SPECS_DIR}/{latest_date}/{spec_category}-list.json")
        for product in spec_products:
            pairs.add((product["category_name"], product["category_id"]))
            spec_category_ids.add(product["category_id"])

    for file_name in category_files:
        for product in load_json(f"{latest_dir}/{file_name}"):
            if product["category_id"] not in spec_category_ids:
                readable_name = product["category_name"].replace("%20", " ")
                pairs.add((readable_name, product["category_id"]))

    # Plain tuple ordering: by name first, then by id.
    payload = [
        {"name": name, "source_id": source_id}
        for name, source_id in sorted(pairs)
    ]
    with open(f"{DB_DUMPS_DIR}/categories.json", "w") as f:
        json.dump(payload, f)
def start_requests(self):
    """Yield a request for each product whose details are not dumped yet.

    Products are read from ``self.products_json``. A product is skipped when
    its ``source_id`` is already in the collection returned by
    ``db_utils.get_dumped_product_details()``; ids are added to that
    collection as requests are issued, so a repeated ``source_id`` in the
    file is requested only once.
    """
    seen_ids = db_utils.get_dumped_product_details()
    for item in load_json(self.products_json):
        if item["source_id"] not in seen_ids:
            yield scrapy.Request(
                url=item["url"],
                cb_kwargs={"product": item},
                callback=self.parse_product,
            )
            # Runs when the generator is resumed after the yield.
            seen_ids.add(item["source_id"])
def parse_products():
    """Concatenate the latest product listings into ``products.json``.

    Every non-hidden file in ``{PRODUCTS_DIR}/<latest date>`` is loaded as a
    JSON list and the lists are concatenated in directory-listing order. The
    result is written to ``{DB_DUMPS_DIR}/products.json`` as UTF-8 JSON with
    non-ASCII characters left unescaped.
    """
    date_latest = utils.parse_latest_date(PRODUCTS_DIR)
    products_latest = f"{PRODUCTS_DIR}/{date_latest}"
    # Ignore hidden (dot-prefixed) directory entries.
    categories = [
        c for c in os.listdir(products_latest) if not c.startswith(".")
    ]

    products = []
    for category in categories:
        products.extend(load_json(f"{products_latest}/{category}"))

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 instead of relying on the platform default.
    with open(f"{DB_DUMPS_DIR}/products.json", "w", encoding="utf-8") as f:
        json.dump(products, f, ensure_ascii=False)
def parse_specs():
    """Dump the processed specs of the latest snapshot into ``specs.json``.

    For each category in ``categories_with_specs`` the file
    ``{SPECS_DIR}/<latest date>/<category>-specs.json`` is loaded and every
    product in it is passed through ``processed_specs``. The results are
    written to ``{DB_DUMPS_DIR}/specs.json`` as UTF-8 JSON with non-ASCII
    characters left unescaped.
    """
    date_latest = utils.parse_latest_date(SPECS_DIR)
    categories_with_specs = (
        "desktops",
        "notebooks",
    )

    specs = []
    for category in categories_with_specs:
        products = load_json(
            f"{SPECS_DIR}/{date_latest}/{category}-specs.json")
        specs.extend(processed_specs(product) for product in products)

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 instead of relying on the platform default.
    with open(f"{DB_DUMPS_DIR}/specs.json", "w", encoding="utf-8") as f:
        json.dump(specs, f, ensure_ascii=False)
def test_tweets():
    """Show the test form for the session's tweets set and save the answers.

    Without an active session the user is redirected to ``start_test``.
    One required radio field (True / Maybe / Fake) is attached to a
    per-request form subclass for every tweet of the set. On a valid
    submission the session data and the chosen answers are written to
    ``<SESSIONS_DIR>/<session_id>.json`` and the user is redirected to
    ``results``; otherwise the form is rendered.
    """
    if "session_id" not in session:
        return redirect(url_for("start_test"))

    set_path = os.path.join(app.config["TWEETS_SETS_DIR"],
                            session["tweets_set_id"] + ".json")
    tweets_set = load_json(set_path)

    # Fresh subclass per request so the fields added below do not end up on
    # TestForm itself.
    class DynamicTestForm(TestForm):
        pass

    for tweet in tweets_set:
        answer_field = RadioField(
            tweet["progressive"],
            choices=[(label, label) for label in ("True", "Maybe", "Fake")],
            id=tweet["id"],
            validators=[InputRequired()],
        )
        setattr(DynamicTestForm, tweet["progressive"], answer_field)

    form = DynamicTestForm()
    if not form.validate_on_submit():
        # Hand the tweets and their count to the template.
        return render_template("test_tweets.html",
                               set_length=len(tweets_set),
                               tweets=tweets_set,
                               form=form)

    # Answers keyed by tweet id; each field is looked up by its name.
    user_choices = {
        tweet["id"]: form[tweet["progressive"]].data for tweet in tweets_set
    }
    user_session = {
        "id": session["session_id"],
        "username": session["username"],
        "age": session["age"],
        "gender": session["gender"],
        "tweets_set_id": session["tweets_set_id"],
        "start_timestamp": session["start_timestamp"],
        "finish_timestamp": int(datetime.now().timestamp()),
        "user_choices": user_choices,
    }
    result_path = os.path.join(app.config['SESSIONS_DIR'],
                               session["session_id"] + ".json")
    write_json(result_path, user_session)
    return redirect(url_for("results"))
def _insert_from_source(source_file, query):
    """Bulk-insert the rows of one dump file from the latest dump directory.

    Args:
        source_file: File name inside ``{DB_DUMPS_DIR}/<latest date>/``.
        query: Passed unchanged as the first argument of
            ``bulk_insert_dicts``.
    """
    db = LocalSession()
    latest_date = utils.parse_latest_date(DB_DUMPS_DIR)
    dump_path = f"{DB_DUMPS_DIR}/{latest_date}/{source_file}"
    rows: List[dict] = load_json(dump_path)
    db.bulk_insert_dicts(query, rows)
""" A file full of constants. September 2, 2016 """ from app import utils settings = utils.load_json("data/settings.json") # ---------------------------- CONSTANTS ---------------------------- assignments = utils.load_json(settings['assignments_path']) students = utils.load_json(settings['students_path']) info = utils.load_json(settings['info_path'])
from flask import request, render_template, redirect, url_for
from app import app, utils, recommender, plotting
import os

# Data files are loaded once, when this module is imported.
tsne_weights = utils.load_npy('data/tsne_weights.npy')
embeddings = utils.load_npy('data/embeddings.npy')
# The JSON file holds a two-element list, unpacked here.
# NOTE(review): presumably a subreddit lookup and its inverse - confirm.
[d, inv_d] = utils.load_json('data/subreddit_dicts.json')


@app.route('/', methods=['GET', 'POST'])
def base():
    """Render the landing page, or redirect a submitted form.

    On POST the ``input_name`` and ``num_recommendations`` form fields are
    forwarded to ``recs_for_subreddit`` through a redirect. Otherwise
    ``base.html`` is rendered with the t-SNE plot.
    """
    # The plot is built on every request, including POSTs that redirect.
    dataset_tsne_3dplot = plotting.dataset_tsne_3dplot_subset(
        tsne_weights, d, 10000)
    if request.method == 'POST':
        input_subreddit = request.form['input_name']
        num_recommendations = int(request.form['num_recommendations'])
        return redirect(
            url_for('recs_for_subreddit',
                    subreddit=input_subreddit,
                    num_recommendations=num_recommendations))
    return render_template('base.html', plot=dataset_tsne_3dplot)


@app.route('/<subreddit>', methods=['GET', 'POST'])
def recs_for_subreddit(subreddit=None):
    # The count comes from the query string and defaults to 10 when absent.
    num_recommendations = request.args.get('num_recommendations')
    num_recommendations = int(
        num_recommendations) if num_recommendations is not None else 10
    # Only 5, 10, 15 or 20 are accepted; any other value falls back to 10.
    num_recommendations = 10 if num_recommendations not in [
        5, 10, 15, 20
    ] else num_recommendations