def classify_news() -> tp.Any:
    s = session()
    unclassified: tp.List[tp.Tuple[int, str]] = [
        (i.id, naive_bayes.stemmer.clear(i.title))
        for i in s.query(News).filter(News.label == None).all()
    ]
    X: tp.List[str] = [i[1] for i in unclassified]
    model_path = f"{os.path.dirname(os.path.realpath(__file__))}/../model/model.pickle"
    if not pathlib.Path(model_path).is_file():
        raise ValueError(
            "Classifier is untrained! Please mark enough news to adequately "
            "train the model and run bayes.py to save it."
        )
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)
    labels = model.predict(X)
    for i, e in enumerate(unclassified):
        # Only one such news extract exists for a given id.
        extract = s.query(News).filter(News.id == e[0]).first()
        extract.label = labels[i]
    s.commit()
    rows = s.query(News).filter(News.label != None).order_by(News.label).all()
    return template("classified_template", rows=rows)
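# classify_news() normalises titles with naive_bayes.stemmer.clear before
# prediction; that helper lives in another module and is not shown in this
# listing. A minimal sketch of what it is assumed to do (strip punctuation,
# collapse whitespace) -- the real helper may also stem words:

import re


def clear_sketch(text: str) -> str:
    """Assumed behaviour of naive_bayes.stemmer.clear (a guess, not the
    project's actual implementation)."""
    text = re.sub(r"[^\w\s]", "", text)  # drop punctuation
    return re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs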
def add_label() -> tp.Any:
    s = session()
    # Only one such row exists for a given id.
    entry = s.query(News).filter(News.id == request.query["id"]).first()
    entry.label = request.query["label"]
    s.commit()
    redirect("/news")
def save_data(pages: int = 1) -> None:
    news = scraputils.get_news("https://news.ycombinator.com/newest", pages)
    s = db.session()
    for i in news:
        obj = db.News(
            title=i["title"],
            author=i["author"],
            url=i["url"],
            comments=i["comments"],
            points=i["points"],
        )
        # obj.id is only assigned by the database on commit, so it is not
        # printed here.
        print(f"Preparing news extract {i}, title: {obj.title}")
        s.add(obj)
        s.commit()
        print(f"Committed extract {i} at id {obj.id}")
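# save_data() relies on scraputils.get_news returning one dict per story with
# exactly the keys consumed above. The assumed shape, with made-up values:

EXAMPLE_EXTRACT = {
    "title": "Show HN: An example story",
    "author": "example_user",
    "url": "https://example.com/",
    "comments": 12,
    "points": 34,
}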
def update_news() -> tp.Any:
    new_arrivals = get_news("https://news.ycombinator.com/newest")
    s = session()
    marker = s.query(News).first()  # Isaac Clarke's nightmare
    batch_size: int = 30  # there are 30 news entries per page; just trust me on that
    for i, e in enumerate(new_arrivals):
        if e["title"] == marker.title and e["author"] == marker.author:
            batch_size = i
            break  # everything from the marker onward is already stored
    new_arrivals = new_arrivals[:batch_size]
    for entry in new_arrivals:
        obj = News(
            title=entry["title"],
            author=entry["author"],
            url=entry["url"],
            comments=entry["comments"],
            points=entry["points"],
        )
        s.add(obj)
    s.commit()
    redirect("/news")
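# The cut-off logic in update_news() can be exercised on its own: everything
# from the first entry matching the marker onward is assumed to be in the
# database already and is sliced away. A self-contained sketch with made-up
# data:


def dedup_sketch() -> None:
    arrivals = [
        {"title": "C", "author": "w"},
        {"title": "B", "author": "v"},
        {"title": "A", "author": "u"},  # matches the marker: already stored
        {"title": "Z", "author": "t"},
    ]
    marker = {"title": "A", "author": "u"}
    batch_size = 30  # default: keep the whole page if no match is found
    for i, e in enumerate(arrivals):
        if e["title"] == marker["title"] and e["author"] == marker["author"]:
            batch_size = i
            break
    arrivals = arrivals[:batch_size]
    assert [e["title"] for e in arrivals] == ["C", "B"]  # only the new stories survive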
        ]))
        # Per-class recall, macro-averaged into the final score below.
        class_accuracies[c] = true_positives / (true_positives + false_negatives)
    score: float = sum(class_accuracies.values()) / len(set(y_test))
    return score


if __name__ == "__main__":
    model_path = f"{os.path.dirname(os.path.realpath(__file__))}/../model/model.pickle"
    if not pathlib.Path(model_path).is_file():
        print("No model detected. Creating new model...")
    model = NaiveBayesClassifier(alpha=0.1)
    print("Extracting marked news from database...")
    s = session()
    classified = [
        (i.title, i.label)
        for i in s.query(News).filter(News.label != None).all()
    ]
    X_train, y_train = [], []
    for title, label in classified:
        X_train.append(title)
        y_train.append(label)
    X_train = [naive_bayes.stemmer.clear(x).lower() for x in X_train]
    print(f"Extracted {len(X_train)} marked news")
    print("Training model...")
    model.fit(X_train, y_train)
    print("Model retrained. Saving...")
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)
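# NaiveBayesClassifier is defined elsewhere in bayes.py and is not shown in
# this listing. A toy multinomial Naive Bayes with the same fit/predict
# surface and alpha (Laplace-style) smoothing -- a stand-in sketch, not the
# project's actual implementation:

import math
import typing as tp
from collections import Counter


class NaiveBayesSketch:
    def __init__(self, alpha: float = 0.1) -> None:
        self.alpha = alpha

    def fit(self, X: tp.List[str], y: tp.List[str]) -> None:
        self.labels = sorted(set(y))
        # Log-priors from label frequencies.
        self.priors = {c: math.log(y.count(c) / len(y)) for c in self.labels}
        counts = {c: Counter() for c in self.labels}
        vocab = set()
        for text, label in zip(X, y):
            for word in text.split():
                counts[label][word] += 1
                vocab.add(word)
        # Smoothed log-likelihoods; self.unseen[c] covers words missing from
        # the vocabulary at prediction time.
        self.loglike: tp.Dict[str, tp.Dict[str, float]] = {}
        self.unseen: tp.Dict[str, float] = {}
        for c in self.labels:
            total = sum(counts[c].values()) + self.alpha * len(vocab)
            self.loglike[c] = {
                w: math.log((counts[c][w] + self.alpha) / total) for w in vocab
            }
            self.unseen[c] = math.log(self.alpha / total)

    def predict(self, X: tp.List[str]) -> tp.List[str]:
        preds = []
        for text in X:
            scores = {
                c: self.priors[c]
                + sum(self.loglike[c].get(w, self.unseen[c]) for w in text.split())
                for c in self.labels
            }
            preds.append(max(scores, key=scores.get))
        return preds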
def news_list() -> tp.Any:
    s = session()
    rows = s.query(News).filter(News.label == None).all()
    return template("news_template", rows=rows)
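# The handlers in this file use Bottle's request/redirect/template helpers,
# so they are presumably registered as routes; the decorators are not shown
# in this listing. A sketch of the assumed wiring -- every path, host, and
# port below is a guess except "/news", which both redirect() calls target:


def run_app_sketch() -> None:
    from bottle import route, run

    route("/news")(news_list)
    route("/add_label/")(add_label)
    route("/update")(update_news)
    route("/classify")(classify_news)
    run(host="localhost", port=8080)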