Beispiel #1
0
def classify_news() -> tp.Any:
    """Label every unlabelled news row with the saved classifier and render the result.

    Loads the pickled model from ``../model/model.pickle`` (relative to this
    file), predicts a label for each ``News`` row whose ``label`` is NULL,
    stores the predictions, and renders ``classified_template`` with all
    labelled rows ordered by label.

    Returns:
        Whatever ``template("classified_template", rows=...)`` returns.

    Raises:
        ValueError: If the pickled model file does not exist.
    """
    model_path = pathlib.Path(
        f"{os.path.dirname(os.path.realpath(__file__))}/../model/model.pickle"
    )
    if not model_path.is_file():
        raise ValueError(
            "Classifier is untrained! Please mark enough news to adequately train the model and run bayes.py to save it."
        )
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file; the model file must only ever come from a trusted source.
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    s = session()
    # `== None` is intentional: the ORM column comparison builds an SQL
    # `IS NULL` filter, whereas `is None` would be evaluated in Python.
    unclassified = s.query(News).filter(News.label == None).all()  # noqa: E711
    if unclassified:
        # Lowercase after cleaning so prediction input is normalised the same
        # way as the training input produced by the training script.
        X: tp.List[str] = [
            naive_bayes.stemmer.clear(row.title).lower() for row in unclassified
        ]
        # The rows are already attached to this session, so they can be
        # updated directly and persisted with a single commit.
        for row, label in zip(unclassified, model.predict(X)):
            row.label = label
        s.commit()

    rows = s.query(News).filter(News.label != None).order_by(News.label).all()  # noqa: E711
    return template("classified_template", rows=rows)
Beispiel #2
0
def add_label() -> tp.Any:
    """Store the label from the request query on the matching news row.

    Reads ``id`` and ``label`` from ``request.query``, assigns the label to
    the ``News`` row with that id, commits, and redirects to ``/news``.
    An id that matches no row is ignored and the redirect still happens.
    """
    s = session()
    entry = s.query(News).filter(News.id == request.query["id"]).first()
    # .first() returns None when no row matches; guard so an unknown or stale
    # id does not fail with AttributeError on the assignment below.
    if entry is not None:
        entry.label = request.query["label"]
        s.commit()
    redirect("/news")
Beispiel #3
0
def save_data(pages: int = 1) -> None:
    """Scrape the newest Hacker News entries and store each one as a ``db.News`` row.

    Args:
        pages: Passed through to ``scraputils.get_news`` as its second argument.
    """
    scraped = scraputils.get_news("https://news.ycombinator.com/newest", pages)
    s = db.session()
    columns = ("title", "author", "url", "comments", "points")
    for item in scraped:
        record = db.News(**{column: item[column] for column in columns})
        # The row has not been added to the session yet at this point.
        print(f"Preparing news extract {item}, id: {record.id}, title: {record.title}")
        s.add(record)
        # Commit per row: the message below reads the id after each commit.
        s.commit()
        print(f"Committed extract {item} at id {record.id}")
Beispiel #4
0
def update_news() -> tp.Any:
    """Fetch the newest entries and store those that precede the marker row.

    The first ``News`` row returned by the database is used as a marker:
    scraped entries are kept up to (not including) the first one whose title
    and author match it. If there is no marker, or nothing matches, at most
    the first 30 scraped entries are stored. Redirects to ``/news`` afterwards.
    """
    new_arrivals = get_news("https://news.ycombinator.com/newest")
    s = session()
    # NOTE(review): this query has no ORDER BY, so which row acts as the
    # marker depends on the database's default ordering -- confirm it is the
    # row the de-duplication is meant to stop at.
    marker = s.query(News).first()
    batch_size: int = 30  # fallback cap when no marker match is found
    if marker is not None:  # an empty table has no marker to compare against
        for i, e in enumerate(new_arrivals):
            if e["title"] == marker.title and e["author"] == marker.author:
                batch_size = i
                break  # stop at the first match; later entries are older
    for entry in new_arrivals[:batch_size]:
        s.add(
            News(
                title=entry["title"],
                author=entry["author"],
                url=entry["url"],
                comments=entry["comments"],
                points=entry["points"],
            )
        )
    # One commit persists the whole batch instead of one transaction per row.
    s.commit()
    redirect("/news")
Beispiel #5
0
                    ]))
                class_accuracies[c] = true_positives / (true_positives +
                                                        false_negatives)
        score: float = sum([i for i in class_accuracies.values()]) / len(
            list(set(y_test)))
        return score


if __name__ == "__main__":
    # Train and pickle a classifier only when no saved model exists yet.
    model_location = f"{os.path.dirname(os.path.realpath(__file__))}/../model/model.pickle"
    if not pathlib.Path(model_location).is_file():
        print("No model detected. Creating new model...")
        model = NaiveBayesClassifier(alpha=0.1)
        print("Extracting marked news from database...")
        s = session()
        # `!= None` is intentional: the ORM column comparison builds the SQL
        # filter; `is not None` would be evaluated in Python instead.
        marked_rows = s.query(News).filter(News.label != None).all()  # noqa: E711
        # Features are the cleaned, lowercased titles; targets are the labels.
        X_train = [
            naive_bayes.stemmer.clear(row.title).lower() for row in marked_rows
        ]
        y_train = [row.label for row in marked_rows]
        print(f"Extracted {len(X_train)} marked news")
        print("Training model...")
        model.fit(X_train, y_train)
        print("Model retrained. Saving...")
        with open(model_location, "wb") as model_file:
            pickle.dump(model, model_file)
Beispiel #6
0
def news_list() -> tp.Any:
    """Render ``news_template`` with every news row that has no label yet."""
    # `== None` is intentional: the ORM column comparison builds an SQL
    # `IS NULL` filter; `is None` would be evaluated in Python instead.
    unlabelled = session().query(News).filter(News.label == None).all()  # noqa: E711
    return template("news_template", rows=unlabelled)