def db_load_models(articles):
    """Load the stored term models of ``articles`` and index their terms.

    Fetches every ``articleswithterms`` row belonging to the given articles
    in a single query, groups the rows by ``articleid`` and, for each article
    whose ``has_been_counted`` flag is truthy, builds an ``ArticleModel``
    from its rows and registers the model's terms in an ``InvertedIndex``.
    Articles that have not been counted are skipped.

    Args:
        articles: sequence of article objects exposing ``id`` and
            ``has_been_counted``.

    Returns:
        A ``(models, inv_index)`` tuple: ``models`` maps ``article.id`` to
        its ``ArticleModel``; ``inv_index`` is the populated
        ``InvertedIndex``. Both are empty when ``articles`` is empty.

    Raises:
        KeyError: if an article is flagged as counted but the query returned
            no rows for it.
    """
    models = {}
    inv_index = InvertedIndex()
    num_articles = len(articles)
    if num_articles == 0:
        # An empty id list would render as "IN ()", which is not valid SQL,
        # so there is nothing to query.
        return models, inv_index

    list_of_ids = ",".join([str(article.id) for article in articles])
    query = ("SELECT articleid,term,tf,count FROM articleswithterms "
             "WHERE articleid IN (%s)" % (list_of_ids))

    db = database.connect_to_database()
    try:
        cur = db.cursor(cursorclass=MySQLdb.cursors.SSDictCursor)
        try:
            cur.execute(query)
            rows = cur.fetchall()

            # Group the flat result set into one list of rows per article.
            rows_by_article = {}
            for row in rows:
                rows_by_article.setdefault(row['articleid'], []).append(row)

            for index, article in enumerate(articles):
                if not article.has_been_counted:
                    continue
                # Single parenthesised argument: prints the same text whether
                # ``print`` is a statement or a function.
                print("Loading article " + str(index + 1) + "/" + str(num_articles))
                new_model = ArticleModel(article)
                new_model.from_db_values(rows_by_article[article.id])
                all_terms = new_model.terms.all_terms()
                inv_index.add_term_occurences(all_terms, article.id)
                models[article.id] = new_model
        finally:
            # Release the cursor even when loading a model fails.
            cur.close()
    finally:
        db.close()
    return models, inv_index
def count_terms_and_store(articles, store=True, title_weight=19,
                          print_steps=False, leading_weight=1,
                          stoplist_file="../stop_words"):
    """Count the terms of every article and optionally persist the models.

    For each article an ``ArticleModel`` is created, its terms are counted
    via ``count_terms()`` and every returned term is registered in an
    ``InvertedIndex`` under the article's id. When ``store`` is truthy each
    model is also saved through ``model.db_save(db)`` on a connection that is
    opened for the duration of the call and closed afterwards.

    Args:
        articles: sequence of article objects exposing ``id``.
        store: when truthy, open a database connection and save each model.
        title_weight: forwarded to ``ArticleModel`` (presumably the weight
            given to title terms -- confirm in ``ArticleModel``).
        print_steps: when truthy, print a progress line per article.
        leading_weight: forwarded to ``ArticleModel`` (presumably the weight
            given to leading-text terms -- confirm in ``ArticleModel``).
        stoplist_file: path forwarded to ``ArticleModel`` as
            ``stoplist_file``.

    Returns:
        A ``(models, inv_index)`` tuple: ``models`` maps ``article.id`` to
        its ``ArticleModel``; ``inv_index`` is the populated
        ``InvertedIndex``.
    """
    models = {}
    inv_index = InvertedIndex()
    num_articles = len(articles)
    # Only connect when the models are actually going to be stored.
    db = database.connect_to_database() if store else None
    try:
        for index, art in enumerate(articles):
            if print_steps:
                # Single parenthesised argument: prints the same text whether
                # ``print`` is a statement or a function.
                print("Counting terms of article %d/%d" % (index + 1, num_articles))
            model = ArticleModel(art, title_weight, leading_weight,
                                 stoplist_file=stoplist_file)
            for term in model.count_terms():
                inv_index.add_term_ocurrence(term, model.article.id)
            if store:
                model.db_save(db)
            models[art.id] = model
    finally:
        # Close the connection even if counting or saving fails part-way.
        if db is not None:
            db.close()
    # NOTE: the former update of terms_global.totaltermcounts was already
    # disabled (it lived in an unused string literal) and has been removed.
    return models, inv_index