def compute_document_frequencies():
    """Print the document frequency of every token, least frequent first.

    Each paper's ``paper_text`` is split on whitespace and lower-cased; a
    token is counted once per paper in which it occurs, no matter how many
    times it appears in that paper. The resulting ``(token, count)`` pairs
    are sorted by ascending count and handed to
    ``Printer.print_token_frequency``.

    The database connection is opened at the start and is always closed,
    even if a query or the tokenization raises. If the table holds no rows,
    no paper is processed and an empty list is printed.
    """
    # Function-scope imports keep this block self-contained.
    from collections import Counter
    from operator import itemgetter

    batch_size = 10
    document_frequencies = Counter()

    models.connect_to_db(conf.DATABASE_FILENAME)
    try:
        # NOTE(review): this function uses a bare `papers` name while
        # compute_top_n_tokens_for_collection uses `models.Papers`;
        # presumably `papers` is a module-level alias -- confirm.
        newest_paper = papers.select().order_by(papers.id.desc()).limit(1)
        # An empty result would make `[0]` raise; fall back to 0 so the
        # batching range below is simply empty.
        last_id = newest_paper[0].id if len(newest_paper) > 0 else 0

        for batch_start in range(1, last_id + 1, batch_size):
            # `ids_to_query` is defined elsewhere; presumably it yields the
            # ids of the batch starting at `batch_start` -- verify.
            for paper_id in ids_to_query(batch_start, batch_size, last_id):
                paper_query = papers.select().where(papers.id == paper_id)
                if DEBUG:
                    print(paper_query)
                    print(len(paper_query))
                if len(paper_query) == 0:
                    # Ids without a matching row contribute nothing.
                    continue

                paper_content = paper_query[0].paper_text
                # A set guarantees each token counts once for this paper.
                unique_tokens = {
                    token.lower() for token in paper_content.strip().split()
                }
                document_frequencies.update(unique_tokens)
    finally:
        models.close_connection()

    sorted_tokens = sorted(document_frequencies.items(), key=itemgetter(1))
    printer = Printer()
    printer.print_token_frequency(sorted_tokens)
def compute_top_n_tokens_for_collection(top_n):
    """Print the ``top_n`` most frequent tokens across all papers.

    Every paper's ``paper_text`` is tokenized with ``Cleaner.tokenize`` and
    each occurrence of a token is counted (collection frequency, not
    document frequency). The ``(token, count)`` pairs are sorted by
    descending count, truncated with the slice ``[:top_n]`` and handed to
    ``Printer.print_token_frequency``.

    The database connection is opened at the start and is always closed,
    even if a query or the tokenizer raises. If the id lookup returns no
    rows, no paper is processed and an empty list is printed.

    Args:
        top_n: Upper bound of the slice applied to the sorted token list.
    """
    # Function-scope imports keep this block self-contained.
    from collections import Counter
    from operator import itemgetter

    batch_size = 10
    cleaner = Cleaner()
    token_frequencies = Counter()

    models.connect_to_db(conf.DATABASE_FILENAME)
    try:
        # NOTE(review): the highest id is read from `models.Papers_NR` but
        # the rows below are fetched from `models.Papers` -- confirm that
        # both tables share the same id space.
        newest_paper = models.Papers_NR.select().order_by(
            models.Papers_NR.id.desc()).limit(1)
        # An empty result would make `[0]` raise; fall back to 0 so the
        # batching range below is simply empty.
        last_id = newest_paper[0].id if len(newest_paper) > 0 else 0

        for batch_start in range(1, last_id + 1, batch_size):
            # `ids_to_query` is defined elsewhere; presumably it yields the
            # ids of the batch starting at `batch_start` -- verify.
            for paper_id in ids_to_query(batch_start, batch_size, last_id):
                paper_query = models.Papers.select().where(
                    models.Papers.id == paper_id)
                if DEBUG:
                    print(paper_query)
                    print(len(paper_query))
                if len(paper_query) == 0:
                    # Ids without a matching row contribute nothing.
                    continue

                paper_content = paper_query[0].paper_text
                # Explicit loop (rather than Counter.update) so the tokens
                # are always iterated one by one, whatever container type
                # the tokenizer returns.
                for token in cleaner.tokenize(paper_content):
                    token_frequencies[token] += 1
    finally:
        models.close_connection()

    sorted_tokens = sorted(
        token_frequencies.items(), key=itemgetter(1), reverse=True)
    top_n_tokens = sorted_tokens[:top_n]
    printer = Printer()
    printer.print_token_frequency(top_n_tokens)