コード例 #1
0
def compute_document_frequencies():
    """Print, for every token, the number of papers whose text contains it.

    Paper ids from 1 up to the highest id in ``papers`` are visited in
    batches of 10. Each paper's ``paper_text`` is split on whitespace and
    lowercased; a token is counted at most once per paper. The resulting
    ``(token, count)`` pairs are sorted by ascending count and handed to
    ``Printer.print_token_frequency``.

    The database connection opened here is always closed again, even if
    reading a paper fails part-way through.
    """
    models.connect_to_db(conf.DATABASE_FILENAME)
    token_frequencies = {}
    try:
        first_id = 1
        last_id_query = papers.select().order_by(papers.id.desc()).limit(1)
        # An empty table has no highest id; fall back to 0 so the id range
        # below is empty instead of raising IndexError on last_id_query[0].
        last_id = last_id_query[0].id if len(last_id_query) > 0 else 0
        increments = 10

        for batch_start in range(first_id, last_id + 1, increments):
            # ids_to_query is defined elsewhere; presumably it yields the ids
            # of the batch starting at batch_start, capped at last_id.
            papers_to_process = ids_to_query(batch_start, increments, last_id)
            for paper_id in papers_to_process:
                paper_query = papers.select().where(papers.id == paper_id)

                if DEBUG:
                    print(paper_query)
                    print(len(paper_query))

                # Ids with no matching row are skipped.
                if len(paper_query) == 0:
                    continue

                paper_content = paper_query[0].paper_text
                # A set makes each token count once per paper (document
                # frequency rather than raw term frequency).
                unique_tokens = {
                    token.lower() for token in paper_content.split()
                }
                for token in unique_tokens:
                    token_frequencies[token] = (
                        token_frequencies.get(token, 0) + 1
                    )
    finally:
        models.close_connection()

    # sorted() is stable, so tokens with equal counts keep the order in
    # which they were first seen.
    sorted_tokens = sorted(
        token_frequencies.items(), key=lambda item: item[1])
    printer = Printer()
    printer.print_token_frequency(sorted_tokens)
コード例 #2
0
def compute_top_n_tokens_for_collection(top_n):
    """Print the ``top_n`` most frequent tokens across all papers.

    Paper ids from 1 up to the highest id in ``models.Papers_NR`` are
    visited in batches of 10. Each paper's ``paper_text`` is tokenized with
    ``Cleaner.tokenize`` and every occurrence of a token is counted. The
    ``(token, count)`` pairs are sorted by descending count, truncated with
    ``[:top_n]`` and handed to ``Printer.print_token_frequency``.

    The database connection opened here is always closed again, even if
    reading or tokenizing a paper fails part-way through.

    Args:
        top_n: Number of leading entries to keep; used as a slice bound.
    """
    models.connect_to_db(conf.DATABASE_FILENAME)
    token_frequencies = {}
    try:
        first_id = 1
        # NOTE(review): the upper id bound comes from Papers_NR while the
        # rows below are read from Papers -- confirm this is intentional.
        last_id_query = models.Papers_NR.select().order_by(
            models.Papers_NR.id.desc()).limit(1)
        # An empty table has no highest id; fall back to 0 so the id range
        # below is empty instead of raising IndexError on last_id_query[0].
        last_id = last_id_query[0].id if len(last_id_query) > 0 else 0
        increments = 10

        cleaner = Cleaner()

        for batch_start in range(first_id, last_id + 1, increments):
            # ids_to_query is defined elsewhere; presumably it yields the ids
            # of the batch starting at batch_start, capped at last_id.
            papers_to_process = ids_to_query(batch_start, increments, last_id)
            for paper_id in papers_to_process:
                paper_query = models.Papers.select().where(
                    models.Papers.id == paper_id)

                if DEBUG:
                    print(paper_query)
                    print(len(paper_query))

                # Ids with no matching row are skipped.
                if len(paper_query) == 0:
                    continue

                paper_content = paper_query[0].paper_text
                for token in cleaner.tokenize(paper_content):
                    token_frequencies[token] = (
                        token_frequencies.get(token, 0) + 1
                    )
    finally:
        models.close_connection()

    # sorted() is stable (also with reverse=True), so tokens with equal
    # counts keep the order in which they were first seen.
    sorted_tokens = sorted(
        token_frequencies.items(), key=lambda item: item[1], reverse=True)
    top_n_tokens = sorted_tokens[:top_n]
    printer = Printer()
    printer.print_token_frequency(top_n_tokens)