Example no. 1
def compute_top_n_tokens_for_each_doc(top_n, first_id, last_id):

    models.connect_to_db(conf.DATABASE_FILENAME)
    cleaner = Cleaner()
    top_n_tokens_per_paper = {}
    increments = 10  # assumed batch size; not defined in the original snippet

    for i in range(first_id, last_id + 1, increments):
        papers_to_process = ids_to_query(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers_NR.select().where(
                models.Papers_NR.id == paper_id)
            if DEBUG:
                print(paper_query)
                print(len(paper_query))

            if len(paper_query) > 0:
                paper_content = paper_query[0].paper_text
                pdf_name = paper_query[0].pdf_name
                tokens = cleaner.tokenize(paper_content)
                token_frequencies = {}
                for token in tokens:
                    if token not in token_frequencies:
                        token_frequencies[token] = 1
                    else:
                        token_frequencies[token] = token_frequencies[token] + 1

                sorted_tokens = sorted(token_frequencies.items(),
                                       key=lambda kv: kv[1], reverse=True)
                top_n_tokens_per_paper[pdf_name] = sorted_tokens[:top_n]

    models.close_connection()
    printer = Printer()
    printer.print_dict(top_n_tokens_per_paper)
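
All of these examples walk the table in fixed-size batches through a helper that is not shown in the source (called ids_to_query here and create_list_of_ids in the later examples). A minimal sketch, assuming both names refer to the same id-batching behaviour:

def create_list_of_ids(start_id, increments, last_id):
    """Return the ids of the current batch: start_id, start_id + 1, ...,
    stopping before start_id + increments and never going past last_id."""
    return list(range(start_id, min(start_id + increments, last_id + 1)))

# Assumed alias used by some of the examples.
ids_to_query = create_list_of_ids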
Example no. 2
def insert_title_column_to_nr_nsw_table():
    models.connect_to_db(DATABASE_FILENAME)
    last_id_query = models.Papers_NR.select().order_by(models.Papers_NR.id.desc()).limit(1)
    first_id = 1
    last_id = last_id_query[0].id
    increments = 50
    counter = 0

    for i in range(first_id, last_id + 1, increments):
        papers_to_process = create_list_of_ids(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers_NR.select().where(models.Papers_NR.id == paper_id)
            
            if len(paper_query) > 0:
                paper_pdf_name = paper_query[0].pdf_name
                
                title = paper_pdf_name.split(".pdf")[0]
                print("Title before replace: {0}".format(title))
                title = title.replace("-", " ")
                print("Title after replace: {0}".format(title))
                # Add the derived title to the existing Papers_NR_NSW row.
                rows_modified = (models.Papers_NR_NSW
                                 .update(paper_title=title)
                                 .where(models.Papers_NR_NSW.id == paper_id)
                                 .execute())
                print("Number of rows modified: {0}".format(rows_modified))
                

        counter += increments
        print("Number of documents processed: {0}".format(counter))
        print("Sleeping for one second ...")
        time.sleep(1)

    models.close_connection()
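
The models module imported by every example is not included in the source. The sketch below is a hypothetical peewee schema inferred purely from how the tables are used (Papers, Papers_NR without references, Papers_NR_NSW without stop words plus a title, and the stemmed Papers_NR_NSW_STE variant); the real field definitions may differ:

from peewee import Model, SqliteDatabase, AutoField, TextField

database = SqliteDatabase(None)  # initialised later by connect_to_db()

class BaseModel(Model):
    class Meta:
        database = database

class Papers(BaseModel):
    id = AutoField()
    pdf_name = TextField()
    paper_text = TextField()

class Papers_NR(BaseModel):          # reference section removed
    id = AutoField()
    pdf_name = TextField()
    paper_text = TextField()

class Papers_NR_NSW(BaseModel):      # no references, no stop words
    id = AutoField()
    pdf_name = TextField()
    paper_text = TextField()
    paper_title = TextField(null=True)

class Papers_NR_NSW_STE(BaseModel):  # stemmed variant used by add_data()
    id = AutoField()
    pdf_name = TextField()
    paper_text = TextField()

def connect_to_db(filename):
    database.init(filename)
    database.connect()

def close_connection():
    database.close()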
Example no. 3
def clean_papers_from_db():
    models.connect_to_db(DATABASE_FILENAME)
    last_id_query = models.Papers.select().order_by(
        models.Papers.id.desc()).limit(1)
    first_id = 1
    last_id = last_id_query[0].id
    increments = 10

    for i in range(first_id, last_id + 1, increments):
        papers_to_process = create_list_of_ids(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers.select().where(
                models.Papers.id == paper_id)
            print(paper_query)
            print(len(paper_query))
            if len(paper_query) > 0:
                paper_content = paper_query[0].paper_text
                paper_pdf_name = paper_query[0].pdf_name
                print("Removing reference section from paper id: {0}".format(
                    paper_id))
                new_content = remove_reference_section(paper_content)
                print("Saving new paper_text into papers_for_index")
                new_entry = models.Papers_NR.create(id=paper_id,
                                                    pdf_name=paper_pdf_name,
                                                    paper_text=new_content)
                print("Number of rows modified: {0}".format(new_entry.save()))

        print("Sleeping for one second ...")
        time.sleep(1)

    models.close_connection()
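
remove_reference_section() is called above but never defined in the source. A plausible sketch, assuming it simply truncates the text at the last "References" or "Bibliography" heading:

import re

def remove_reference_section(paper_text):
    # Hypothetical implementation: cut everything from the last line that
    # consists only of a "References"/"Bibliography" heading onwards.
    last_match = None
    for match in re.finditer(r"^\s*(references|bibliography)\s*$",
                             paper_text, flags=re.IGNORECASE | re.MULTILINE):
        last_match = match
    return paper_text[:last_match.start()] if last_match else paper_text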
def save_cleaned_files(input_directory):
    filenames = io.list_files_in_dir(input_directory)
    models.connect_to_db(conf.DATABASE_FILENAME)
    for filename in filenames:
        print("Saving {} into DB".format(filename))
        paper_id = extract_id_from_filename(filename)
        paper_pdf_name = create_pdf_filename(filename)
        file_path = conf.IRREGULAR_PAPERS_DIRECTORY + "/" + filename
        paper_content = io.load_file_rows(file_path)
        new_entry = models.Papers_NR.create(id=paper_id,
                                            pdf_name=paper_pdf_name,
                                            paper_text=paper_content)
        print("Number of rows modified: {0}".format(new_entry.save()))
        
    models.close_connection()
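
extract_id_from_filename() and create_pdf_filename() are also assumed helpers. The sketch below guesses that the manually cleaned files keep the numeric prefix of the original pdf name (e.g. 1234-some-paper.txt); the real naming scheme is not shown:

def extract_id_from_filename(filename):
    # Hypothetical: assumes filenames such as "1234-some-paper.txt", where the
    # leading number is the paper id.
    return int(filename.split("-")[0])

def create_pdf_filename(filename):
    # Hypothetical: swaps the .txt extension of the cleaned file back to .pdf.
    return filename.rsplit(".", 1)[0] + ".pdf"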
Example no. 5
def clean_papers():
    models.connect_to_db(conf.DATABASE_FILENAME)

    last_id_query = models.Papers_NR.select().order_by(
        models.Papers_NR.id.desc()).limit(1)
    first_id = 1
    last_id = last_id_query[0].id
    increments = 50
    paper_cleaner = text_cleaner.Cleaner()

    counter = 0
    for i in range(first_id, last_id + 1, increments):
        papers_to_process = create_list_of_ids(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers_NR.select().where(
                models.Papers_NR.id == paper_id)

            if len(paper_query) > 0:
                paper_content = paper_query[0].paper_text
                paper_pdf_name = paper_query[0].pdf_name
                title = paper_pdf_name.split(".pdf")[0]
                title = title.replace("-", " ")
                #print("Cleaning content for paper id: {0}".format(paper_id))

                rows_to_clean = paper_content.split("\n")
                cleaned_content = ""
                cleaner = text_cleaner.Cleaner()
                for row in rows_to_clean:
                    cleaned_row = cleaner.clean_text(row)
                    if len(cleaned_row) > 0:
                        cleaned_content += cleaned_row + "\n"

                #print("Saving new paper_text into papers_NR_NSW")
                new_entry = models.Papers_NR_NSW.create(
                    id=paper_id,
                    pdf_name=paper_pdf_name,
                    paper_text=cleaned_content,
                    paper_title=title)
                new_entry.save()
                #print("Number of rows modified: {0}".format(new_entry.save()))

        counter += increments
        print("Number of documents cleaned: {0}".format(counter))

        print("Sleeping for one second ...")
        time.sleep(1)

    models.close_connection()
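
The text_cleaner.Cleaner class (also used as plain Cleaner in the token-counting examples) is not part of the source either. A minimal sketch, assuming it lowercases, strips punctuation and drops stop words, which is what the NSW ("no stop words") table name suggests:

import re

STOP_WORDS = {"the", "a", "an", "and", "or", "of", "in", "to", "is", "for"}

class Cleaner:
    def tokenize(self, text):
        # Lowercase and split on every run of non-alphanumeric characters.
        return [token for token in re.split(r"[^0-9a-z]+", text.lower()) if token]

    def clean_text(self, text):
        # Return the line with punctuation removed and stop words dropped.
        return " ".join(token for token in self.tokenize(text)
                        if token not in STOP_WORDS)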
def add_data(text):
    paper_text = text
    # Stemming is currently disabled, so the text is stored unchanged.
    #new_paper_text = stemming(paper_text)
    new_paper_text = paper_text
    models.connect_to_db(conf.DATABASE_FILENAME)
    last_id_query = models.Papers_NR_NSW_STE.select().order_by(
        models.Papers_NR_NSW_STE.id.desc()).limit(1)
    first_id = 1
    last_id = last_id_query[0].id
    increments = 10
    for i in range(first_id, last_id + 1, increments):
        papers_to_process = create_list_of_ids(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers_NR_NSW_STE.select().where(
                models.Papers_NR_NSW_STE.id == paper_id)
            if len(paper_query) == 0:
                continue
            paper_pdf_name = paper_query[0].pdf_name
            # update() is the statement to use here: it modifies the existing
            # row and, once executed, returns the number of rows changed.
            rows_modified = (models.Papers_NR_NSW
                             .update(pdf_name=paper_pdf_name,
                                     paper_text=new_paper_text)
                             .where(models.Papers_NR_NSW.id == paper_id)
                             .execute())
            print("Number of rows modified: {0}".format(rows_modified))
    models.close_connection()
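
To spell out the difference hinted at by the original comment: in peewee, Model.create() inserts a new row and returns a model instance, while Model.update(...).where(...).execute() changes existing rows and returns how many were modified. For example:

# Insert a brand-new row:
row = models.Papers_NR_NSW.create(id=1, pdf_name="x.pdf", paper_text="...")

# Modify an existing row; execute() returns the number of rows changed:
changed = (models.Papers_NR_NSW
           .update(paper_text="new text")
           .where(models.Papers_NR_NSW.id == 1)
           .execute())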
Example no. 7
def compute_document_frequencies():

    models.connect_to_db(conf.DATABASE_FILENAME)
    papers = models.Papers_NR  # assumed table; the bare "papers" name is never defined in the snippet
    first_id = 1
    last_id_query = papers.select().order_by(papers.id.desc()).limit(1)
    last_id = last_id_query[0].id
    increments = 10

    token_frequencies = {}

    for i in range(first_id, last_id + 1, increments):
        papers_to_process = ids_to_query(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = papers.select().where(papers.id == paper_id)

            unique_tokens = set()
            
            if DEBUG:
                print(paper_query)
                print(len(paper_query))

            if len(paper_query) > 0:
                paper_content = paper_query[0].paper_text
                paper_pdf_name = paper_query[0].pdf_name
                tokens = paper_content.strip().split()
                for token in tokens:
                    #print(token)
                    unique_tokens.add(token.lower())

                for token in unique_tokens:
                    #print(token)
                    if token not in token_frequencies:
                        token_frequencies[token] = 1
                    else:
                        token_frequencies[token] = token_frequencies[token] + 1
                
    models.close_connection()
    sorted_tokens = [(k, token_frequencies[k]) for k in sorted(token_frequencies, key=token_frequencies.get)]
    printer = Printer()
    printer.print_token_frequency(sorted_tokens)
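
The Printer helper used by the reporting functions is assumed as well; something along these lines would match the calls made in these examples:

class Printer:
    def print_token_frequency(self, sorted_tokens):
        # Print one "token: count" line per entry.
        for token, frequency in sorted_tokens:
            print("{0}: {1}".format(token, frequency))

    def print_dict(self, top_tokens_per_paper):
        # Print the per-paper token rankings, one paper at a time.
        for pdf_name, tokens in top_tokens_per_paper.items():
            print(pdf_name)
            self.print_token_frequency(tokens)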
Example no. 8
def compute_top_n_tokens_for_collection(top_n):

    models.connect_to_db(conf.DATABASE_FILENAME)
    first_id = 1
    last_id_query = models.Papers_NR.select().order_by(
        models.Papers_NR.id.desc()).limit(1)
    last_id = last_id_query[0].id
    increments = 10

    cleaner = Cleaner()
    token_frequencies = {}

    for i in range(first_id, last_id + 1, increments):
        papers_to_process = ids_to_query(i, increments, last_id)
        for paper_id in papers_to_process:
            paper_query = models.Papers_NR.select().where(
                models.Papers_NR.id == paper_id)

            if DEBUG:
                print(paper_query)
                print(len(paper_query))

            if len(paper_query) > 0:
                paper_content = paper_query[0].paper_text
                paper_pdf_name = paper_query[0].pdf_name
                tokens = cleaner.tokenize(paper_content)
                for token in tokens:
                    if token not in token_frequencies:
                        token_frequencies[token] = 1
                    else:
                        token_frequencies[token] = token_frequencies[token] + 1

    models.close_connection()
    sorted_tokens = [(k, token_frequencies[k]) for k in sorted(
        token_frequencies, key=token_frequencies.get, reverse=True)]
    top_n_tokens = sorted_tokens[:top_n]
    printer = Printer()
    printer.print_token_frequency(top_n_tokens)
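
A possible invocation of the two reporting functions; the top_n values and the id range are arbitrary and only meant as an illustration:

if __name__ == "__main__":
    compute_top_n_tokens_for_collection(20)        # 20 most frequent tokens overall
    compute_top_n_tokens_for_each_doc(10, 1, 100)  # top 10 tokens for papers 1-100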
Example no. 9
def retrieve_papers():
    io.create_directory(conf.IRREGULAR_PAPERS_DIRECTORY)
    models.connect_to_db(conf.DATABASE_FILENAME)

    # retrieve papers with reference not separated
    query = models.Papers.select().where(
        models.Papers.id.in_(conf.ids_reference_not_separated))
    for paper in query:
        filename = create_paper_newname(conf.IRREGULAR_PAPERS_DIRECTORY,
                                        paper.pdf_name)
        content = paper.paper_text
        io.save_file(content, filename)

    # retrieve papers with poorly defined reference section
    query = models.Papers.select().where(
        models.Papers.id.in_(conf.ids_poorly_defined_reference))
    for paper in query:
        filename = create_paper_newname(conf.IRREGULAR_PAPERS_DIRECTORY,
                                        paper.pdf_name)
        content = paper.paper_text
        io.save_file(content, filename)

    models.close_connection()
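
create_paper_newname() is another assumed helper; a sketch consistent with how save_cleaned_files() reads the files back:

import os

def create_paper_newname(directory, pdf_name):
    # Hypothetical: store the extracted text under the pdf's own name,
    # with a .txt extension, inside the given directory.
    return os.path.join(directory, pdf_name.rsplit(".pdf", 1)[0] + ".txt")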
Example no. 10
def drop_papers_nr_nsw_table():
    models.connect_to_db(conf.DATABASE_FILENAME)
    models.Papers_NR_NSW.drop_table()
    models.close_connection()
Example no. 11
def drop_papers_nr_table():
    models.connect_to_db(DATABASE_FILENAME)
    models.Papers_NR.drop_table()
    models.close_connection()
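
Since drop_table() deletes the table and its data permanently, recreating an empty table afterwards is a one-liner in peewee, e.g.:

models.connect_to_db(conf.DATABASE_FILENAME)
models.Papers_NR.create_table()  # rebuild an empty table from the model's schema
models.close_connection()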