コード例 #1
0
def save_data(df, db_connection):
    """
    Write dataframe to SQL database.

    The dataframe is first staged in the 'temptable' table (which is
    truncated beforehand) and then copied into the 'list' table with
    INSERT IGNORE, so rows that already exist in 'list' are skipped.

    Parameters
    ----------
    df : dataframe
    db_connection : database connection
        object exposing ``connect()``; it is also handed to ``df.to_sql``

    Returns
    -------
    int
        1 in case of success, 0 in case of failure

    """
    connection = None
    try:
        connection = db_connection.connect()
        # empty transition 'temptable' table
        connection.execute("TRUNCATE temptable")
        # write result to 'temptable' table
        df.to_sql('temptable',
                  db_connection,
                  if_exists='append',
                  index_label='id')
        # use INSERT IGNORE to just skip already existing letters in the 'list' table
        connection.execute("INSERT IGNORE INTO list SELECT * FROM temptable")
    except Exception as e:
        ut.error("Failed to write dataframe to database.")
        ut.progress(f"Detailed error:\n{e}")
        return 0
    else:
        ut.progress("Dataframe written to database.")
        return 1
    finally:
        # release the connection on every path, including failures
        if connection is not None:
            connection.close()
コード例 #2
0
def get_letter_list(db_connection, year, office):
    """
    Load data of letters from database to dataframe.

    Parameters
    ----------
    db_connection
        object exposing ``connect()``
    year: list
        a list of years to include in the query; each entry may be an int
        or a numeric string
    office: list
        a list of offices to include in the query; the entries are joined
        with '|' and matched with REGEXP

    Returns
    -------
    Dataframe with all letters loaded from database, or 0 if the query
    could not be built or executed

    """
    connection = None
    try:
        # coerce every year through int() so that only plain numbers can
        # reach the SQL text (and int years work as well as strings)
        years = ', '.join(str(int(y)) for y in year)
        # NOTE(review): the office pattern is still interpolated verbatim;
        # it must come from a trusted source until it is bound as a
        # driver-level query parameter.
        query = f"WHERE YEAR(issued) IN ({years}) AND office REGEXP '{'|'.join(office)}'"

        # execute query and load to dataframe
        connection = db_connection.connect()
        df = pd.read_sql(f"SELECT * FROM list {query};", connection)
    except Exception as e:
        ut.error("Failed to retrieve dataframe from database.")
        ut.progress(f"Detailed error:\n{e}")
        return 0
    else:
        return df
    finally:
        # release the connection on every path, including failures
        if connection is not None:
            connection.close()
コード例 #3
0
def save_letter(letter, filename, folder):
    """
    Write letter to txt file

    An existing file is never overwritten: if the target already exists the
    call is reported as a failure.

    Parameters
    ----------
    letter : String
        trimmed text ready for writing to file
    filename : String
        filename without ".txt"
    folder : String
        name of existing folder

    Returns
    -------
    int
        1 in case of success, 0 in case of failure

    """
    file = f'{folder}/{filename}.txt'
    if exists(file):
        ut.error(f"Target file {filename} already exists.")
        return 0
    try:
        f = open(file, 'w')
    except (OSError, ValueError):
        ut.error(f"Cannot create or open {filename}.")
        # report the failure to the caller instead of falling through to 1
        return 0
    # the context manager closes the handle even if the write fails
    with f:
        f.write(letter)
    ut.progress(f"Successfully written letter to {filename}.")
    return 1
コード例 #4
0
def crawl_site(url, page_count_max, skip_page=0):
    """
    Crawl website by navigating through different pages.

    Parameters
    ----------
    url : String
        first webpage
    page_count_max : int
        maximum number of pages to crawl
    skip_page : int
        number of pages to step over (via go_next_page) before crawling
        starts; default 0

    Returns
    -------
    site_df: Dataframe
        one row per crawled entry, indexed by get_unique_id of its link

    """
    df_columns = ['posted', 'issued', 'company', 'office', 'subject', 'link']

    def crawl_current_page():
        # crawl the page the driver currently shows into its own dataframe
        page_data = crawl_page(driver)
        return pd.DataFrame(
            page_data,
            columns=df_columns,
            index=[get_unique_id(link) for link in page_data['link']])

    # NOTE(review): the driver is never released in this function; confirm
    # whether make_driver() hands out something that needs an explicit quit.
    driver = make_driver()
    driver.get(url)
    # wait 2 seconds for page to load
    time.sleep(2)
    # if we need to skip pages
    while skip_page > 0:
        go_next_page(driver)
        skip_page -= 1
        time.sleep(2)

    # crawl first page; frames are collected and concatenated once at the end
    page_frames = [crawl_current_page()]
    page_count = 1
    ut.progress("First page crawled and saved in dataframe.")

    # navigate to second and subsequent pages up to page_count_max; the limit
    # is checked first so no extra page is requested once it is reached
    while page_count < page_count_max and go_next_page(driver):
        page_count += 1
        # wait 2 seconds to not overload server
        time.sleep(2)
        page_frames.append(crawl_current_page())
        ut.progress(f"Page {page_count} crawled and saved in dataframe.")

    # DataFrame.append no longer exists in pandas 2.x; concat is the replacement
    return pd.concat(page_frames)
コード例 #5
0
    def endElement(self, tag):
        """
        Handle the end of an element named `tag`.

        - "page": count the page, report progress, tokenize the buffered
          title and text, feed them to the inverted index and reset the
          per-page buffers.
        - "id" / "title" / "text": clear the matching "inside element" flag.
        - "mediawiki": print the total token count and write it to the file
          returned by utility.getIndexPath().

        Any other tag is ignored.
        """
        if tag == "page":
            self.pageCount += 1
            # NOTE(review): hard-coded page total used only for the progress
            # display; presumably the size of one specific dump - confirm.
            total = 19797

            progress(self.pageCount, total, status='Creating Index!!')

            self.id = False
            self.firstID = True

            # build the index page by page
            titleTokens = self.pp.processTitle(self.bufTitle)
            (infoboxTokens, catgoriesTokens, referencesTokens, bodyTokens,
             externalLinksTokens) = self.pp.processText(self.bufText)

            self.invertedIndex.buildIndex(self.bufID, titleTokens,
                                          infoboxTokens, catgoriesTokens,
                                          referencesTokens, bodyTokens,
                                          externalLinksTokens)

            # reset the buffers for the next page
            self.bufID = ""
            self.bufTitle = ""
            self.bufText = ""

        elif tag == "id":
            self.id = False
        elif tag == "title":
            self.title = False
        elif tag == "text":
            self.text = False
        elif tag == "mediawiki":
            print("\nTotal Tokens :", self.pp.totalTokens)
            # the context manager closes the file even if the write fails
            with open(utility.getIndexPath(), 'w+') as statFile:
                print("inverted_stat.txt path :", utility.getIndexPath())
                statFile.write("Total Tokens :" + str(self.pp.totalTokens) + '\n')
コード例 #6
0
def get_letter_list(db_connection):
    """
    Load data of letters from database to dataframe.

    Parameters
    ----------
    db_connection
        object exposing ``connect()``

    Returns
    -------
    Dataframe with all letters loaded from database, or 0 if the query
    failed

    """
    connection = None
    try:
        connection = db_connection.connect()
        df = pd.read_sql("SELECT * FROM list;", connection)
    except Exception as e:
        ut.error("Failed to retrieve dataframe from database.")
        ut.progress(f"Detailed error:\n{e}")
        return 0
    else:
        ut.progress("Data retrieved from database.")
        return df
    finally:
        # release the connection on every path, including failures
        if connection is not None:
            connection.close()
コード例 #7
0
    def writeIndex(self, fileName):
        """
        Write the inverted index to `fileName` and log its size.

        The number of keys in self.index is stored in
        self.totalInvertedTokens, printed, and appended to the file returned
        by utility.getStatPath().

        Each key of self.index becomes one line of the form
        ``key;<posting>;<posting>;...``. A posting is the concatenation of
        ``<prefix><value>`` for every non-zero field of the element, using
        the prefixes d, t, i, c, r, b, e in field order.
        """
        self.totalInvertedTokens = len(self.index)
        print("totalInvertedTokens::", self.totalInvertedTokens)

        with open(utility.getStatPath(), 'a+') as statFile:
            statFile.write("Total Inverted Tokens :" +
                           str(self.totalInvertedTokens) + '\n')

        indexSize = self.totalInvertedTokens
        # One prefix per field position. The original loop stopped at
        # position 5, so its 'e' branch (position 6) could never run; zip
        # covers all seven positions and still accepts shorter elements.
        # NOTE(review): presumably doc id, title, infobox, categories,
        # references, body, external links - confirm against buildIndex.
        fieldPrefixes = 'dticrbe'

        with open(fileName, 'w+') as file:
            for j, key in enumerate(self.index):
                file.write(key)
                file.write(';')
                for element in self.index[key]:
                    for prefix, value in zip(fieldPrefixes, element):
                        if value != 0:
                            file.write(prefix + str(value))
                    file.write(';')
                file.write('\n')
                progress(j, indexSize, 'writing index!!')