def __init__(self, conn, HTML_page_Obj=None, article_file_path=None):
    if HTML_page_Obj is None and article_file_path is not None:
        make_from_file(article_file_path)
    elif HTML_page_Obj is not None and article_file_path is None:
        self.article_url = HTML_page_Obj.url
        website_base_url = extraction_text_manip.extract_website(self.article_url)
        print "\n Making article found on %s" % website_base_url
        ## Defaults, overwritten if one of the stored extraction codes succeeds
        self.article_headline = ""
        self.article_alt_headline_list = []
        self.article_text = ""
        self.article_date = datetime.date(1970, 1, 1)  ## Start of UNIX time
        self.article_time = datetime.time(0, 0)
        article_soup = HTML_page_Obj.html_prettify()
        ## Fetch the extraction code snippets stored for this website, newest first
        table = sqliteDefaults.verified_select_sqlite(
            conn=conn,
            select_query="select * from website_regex where base_url='%s' order by date_of_addition desc;" % website_base_url)  ##<-CHANGED
        if table is not None:
            for i in range(0, len(table)):
                try_code = table[i][1]
                # print try_code
                try:
                    ## The stored snippet is expected to define article_headline,
                    ## article_alt_headline_list, article_text, article_date and
                    ## article_time from article_soup
                    exec(try_code)
                except Exception:
                    print "Something went wrong while executing that code. Trying next code..."
                else:
                    self.article_headline = article_headline.strip()
                    self.article_alt_headline_list = article_alt_headline_list
                    self.article_text = article_text.strip()
                    self.article_date = article_date
                    self.article_time = article_time
                    return
        print "None of the extraction codes for %s have worked on this article. " \
              "Please re-check them." % website_base_url
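## --------------------------------------------------------------------------
## Illustration (not part of the original code): a minimal, hypothetical
## example of what one of the try_code snippets stored in the website_regex
## table might look like. It assumes article_soup is the prettified HTML
## string returned by HTML_page_Obj.html_prettify() and that BeautifulSoup is
## available; the markup and selectors below are invented for illustration,
## not taken from any real site's stored code. A snippet only "works" if,
## after exec(), it has defined article_headline, article_alt_headline_list,
## article_text, article_date and article_time.
import datetime
from bs4 import BeautifulSoup

article_soup = """
<html><body>
  <h1 class="headline">Example headline</h1>
  <h2 class="alt-headline">Alternative headline</h2>
  <div class="story-body"><p>First paragraph.</p><p>Second paragraph.</p></div>
  <span class="timestamp">2015-06-01 09:30</span>
</body></html>
"""

soup = BeautifulSoup(article_soup, "html.parser")
article_headline = soup.find("h1", class_="headline").get_text()
article_alt_headline_list = [h.get_text() for h in soup.find_all("h2", class_="alt-headline")]
article_text = " ".join(p.get_text() for p in soup.find("div", class_="story-body").find_all("p"))
timestamp = datetime.datetime.strptime(soup.find("span", class_="timestamp").get_text().strip(), "%Y-%m-%d %H:%M")
article_date = timestamp.date()
article_time = timestamp.time()
## --------------------------------------------------------------------------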
        ResultPageNumber INTEGER,
        URL TEXT,
        ResultNumber INTEGER,
        PRIMARY KEY(Topic, URL)
    );
    ''' % db_table_name)
conn.commit()

initial_start_date = to_julian_date_datetime(datetime.now().date())
start_date = 0
end_date = 0

results_sorted_by_date_query = "Select StartDate, EndDate from %s WHERE Topic='%s' ORDER BY StartDate ASC" % (db_table_name, topic)
Urls_sorted_by_date = sqliteDefaults.verified_select_sqlite(conn, results_sorted_by_date_query, printing=False)
if len(Urls_sorted_by_date) == 0:
    end_date = initial_start_date
    print "\n\tNo results in database on the topic %s\n" % (topic)
else:
    last_extracted_date = Urls_sorted_by_date[0][0]  ## Gets the StartDate of the last extracted URL
    end_date = last_extracted_date                   ## We must resume googlesearching from this date

if resume_from != -1:
    end_date = resume_from  ## resume_from should be set to the start date of the latest period in which
                            ## no urls were extracted. NOTE: once you find a time period in which you DO
                            ## get some UNIQUE urls, you should not use resume_from in subsequent runs.

num_time_periods_passed = int(round((initial_start_date - end_date) / time_period))  ## WHATEVER YOU DO, THE PRODUCT OF time_period*
num_time_periods_remaining = num_time_periods_remaining - num_time_periods_passed
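## --------------------------------------------------------------------------
## Illustration (not part of the original code): the resume logic above does
## its date arithmetic on Julian day numbers, so dates can be subtracted and
## divided by time_period as plain numbers. A minimal sketch of what
## to_julian_date_datetime might do, assuming it simply converts a
## datetime.date to its Julian Day Number at midnight:
import datetime

def to_julian_date_datetime(d):
    ## date.toordinal() counts days from 0001-01-01 (ordinal 1), which falls on
    ## Julian Day 1721425.5 at midnight, hence the fixed offset below.
    return d.toordinal() + 1721424.5

## Consecutive calendar days differ by exactly 1.0 on this scale, which is what
## the num_time_periods_passed computation above relies on.
assert to_julian_date_datetime(datetime.date(2015, 6, 2)) - to_julian_date_datetime(datetime.date(2015, 6, 1)) == 1.0
## --------------------------------------------------------------------------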
conn = sqliteDefaults.get_conn("article_extract_db.db")
conn.execute('''CREATE TABLE IF NOT EXISTS `articles_clean` (
    `company_or_sector` TEXT,
    `article_url` TEXT,
    PRIMARY KEY(company_or_sector, article_url)
);
''')
conn.commit()

company_name = 'Infosys'
## Articles for this company that have not yet been cleaned
articles = sqliteDefaults.verified_select_sqlite(conn,
    "SELECT DISTINCT article_url, company_name, article_headline, article_text, article_date \
     FROM articles \
     WHERE company_name='%s' \
     and article_url not in (select article_url from articles_clean) \
     ORDER BY article_url ASC" % (company_name))

conn2 = sqliteDefaults.get_conn("extracted_search_urls.db")

## Map menu numbers to topics: every distinct ArticleTopic gets a sequential key,
## and the sector-level topics get fixed keys of 100, 200 and 300.
company_dict = {}
temp_table = sqliteDefaults.verified_select_sqlite(
    conn2, "SELECT DISTINCT ArticleTopic from articleUrls order by ArticleTopic asc")
for i in range(0, len(temp_table)):
    company_dict[i + 1] = temp_table[i][0]
company_dict[100] = "Financial Services sector"
company_dict[200] = "IT sector"
company_dict[300] = "Energy sector"
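## --------------------------------------------------------------------------
## Illustration (not part of the original code): the script above only selects
## the articles that still need cleaning; the step that records a cleaned
## article is not shown. A minimal sketch of how that insert might look,
## assuming conn is an ordinary sqlite3 connection, so that the
## "not in articles_clean" filter skips the article on the next run.
## Parameterized queries are used instead of %-formatting to avoid quoting problems.
def mark_article_clean(conn, company_or_sector, article_url):
    conn.execute(
        "INSERT OR IGNORE INTO articles_clean (company_or_sector, article_url) VALUES (?, ?)",
        (company_or_sector, article_url))
    conn.commit()

for row in articles:
    article_url = row[0]  ## column order follows the SELECT above
    ## ...apply whatever cleaning is needed to the row, then record it:
    mark_article_clean(conn, company_name, article_url)
## --------------------------------------------------------------------------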