def browse(self, lineEdit, fileType):
    """Open a browse dialog and put the chosen path into ``lineEdit``.

    Args:
        lineEdit: Widget whose ``setText`` receives the selected path.
        fileType: ``"dir"`` to pick a directory; any other value is used as
            the pattern inside the ``"Files (...)"`` dialog filter.

    The widget is left untouched when no path is selected.
    """
    if fileType == "dir":
        fname = QFileDialog.getExistingDirectory(
            self, "Browse Directory", get_root_path(),
            QFileDialog.ShowDirsOnly)
    else:
        # getOpenFileName returns (path, selected_filter); only the path is
        # used. The second value is bound to a throwaway name so the
        # builtin ``filter`` is not shadowed.
        fname, _selected_filter = QFileDialog.getOpenFileName(
            self, "Browse Files", get_root_path(),
            "Files (" + fileType + ")")

    # An empty selection (dialog cancelled) must not overwrite the current
    # text: normpath("") would otherwise turn it into ".".
    if not fname:
        return

    path = os.path.normpath(fname)
    if fileType == "dir":
        # Add the trailing separator AFTER normalising; normpath strips a
        # trailing separator, which silently undid the old `fname += "\\"`.
        path = os.path.join(path, "")
    lineEdit.setText(path)
def main_excel(config_path, stopword_path, report_path, category, location,
               msgLabel, current_datetime):
    """Scrape job ads and write the results to an Excel report.

    Progress text is accumulated and shown on ``msgLabel`` after each stage.
    Any exception is logged through ``log`` and then re-raised.

    Args:
        config_path: Passed to ``cr.Csv_Reader``.
        stopword_path: Not used by this function.
        report_path: Passed to ``ew.Excel_Writer``; also prefixed to the
            report name in the progress text.
        category: Job category; spaces are removed for the file name.
        location: Job location; spaces are removed for the file name.
        msgLabel: Object whose ``setText`` receives the progress text.
        current_datetime: Object providing ``strftime``; stamps the file name.
    """
    compact_category = category.replace(' ', '')
    compact_location = location.replace(' ', '')
    stamp = current_datetime.strftime("%Y%m%d_%H%M%S")
    REPORT_NAME = "".join((compact_category, "JobReport_", compact_location,
                           "_", stamp, ".xlsx"))

    shown = []

    def announce(text):
        # Append the new fragment and refresh the label with everything so far.
        shown.append(text)
        msgLabel.setText("".join(shown))

    try:
        config_reader = cr.Csv_Reader(config_path)
        announce("Start scraping at " + str(current_datetime) +
                 "\nReport path at " + report_path + REPORT_NAME + "\n")

        engine = se.Scraping_Engine(config_reader.get_data(), category,
                                    location, excel=True)
        engine.scrape_all()
        announce("Scrape completed\nStart Excel report generation\n")

        excel_writer = ew.Excel_Writer(REPORT_NAME, report_path)
        excel_writer.write(
            engine.site_list,
            engine.company_list,
            engine.title_list,
            engine.description_list,
            engine.skills_list,
            engine.location_list,
            engine.link_list,
        )
        announce("Excel report generation completed")
    except Exception:
        log(get_root_path() + 'Log\\', traceback.format_exc(),
            current_datetime)
        raise
def main(config_path, stopword_path, report_path, category, location,
         msgLabel, current_datetime):
    """Scrape job ads, then write a text report and a keyword report.

    Progress text is accumulated and shown on ``msgLabel`` after each stage.

    Args:
        config_path: Passed to ``cr.Csv_Reader``.
        stopword_path: Passed to ``ke.KeywordExtractor``.
        report_path: Passed to ``rw.Report_Writer`` for both output files;
            also prefixed to the report name in the progress text.
        category: Job category; used in titles and (spaces removed) in
            file names.
        location: Job location; used in titles and (spaces removed) in
            file names.
        msgLabel: Object whose ``setText`` receives the progress text.
        current_datetime: Object providing ``strftime``; stamps file names
            and titles.

    Raises:
        Exception: Anything raised while scraping or writing is logged
            through ``log`` and then re-raised unchanged.
    """
    compact_category = category.replace(' ', '')
    compact_location = location.replace(' ', '')
    file_stamp = current_datetime.strftime("%Y%m%d_%H%M%S")
    title_stamp = current_datetime.strftime("%d/%m/%Y %H:%M:%S")

    REPORT_NAME = (compact_category + "JobReport_" + compact_location + "_" +
                   file_stamp + ".txt")
    REPORT_TITLE = (category + " Job Report in " + location + " on " +
                    title_stamp)
    KEYWORD_NAME = (compact_category + "JobKeyword_" + compact_location +
                    "_" + file_stamp + ".txt")
    KEYWORD_TITLE = (category + " Job Keyword in " + location + " on " +
                     title_stamp)
    SEPARATOR = '=======================================================================================================================================================\n\n'

    msg = ""
    try:
        config_reader = cr.Csv_Reader(config_path)
        msg += ("Start scraping at " + str(current_datetime) +
                "\nReport path at " + report_path + REPORT_NAME + "\n")
        msgLabel.setText(msg)

        engine = se.Scraping_Engine(config_reader.get_data(), category,
                                    location)
        engine.scrape_all()

        # One separator per scraped entry, passed as the last column.
        entry_separators = [SEPARATOR] * len(engine.site_list)

        report_writer = rw.Report_Writer(REPORT_NAME, report_path)
        report_writer.write_title(REPORT_TITLE)
        report_writer.write_list(engine.site_list, engine.title_list,
                                 engine.company_list, engine.location_list,
                                 engine.description_list, engine.link_list,
                                 entry_separators)
        msg += "Scrape completed\nStart keyword extraction\n"
        msgLabel.setText(msg)

        extractor = ke.KeywordExtractor(stopword_path,
                                        engine.description_list,
                                        engine.title_list,
                                        engine.company_list,
                                        engine.location_list,
                                        category + ' ' + location)
        extractor.extract_each_text()

        keyword_writer = rw.Report_Writer(KEYWORD_NAME, report_path)
        keyword_writer.write_title(KEYWORD_TITLE)
        keyword_writer.write_list(engine.site_list, engine.title_list,
                                  engine.company_list, engine.location_list,
                                  extractor.keyword_list, engine.link_list,
                                  entry_separators)

        # extractor.keyword_list is read again after extract_all_text() and
        # written under its own heading.
        extractor.extract_all_text()
        # The newline is spelled as an escape: a raw line break inside a
        # single-line string literal is a syntax error.
        keyword_writer.write_text("Keywords of today's scraping: \n")
        keyword_writer.write_list(extractor.keyword_list,
                                  [SEPARATOR] * len(extractor.keyword_list))
        msg += "Keyword extraction completed"
        msgLabel.setText(msg)
    except Exception:
        # Top-level boundary: record the traceback, then let the caller see
        # the original exception.
        log(get_root_path() + 'Log\\', traceback.format_exc(),
            current_datetime)
        raise
def __init__(self):
    """Set up the scraping window: load the UI, preset paths, wire buttons."""
    # Initialise this child class through the parent class initialiser
    super(ScrapingGUI, self).__init__()

    current_flags = self.windowFlags()
    self.setWindowFlags(current_flags
                        | QtCore.Qt.WindowSystemMenuHint
                        | QtCore.Qt.WindowMinMaxButtonsHint)

    # Load the UI definition
    loadUi(get_root_path() + 'src\\resources\\scraping.ui', self)

    # Default paths shown in the line edits
    default_texts = (
        (self.configLineEdit, 'Config\\job_ad_sites.csv'),
        (self.stopwordLineEdit, 'Config\\stopwords.txt'),
        (self.reportLineEdit, 'Report\\'),
    )
    for line_edit, relative_path in default_texts:
        line_edit.setText(get_root_path() + relative_path)

    # Button -> handler wiring (browse buttons first, then the actions)
    click_handlers = (
        (self.configButton, self.browse_config),
        (self.stopwordButton, self.browse_stopword),
        (self.reportButton, self.browse_report),
        (self.startScrapeButton, self.start_scrape),
        (self.startWriteExcelButton, self.start_excel),
    )
    for button, handler in click_handlers:
        button.clicked.connect(handler)
class Test_Report_Writer(unittest.TestCase):
    """Tests for ``rw.Report_Writer`` that check the text written to disk.

    Each test writes through a fresh writer and compares the whole file
    content; the file is removed again in ``tearDown``.
    """

    TEST_FILENAME = 'Report_test.txt'
    TEST_PATH = get_root_path() + 'Report\\'
    TEST_TITLE = 'Test title'
    TEST_STRING1 = '1'
    TEST_STRING2 = '2'
    TEST_STRING3 = '3'
    TEST_STRING4 = '4'

    def setUp(self):
        self.test_report_writer = rw.Report_Writer(self.TEST_FILENAME,
                                                   self.TEST_PATH)

    def tearDown(self):
        os.remove(self.TEST_PATH + self.TEST_FILENAME)

    def _report_contents(self):
        """Return the full text of the report file written by the test."""
        with open(self.TEST_PATH + self.TEST_FILENAME, 'r',
                  encoding='utf-8') as report_file:
            return report_file.read()

    def test_write_title(self):
        self.test_report_writer.write_title(self.TEST_TITLE)
        underline = '\n====================================================\n'
        expected = self.TEST_TITLE + underline
        self.assertEqual(expected, self._report_contents())

    def test_write_list(self):
        first_column = [self.TEST_STRING1, self.TEST_STRING3]
        second_column = [self.TEST_STRING2, self.TEST_STRING4]
        self.test_report_writer.write_list(first_column, second_column)
        expected = ''.join((
            'Number: 1\n', self.TEST_STRING1, self.TEST_STRING2,
            'Number: 2\n', self.TEST_STRING3, self.TEST_STRING4,
        ))
        self.assertEqual(expected, self._report_contents())

    def test_write_text(self):
        self.test_report_writer.write_text(self.TEST_TITLE)
        expected = self.TEST_TITLE + '\n'
        self.assertEqual(expected, self._report_contents())
def setUp(self):
    """Prepare a stop-word file, sample texts, a sparse matrix and the extractor.

    The 4x4 COO matrix holds 4 at (0, 0), 5 at (3, 3), 7 at (1, 1) and
    9 at (0, 2); the result of ``_sort_coo`` on it is stored for the tests.
    """
    # Stop-word file with two entries, one per line
    self.TEST_FILENAME = get_root_path() + 'Config\\stopwords_test.txt'
    with open(self.TEST_FILENAME, 'w+', encoding='utf-8') as stopword_file:
        stopword_file.write('hello\nworld')
    self.expected_stop_word_set = frozenset(('hello', 'world'))

    self.test_list = [
        'hello world',
        'this is a testing class',
        'this',
        'ends',
    ]
    self.omit_list = ['hello world'] * 4

    # Sparse test matrix in coordinate format
    values = np.array([4, 5, 7, 9])
    row_idx = np.array([0, 3, 1, 0])
    col_idx = np.array([0, 3, 1, 2])
    self.test_matrix = coo_matrix((values, (row_idx, col_idx)),
                                  shape=(4, 4))

    omitted = self.omit_list
    self.test_extractor = ke.KeywordExtractor(self.TEST_FILENAME,
                                              self.test_list,
                                              omitted, omitted, omitted,
                                              omitted[0])
    self.actual_tuple = self.test_extractor._sort_coo(self.test_matrix)
def setUp(self):
    """Create the ``Scraping_Engine`` under test from the job-site CSV."""
    config_csv = get_root_path() + 'Config\\job_ad_sites.csv'
    site_config = pd.read_csv(config_csv)
    self.test_scraping_engine = se.Scraping_Engine(
        site_config, self.TEST_CATEGORY, self.TEST_LOCATION)