import datetime
import os
import queue
import threading
from random import randint
from typing import Generator, List, Optional, Tuple

import requests
from lxml.html import fromstring
from pymysql.err import IntegrityError

# Module-level shared state assumed by the functions below; these definitions
# are inferred from the call sites and may differ from the originals.
imgs_task_queue: queue.Queue = queue.Queue()  # (company_id, logo_url) download tasks
ip_queue: queue.Queue = queue.Queue()         # (ip, port, location, last_checked) proxy candidates
saved_dir = "company_logos"                   # assumed local directory for downloaded logos
all_doc_list: List[str] = []                  # job descriptions, filled by init_doc_matrix
all_id_list: List[int] = []                   # job ids, parallel to all_doc_list
all_doc_list_lower: List[str] = []


def logo_crawler_job_dispatcher(thread_num: int) -> None:
    with MySQLWrapper() as db:
        sql = """select c.company_id, c.logo_url
                 from company_data_unclean as c
                 inner join company_logo_downloaded as c2
                     on c.company_id = c2.company_id
                 where c2.downloaded = false
                 order by c.company_id
                 limit 12"""
        result = db.query_all(sql)
        for r in result:
            imgs_task_queue.put(r)
    downloading_threads: List[threading.Thread] = []
    for _ in range(thread_num):
        thread = threading.Thread(target=save_imgs_task)
        thread.daemon = False  # setDaemon() is deprecated since Python 3.10
        downloading_threads.append(thread)
    for thread in downloading_threads:
        thread.start()
    for _ in range(thread_num):
        imgs_task_queue.put(None)  # one sentinel per worker so each loop exits
    for thread in downloading_threads:
        thread.join()

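# Every function here goes through a MySQLWrapper context manager that is not
# part of this section. Judging by the call sites it exposes query_all,
# query_one, insert_one, execute (with an optional params keyword),
# delete_from, commit, and the raw cursor interface (execute/fetchall). A
# minimal sketch over pymysql, with assumed connection settings; the real
# wrapper may differ.
import pymysql


class MySQLWrapper:
    """Minimal sketch of the connection wrapper assumed by this module."""

    def __enter__(self):
        # Connection settings are assumptions; the real config lives elsewhere.
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="", database="jobs",
                                    charset="utf8mb4")
        self.cursor = self.conn.cursor()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cursor.close()
        self.conn.close()

    def execute(self, sql, params=None):
        self.cursor.execute(sql, params)

    def fetchall(self):
        return self.cursor.fetchall()

    def query_all(self, sql):
        self.cursor.execute(sql)
        return self.cursor.fetchall()

    def query_one(self, sql):
        self.cursor.execute(sql)
        return self.cursor.fetchone()

    def insert_one(self, sql, vals=None):
        # Inferred to commit internally: record_start_status never calls commit.
        self.cursor.execute(sql, vals)
        self.conn.commit()

    def delete_from(self, table_name, condition):
        self.cursor.execute(f"DELETE FROM {table_name} WHERE {condition}")

    def commit(self):
        self.conn.commit()
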
def save_imgs_task() -> None:
    with MySQLWrapper() as db:
        while True:
            url_task: Optional[Tuple[str, str]] = imgs_task_queue.get()
            if url_task is None:  # sentinel: no more work
                break
            company_id, url = url_task
            print(url)
            stored_path = os.path.join(saved_dir, str(company_id) + ".png")
            picture_content = get_picture_content(url, anonymous=True)
            if picture_content is None:
                print(company_id, "failed")
                continue
            with open(stored_path, 'wb') as f:
                f.write(picture_content)
            sql = f"""update company_logo_downloaded as c
                      set c.downloaded = true
                      where c.company_id = {company_id}"""
            db.execute(sql)
            logo_path = os.path.join("company", str(company_id) + ".png")
            sql = f"""update company_data_unclean as c
                      set c.logo_path = '{logo_path}'
                      where c.company_id = {company_id}"""
            db.execute(sql)
            db.commit()
            print(f"downloaded {company_id} logo")

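# save_imgs_task and try_proxy both call get_picture_content, which is defined
# elsewhere. A sketch consistent with the call sites, assuming it returns raw
# bytes on success and None on failure, optionally routing through a proxy and
# picking a random user agent when anonymous=True; the real helper may differ.
def get_picture_content(url: str, proxy: Optional[str] = None,
                        anonymous: bool = False) -> Optional[bytes]:
    headers = {"User-Agent": get_a_useragent()} if anonymous else {}
    proxies = {"http": f"http://{proxy}", "https": f"https://{proxy}"} if proxy else None
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        if response.status_code == 200:
            return response.content
    except requests.RequestException:
        pass
    return None
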
def init():
    with MySQLWrapper() as db:
        sql1 = """CREATE TABLE IF NOT EXISTS company_logo_downloaded (
                      company_id VARCHAR(100) NOT NULL,
                      downloaded BOOL DEFAULT FALSE,
                      PRIMARY KEY (company_id)
                  )"""
        sql2 = """insert into company_logo_downloaded(company_id)
                  select company_id from company_data_unclean"""
        db.execute(sql1)
        try:
            db.execute(sql2)
            db.commit()
        except IntegrityError:
            pass  # rows were already seeded on a previous run
    if not os.path.exists(saved_dir):
        os.makedirs(saved_dir)

def record_start_status(crawl_id: int, cur_log_file_path: str):
    start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with MySQLWrapper() as db:
        sql = f"""insert into crawler_system_logging_table
                  (crawl_id, start_time, in_progress, logging_file_path, succeeded)
                  values ({crawl_id}, '{start_time}', TRUE, '{cur_log_file_path}', FALSE)"""
        db.insert_one(sql)

def fetch_uncrawled_job_categories() -> List[str]:
    with MySQLWrapper() as db:
        sql = """select j.category from job_categories as j
                 where j.crawled = FALSE
                 order by j.category ASC"""
        job_list_in_tuple: List[Tuple[str]] = db.query_all(sql)
        return [job_tuple[0] for job_tuple in job_list_in_tuple]

def record_end_status(crawl_id: int):
    end_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with MySQLWrapper() as db:
        sql = f"""update crawler_system_logging_table c1
                  set c1.end_time = '{end_time}',
                      c1.succeeded = TRUE,
                      c1.in_progress = FALSE
                  where c1.crawl_id = {crawl_id}"""
        db.execute(sql)
        db.commit()

def current_run_id() -> int:
    with MySQLWrapper() as db:
        sql = """select max(crawl_id) from crawler_system_logging_table"""
        result = db.query_all(sql)
        if result[0][0] is None:  # empty table: this is the first run
            return 1
        return result[0][0] + 1

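# Taken together, the logging helpers bracket one crawler run. A hypothetical
# driver (the real entry point is not shown in this section) could look like:
def run_crawler_once():
    crawl_id = current_run_id()
    log_path = f"logs/crawl_{crawl_id}.log"  # assumed log file location
    record_start_status(crawl_id, log_path)
    for category in fetch_uncrawled_job_categories():
        # ... crawl one category here (not part of this section) ...
        update_crawled_job_categories(category)
    record_end_status(crawl_id)
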
def update_crawled_job_categories(job_category: str) -> None:
    with MySQLWrapper() as db:
        sql = f"""update job_categories as j
                  set j.crawled = true
                  where j.category = '{job_category}'"""
        print(sql)
        db.execute(sql)
        db.commit()

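# Most statements in this module interpolate values straight into the SQL
# string, which breaks on embedded quotes and is open to injection. The
# wrapper already accepts parameters (see insert_one_bag_of_word_repr_of_job
# below), so the same update could, for example, be written as:
def update_crawled_job_categories_safe(job_category: str) -> None:
    with MySQLWrapper() as db:
        sql = """update job_categories as j
                 set j.crawled = true
                 where j.category = %s"""
        db.execute(sql, params=(job_category,))
        db.commit()
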
def shc_helper(col: str, uid: str) -> List[str]:
    with MySQLWrapper() as cursor:
        to_return = []
        query1 = f"""SELECT {col}, COUNT({col}) as count
                     FROM search_history
                     WHERE user_id = '{uid}' AND {col} <> ''
                     GROUP BY {col}
                     ORDER BY count DESC;"""
        cursor.execute(query1)
        result = cursor.fetchall()
        for row in result:
            # Repeat each value once per occurrence so frequent terms weigh more.
            for _ in range(row[1]):
                to_return.append(row[0])
        return to_return

def clean_company_and_insert():
    with MySQLWrapper() as db:
        db.delete_from("company", "1=1")
        sql = """insert into company
                 select * from company_data_unclean as c2
                 where c2.company_name is not null
                   and c2.logo_url is not null"""
        db.execute(sql)
        db.commit()

def clean_job_and_insert():
    with MySQLWrapper() as db:
        db.delete_from("job", "1=1")
        sql = """insert into job
                 select * from job_data_unclean as j2
                 where j2.job_description is not null
                   and j2.job_title is not null"""
        db.execute(sql)
        db.commit()

def get_by_browse_time(threshold: int, uid: str) -> List[int]:
    with MySQLWrapper() as cursor:
        query1 = f"""SELECT job_id FROM
                     (((SELECT job_id, sum(time_elapsed) as total_time
                        FROM browse_time
                        WHERE user_id = '{uid}'
                        GROUP BY job_id
                        HAVING total_time >= {threshold}) as atable
                       natural join job)
                      natural join company)"""
        cursor.execute(query1)
        result = cursor.fetchall()
        return [row[0] for row in result]

def label_jobs(min_freq=1):
    init_job_bag_of_words_table()
    with MySQLWrapper() as db:
        sql = "select j.job_id, j.job_title, j.job_description from job j"
        result: List[Tuple[int, str, str]] = db.query_all(sql)
        for job_id, job_title, description in result:
            terms = get_all_terms_in_doc_with_frequency(reg_exp, description, min_freq)
            insert_one_bag_of_word_repr_of_job(job_id, job_title, terms)

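# label_jobs and label_jobseekers rely on a module-level reg_exp and a
# get_all_terms_in_doc_with_frequency helper defined elsewhere. A minimal
# sketch: the (term, count) output shape is implied by
# insert_one_bag_of_word_repr_of_job, which formats each pair as "term\tcount",
# but the tokenizer pattern here is an assumption.
import re
from collections import Counter

reg_exp = re.compile(r"[a-zA-Z]{2,}")  # assumed word tokenizer


def get_all_terms_in_doc_with_frequency(pattern: "re.Pattern", doc: str,
                                        min_freq: int) -> Generator[Tuple[str, int], None, None]:
    # Count lowercase tokens, yielding pairs seen at least min_freq times.
    counts = Counter(m.group(0).lower() for m in pattern.finditer(doc))
    for term, count in counts.items():
        if count >= min_freq:
            yield term, count
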
def get_a_useragent():
    with MySQLWrapper() as db:
        # query_one returns a 1-tuple; index into it rather than slicing its repr.
        count = db.query_one("SELECT count(*) FROM user_agents")[0]
        random_line = randint(0, int(count) - 1)
        query1 = f"SELECT agent FROM user_agents LIMIT {random_line}, 1;"
        return db.query_one(query1)[0]

def init_tag_table():
    with MySQLWrapper() as db:
        sql = """CREATE TABLE IF NOT EXISTS job_tags (
                     job_id VARCHAR(100) NOT NULL,
                     tag VARCHAR(100) NOT NULL,
                     FOREIGN KEY (job_id) REFERENCES job (job_id)
                         ON DELETE CASCADE ON UPDATE CASCADE
                 );"""
        db.execute(sql)

def init_logging_database():
    with MySQLWrapper() as db:
        sql = """CREATE TABLE IF NOT EXISTS crawler_system_logging_table (
                     crawl_id int NOT NULL,
                     start_time datetime,
                     end_time datetime,
                     in_progress bool,
                     succeeded bool,
                     logging_file_path text
                 );"""
        db.execute(sql)
        db.commit()

def init_user_bag_of_words_table():
    with MySQLWrapper() as db:
        sql = """create table if not exists user_bag_of_words_repr (
                     user_id int NOT NULL,
                     bag_of_words text,
                     CONSTRAINT user_unique UNIQUE (user_id),
                     FOREIGN KEY (user_id) REFERENCES jobseeker (user_id)
                         ON DELETE CASCADE ON UPDATE CASCADE
                 )"""
        db.execute(sql)
        db.commit()

def init_doc_matrix():
    global all_doc_list_lower
    with MySQLWrapper() as db:
        sql = """select j.job_id, j.job_title, j.job_description
                 from job j
                 order by j.job_id ASC"""
        jobs_query_result: List[Tuple[int, str, str]] = db.query_all(sql)
        for job_id, job_title, description in jobs_query_result:
            all_doc_list.append(description)
            all_id_list.append(job_id)
    all_doc_list_lower = [item.lower() for item in all_doc_list]
    vectorizer.fit(all_doc_list_lower)

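# init_doc_matrix, job2job_n_closest_neighbors, and store_behavior_job assume
# a module-level vectorizer and a top_n_neighbors function defined elsewhere.
# A plausible sketch, assuming a scikit-learn TF-IDF model and cosine
# similarity over the documents loaded by init_doc_matrix; the real
# implementation may use a different model or metric.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()  # assumed model; fitted by init_doc_matrix


def top_n_neighbors(n: int, query_terms: List[str]) -> List[int]:
    # Treat the pooled query terms as one document and rank every job
    # description by cosine similarity, returning the n closest job ids.
    query_vec = vectorizer.transform([" ".join(query_terms).lower()])
    doc_matrix = vectorizer.transform(all_doc_list_lower)
    scores = cosine_similarity(query_vec, doc_matrix)[0]
    top_indices = np.argsort(scores)[::-1][:n]
    return [all_id_list[i] for i in top_indices]
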
def init_proxy_database():
    with MySQLWrapper() as db:
        sql = """CREATE TABLE IF NOT EXISTS proxies (
                     ip VARCHAR(100),
                     port VARCHAR(10),
                     location VARCHAR(5),
                     last_checked VARCHAR(50)
                 );"""
        db.execute(sql)
        query1 = "DELETE FROM proxies WHERE 1=1"
        db.execute(query1)
        db.commit()

def init_job_bag_of_words_table():
    with MySQLWrapper() as db:
        sql = """create table if not exists job_bag_of_words_repr (
                     job_id BIGINT(11) NOT NULL,
                     job_title text,
                     bag_of_words text,
                     CONSTRAINT job_unique UNIQUE (job_id),
                     FOREIGN KEY (job_id) REFERENCES job (job_id)
                         ON DELETE CASCADE ON UPDATE CASCADE
                 )"""
        db.execute(sql)
        db.commit()

def store_interest_job():
    with MySQLWrapper() as cursor:
        cursor.execute("DELETE FROM interest_job")
        cursor.commit()
        query1 = """SELECT user_id FROM jobseeker
                    WHERE personal_summary IS NOT NULL"""
        cursor.execute(query1)
        result1 = cursor.fetchall()
        for row in result1:
            uid = row[0]
            job_id_list = jobseeker2job_n_closest_neighbors(uid, 9)
            for job_id in job_id_list:
                query2 = f"""INSERT INTO interest_job (user_id, job_id)
                             VALUES ('{uid}', '{job_id}')"""
                cursor.execute(query2)
        cursor.commit()

def label_jobseekers(min_freq=1):
    init_user_bag_of_words_table()
    with MySQLWrapper() as db:
        sql = """select j.user_id, j.personal_summary
                 from jobseeker as j
                 inner join user u on j.user_id = u.user_id
                 where j.personal_summary is not null
                   and j.personal_summary <> 'none'"""
        result: List[Tuple[int, str]] = db.query_all(sql)
        for user_id, personal_summary in result:
            terms = get_all_terms_in_doc_with_frequency(reg_exp, personal_summary, min_freq)
            insert_one_bag_of_word_repr_of_user(user_id, terms)

def job2job_n_closest_neighbors(job_id: int, n: int) -> List[int]:
    with MySQLWrapper() as db:
        sql = f"""select j.job_id, j.bag_of_words
                  from job_bag_of_words_repr j
                  where j.job_id = {job_id}"""
        result: Tuple[int, str] = db.query_one(sql)
    _, bag_of_words = result
    # Expand the stored "term\tfrequency" lines back into a flat term list,
    # repeating each term once per occurrence.
    query_list = []
    for one_row in bag_of_words.split("\n"):
        word, fre = one_row.split("\t")
        query_list.extend([word] * int(fre))
    closest_neighbors = top_n_neighbors(n, query_list)
    print(closest_neighbors)
    return closest_neighbors

def get_user_agents():
    init_user_agent_database()
    with MySQLWrapper() as db:
        url = ('https://developers.whatismybrowser.com/useragents/explore/'
               'software_name/chrome/?order_by=-times_seen')
        response = requests.get(url)
        parser = fromstring(response.text)
        db.execute("DELETE FROM user_agents where 1=1")
        db.commit()
        for i in parser.xpath('//tbody/tr')[:200]:
            agent = i.xpath('.//td[1]/a[1]/text()')[0]
            agent_os = i.xpath('.//td[3]/text()')[0]  # renamed: "os" shadowed the os module
            query2 = f"insert into user_agents values('{agent}', '{agent_os}')"
            db.execute(query2)
        db.commit()
    print("=====user agents crawler finished=====")

def init_table(category_list: List[str]) -> None:
    with MySQLWrapper() as db:
        sql = """CREATE TABLE IF NOT EXISTS job_categories (
                     category VARCHAR(100) NOT NULL,
                     crawled BOOL DEFAULT FALSE,
                     PRIMARY KEY (category)
                 )"""
        db.execute(sql)
        for category in category_list:
            sql = f"insert into job_categories(category) values('{category}')"
            try:
                db.execute(sql)
            except IntegrityError:
                continue  # category already present
        db.commit()

def try_proxy():
    with MySQLWrapper() as db:
        while True:
            tuple_pack = ip_queue.get()
            if tuple_pack is None:  # sentinel: no more candidates
                break
            ip, port, location, last_checked = tuple_pack
            proxy = ":".join([ip, port])
            try_url = ("https://media.glassdoor.com/sqlm/1152051/"
                       "cascade-management-squarelogo-1469741510397.png")
            content = get_picture_content(try_url, proxy, anonymous=True)
            if content is None:
                print(proxy, "failed")
            else:
                query2 = f"""insert into proxies
                             values('{ip}', '{port}', '{location}', '{last_checked}')"""
                db.execute(query2)
                db.commit()
                print(proxy, "success", "let's add it to database")

def insert_one_bag_of_word_repr_of_job(job_id: int, job_title: str,
                                       terms_generator: Generator[Tuple[str, int], None, None]):
    term_frequency_repr = [f"{t}\t{c}" for t, c in terms_generator]
    term_list_str = "\n".join(term_frequency_repr)
    with MySQLWrapper() as db:
        sql = """insert into job_bag_of_words_repr(job_id, job_title, bag_of_words)
                 values(%s, %s, %s)"""
        vals = (job_id, job_title, term_list_str)
        try:
            db.insert_one(sql, vals)
        except IntegrityError:
            # The row already exists: update the bag of words in place instead.
            sql = """update job_bag_of_words_repr as jbwr
                     set jbwr.bag_of_words = %s
                     where jbwr.job_id = %s"""
            db.execute(sql, params=(term_list_str, job_id))
            db.commit()

def init_user_agent_database():
    with MySQLWrapper() as db:
        sql = """CREATE TABLE IF NOT EXISTS proxies (
                     ip VARCHAR(100),
                     port VARCHAR(10),
                     location VARCHAR(5),
                     last_checked VARCHAR(50)
                 );"""
        db.execute(sql)
        sql = """CREATE TABLE IF NOT EXISTS user_agents (
                     agent VARCHAR(300),
                     os VARCHAR(20)
                 );"""
        db.execute(sql)
        db.commit()

def get_a_proxy():
    with MySQLWrapper() as db:
        count = db.query_one("SELECT count(*) FROM proxies")[0]
        random_line = randint(0, int(count) - 1)
        # Fetch ip and port in a single query: the original issued two separate
        # LIMIT queries, which can return columns from different rows when no
        # ORDER BY fixes the row order.
        query1 = f"SELECT ip, port FROM proxies LIMIT {random_line}, 1;"
        ip, port = db.query_one(query1)
        return ":".join([ip, port])

def store_behavior_job():
    with MySQLWrapper() as cursor:
        cursor.execute("DELETE FROM behavior_job")
        cursor.commit()
        query1 = """SELECT user_id FROM jobseeker
                    WHERE personal_summary IS NOT NULL"""
        cursor.execute(query1)
        result1 = cursor.fetchall()
        for row in result1:
            uid = row[0]
            behavior_job_list = get_list_by_history(uid)
            job_id_list = top_n_neighbors(6, behavior_job_list)
            similar_list = manyjobs2job_n_closest_neighbors(
                get_by_browse_time(20000, uid), 3)
            job_id_list.extend(similar_list)
            for job_id in job_id_list:
                query2 = f"""INSERT INTO behavior_job (user_id, job_id)
                             VALUES ({uid}, {job_id})"""
                cursor.execute(query2)
        cursor.commit()

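# store_behavior_job also depends on get_list_by_history and
# manyjobs2job_n_closest_neighbors, which are not part of this section. A
# hedged sketch of the latter, assuming it pools the stored bag-of-words terms
# of several jobs and reuses top_n_neighbors, mirroring
# job2job_n_closest_neighbors above; the real semantics may differ.
def manyjobs2job_n_closest_neighbors(job_ids: List[int], n: int) -> List[int]:
    pooled_terms: List[str] = []
    with MySQLWrapper() as db:
        for job_id in job_ids:
            sql = f"""select j.bag_of_words from job_bag_of_words_repr j
                      where j.job_id = {job_id}"""
            row = db.query_one(sql)
            if row is None:
                continue  # no bag-of-words entry for this job
            for line in row[0].split("\n"):
                word, fre = line.split("\t")
                pooled_terms.extend([word] * int(fre))
    return top_n_neighbors(n, pooled_terms) if pooled_terms else []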