Example #1
def logo_crawler_job_dispatcher(thread_num: int) -> None:
    with MySQLWrapper() as db:
        sql = """select c.company_id, c.logo_url
                    from company_data_unclean as c
                    inner join company_logo_downloaded as c2
                    on c.company_id = c2.company_id
                    where c2.downloaded = false
                    order by c.company_id
                    limit 12
                """
        # Queue each (company_id, logo_url) row as a download task for the
        # worker threads in Example #2.
        result = db.query_all(sql)
        for r in result:
            imgs_task_queue.put(r)

    downloading_threads: List[threading.Thread] = []
    for _ in range(thread_num):
        thread = threading.Thread(target=save_imgs_task)
        thread.daemon = False  # setDaemon() is deprecated; non-daemon workers finish before exit
        downloading_threads.append(thread)

    for thread in downloading_threads:
        thread.start()

    # One sentinel per worker so each thread can exit its loop.
    for _ in range(thread_num):
        imgs_task_queue.put(None)

    for thread in downloading_threads:
        thread.join()
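Examples #1 to #3 form a dispatcher/worker pipeline: init() (Example #3) seeds the bookkeeping table, the dispatcher fills a shared queue, and save_imgs_task workers (Example #2) drain it until they receive a None sentinel. A minimal driver sketch, assuming the module-level imgs_task_queue and saved_dir globals the snippets already rely on are declared as below:

import queue

# Module-level state assumed by the examples; the value of saved_dir is
# hypothetical.
imgs_task_queue: queue.Queue = queue.Queue()
saved_dir = "downloaded_logos"

if __name__ == "__main__":
    init()                          # Example #3: create and seed tables
    logo_crawler_job_dispatcher(4)  # run four save_imgs_task workers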
Example #2
def save_imgs_task() -> None:
    with MySQLWrapper() as db:
        while True:
            url_task: Optional[Tuple[str, str]] = imgs_task_queue.get()
            if url_task is None:
                break
            company_id, url = url_task
            print(url)
            stored_path = os.path.join(saved_dir, str(company_id) + ".png")
            picture_content = get_picture_content(url, anonymous=True)
            if picture_content is None:
                print(company_id, "failed")
                continue
            with open(stored_path, 'wb') as f:
                f.write(picture_content)
            sql = f"""update company_logo_downloaded as c
                        set c.downloaded = true
                        where c.company_id = {company_id}
                    """

            db.execute(sql)
            logo_path = os.path.join("company", str(company_id) + ".png")
            sql = """update company_data_unclean
                    set logo_path = %s
                    where company_id = %s
                    """
            db.execute(sql, params=(logo_path, company_id))
            db.commit()
            print(f"downloaded {company_id} logo")
Example #3
def init():
    with MySQLWrapper() as db:
        sql1 = """CREATE TABLE IF NOT EXISTS company_logo_downloaded
                    (
                      company_id VARCHAR(100) NOT NULL,
                      downloaded  BOOL DEFAULT FALSE,
                      PRIMARY KEY (company_id)
                    )       
                """

        sql2 = """
                insert into company_logo_downloaded(company_id)
                select company_id
                from company_data_unclean;
                """

        db.execute(sql1)
        try:
            db.execute(sql2)
            db.commit()
        except IntegrityError:
            # The bookkeeping rows were already seeded on a previous run.
            pass

    os.makedirs(saved_dir, exist_ok=True)
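A side note on the seeding step above: because all rows go in one statement, the first duplicate key aborts the whole INSERT, so a re-run never picks up newly added companies. A sketch of an alternative using MySQL's INSERT IGNORE, under the same schema (this helper does not exist in the original code):

def seed_logo_bookkeeping() -> None:
    # Hypothetical variant of init()'s seeding: INSERT IGNORE skips
    # duplicate company_ids row by row, so re-runs pick up companies
    # added since the last crawl without tripping an IntegrityError.
    with MySQLWrapper() as db:
        sql = """insert ignore into company_logo_downloaded(company_id)
                 select company_id from company_data_unclean"""
        db.execute(sql)
        db.commit()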
Example #4
def record_start_status(crawl_id: int, cur_log_file_path: str):
    start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with MySQLWrapper() as db:
        sql = """insert into crawler_system_logging_table
                (crawl_id, start_time, in_progress, logging_file_path, succeeded)
                values (%s, %s, TRUE, %s, FALSE)"""
        db.insert_one(sql, (crawl_id, start_time, cur_log_file_path))
Example #5
def fetch_uncrawled_job_categories() -> List[str]:
    with MySQLWrapper() as db:
        sql = f"""select j.category from job_categories as j
                  where j.crawled = FALSE
                  order by j.category ASC"""
        job_list_in_tuple: Tuple[str] = db.query_all(sql)
        job_category_list = [job_tuple[0] for job_tuple in job_list_in_tuple]
        return job_category_list
Example #6
def record_end_status(crawl_id: int):
    end_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with MySQLWrapper() as db:
        sql = """update crawler_system_logging_table
                    set end_time = %s, succeeded = TRUE, in_progress = FALSE
                    where crawl_id = %s"""
        db.execute(sql, params=(end_time, crawl_id))
        db.commit()
Example #7
def current_run_id() -> int:
    with MySQLWrapper() as db:
        sql = """select max(crawl_id) from crawler_system_logging_table"""
        result = db.query_all(sql)
    # max(crawl_id) is NULL when the table is empty; start run ids at 1.
    if result[0][0] is None:
        return 1
    return result[0][0] + 1
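Examples #4, #6, and #7 are meant to bracket a crawl run. A minimal sketch of the intended lifecycle; run_crawl() is a hypothetical stand-in for the crawling entry point, which is not part of these examples:

def logged_crawl_run(cur_log_file_path: str) -> None:
    # Allocate the next run id, record the start row, do the work, then
    # flip the row to succeeded. run_crawl() is hypothetical.
    crawl_id = current_run_id()
    record_start_status(crawl_id, cur_log_file_path)
    run_crawl()
    record_end_status(crawl_id)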
Example #8
def update_crawled_job_categories(job_category: str) -> None:
    with MySQLWrapper() as db:
        sql = f"""update job_categories as j
                set j.crawled = true
                where j.category= '{job_category}'
            """
        print(sql)
        db.execute(sql)
        db.commit()
Example #9
def shc_helper(col: str, uid: str) -> List[str]:
    with MySQLWrapper() as cursor:
        to_return = []
        # col is a column name and cannot be bound as a query parameter, so
        # it must come from trusted code; uid is bound normally.
        query1 = f"""SELECT {col}, COUNT({col}) as count FROM search_history
                     WHERE user_id = %s AND {col} <> ''
                     GROUP BY {col} ORDER BY count DESC;"""
        cursor.execute(query1, params=(uid,))
        result = cursor.fetchall()
        for value, count in result:
            # Repeat each value by its frequency to weight it accordingly.
            to_return.extend([value] * count)
        return to_return
Example #10
def clean_company_and_insert():
    with MySQLWrapper() as db:
        table_name = "company"
        db.delete_from(table_name, "1=1")
        sql = """insert into company
                select * from company_data_unclean as c2
                where c2.company_name is not null
                and c2.logo_url is not null"""
        db.execute(sql)
        db.commit()
Example #11
def clean_job_and_insert():
    with MySQLWrapper() as db:
        table_name = "job"
        db.delete_from(table_name, "1=1")
        sql = """insert into job 
                select * from job_data_unclean as j2
                where j2.job_description is not null
                and j2.job_title is not null"""
        db.execute(sql)
        db.commit()
Example #12
def get_by_browse_time(threshold: int, uid: str) -> List[int]:
    with MySQLWrapper() as cursor:
        query1 = f"""SELECT job_id FROM 
        (((SELECT job_id, sum(time_elapsed) as total_time FROM browse_time WHERE user_id = '{uid}' 
        GROUP BY job_id HAVING total_time >= {threshold}) as atable natural join job) natural join company)"""
        cursor.execute(query1)
        result = cursor.fetchall()
        to_return = []
        for row in result:
            to_return.append(row[0])
        return to_return
Example #13
def label_jobs(min_freq=1):
    init_job_bag_of_words_table()
    with MySQLWrapper() as db:
        sql = "select j.job_id, j.job_title, j.job_description from job j"
        result: List[Tuple[int, str, str]] = db.query_all(sql)

    for one_job in result:
        job_id, job_title, description = one_job
        terms = get_all_terms_in_doc_with_frequency(reg_exp, description,
                                                    min_freq)
        insert_one_bag_of_word_repr_of_job(job_id, job_title, terms)
Example #14
def get_a_useragent():
    with MySQLWrapper() as db:
        # query_one returns a row tuple; index into it rather than slicing
        # the tuple's string representation as the original code did.
        query0 = "SELECT count(*) FROM user_agents"
        total = int(db.query_one(query0)[0])
        random_line = randint(0, total - 1)
        query1 = f"""SELECT agent FROM user_agents LIMIT {random_line}, 1;"""
        return db.query_one(query1)[0]
Example #15
def init_tag_table():
    with MySQLWrapper() as db:
        sql = """CREATE TABLE IF NOT EXISTS job_tags
                (
                  job_id VARCHAR(100) NOT NULL,
                  tag    varchar(100) NOT NULL,
                  FOREIGN KEY (job_id)
                    REFERENCES job (job_id)
                    ON DELETE CASCADE
                    ON UPDATE CASCADE
                );"""
        db.execute(sql)
Example #16
def init_logging_database():
    with MySQLWrapper() as db:
        sql = """CREATE TABLE IF NOT EXISTS crawler_system_logging_table
                (
                    crawl_id          int NOT NULL,
                    start_time        datetime,
                    end_time          datetime,
                    in_progress         bool,
                    succeeded         bool,
                    logging_file_path text
                );"""
        db.execute(sql)
        db.commit()
Example #17
def init_user_bag_of_words_table():
    with MySQLWrapper() as db:
        sql = """create table if not exists user_bag_of_words_repr
            (
                user_id      int NOT NULL,
                bag_of_words text,
                CONSTRAINT user_unique UNIQUE (user_id),
                FOREIGN KEY (user_id)
                    REFERENCES jobseeker (user_id)
                    ON DELETE CASCADE
                    ON UPDATE CASCADE
            )"""
        db.execute(sql)
        db.commit()
Example #18
def init_doc_matrix():
    with MySQLWrapper() as db:
        sql = """select j.job_id, j.job_title, j.job_description from job j
                   order by j.job_id ASC"""
        jobs_query_result: List[Tuple[int, str, str]] = db.query_all(sql)

    for one_job in jobs_query_result:
        job_id, job_title, description = one_job
        all_doc_list.append(description)
        all_id_list.append(job_id)

    global all_doc_list_lower
    all_doc_list_lower = [item.lower() for item in all_doc_list]
    vectorizer.fit(all_doc_list_lower)
Example #19
def init_proxy_database():
    with MySQLWrapper() as db:
        sql = """CREATE TABLE IF NOT EXISTS proxies
                (
                  ip VARCHAR(100),
                  port VARCHAR(10),
                  location VARCHAR(5),
                  last_checked VARCHAR(50)
                );"""
        db.execute(sql)

        query1 = "DELETE FROM proxies WHERE 1=1"
        db.execute(query1)
        db.commit()
Example #20
def init_job_bag_of_words_table():
    with MySQLWrapper() as db:
        sql = """create table if not exists job_bag_of_words_repr
            (
                job_id       BIGINT(11) NOT NULL,
                job_title    text,
                bag_of_words text,
                CONSTRAINT job_unique UNIQUE (job_id),
                FOREIGN KEY (job_id)
                    REFERENCES job (job_id)
                    ON DELETE CASCADE
                    ON UPDATE CASCADE
            )"""
        db.execute(sql)
        db.commit()
Example #21
def store_interest_job():
    with MySQLWrapper() as cursor:
        query = "DELETE FROM interest_job"
        cursor.execute(query)
        cursor.commit()
        query1 = f"""SELECT user_id FROM jobseeker WHERE personal_summary IS NOT NULL"""
        cursor.execute(query1)
        result1 = cursor.fetchall()
        for row in result1:
            uid = row[0]
            job_id_list = jobseeker2job_n_closest_neighbors(uid, 9)
            for job_id in job_id_list:
                query2 = """INSERT INTO interest_job (user_id, job_id) VALUES (%s, %s)"""
                cursor.execute(query2, params=(uid, job_id))
                cursor.commit()
Example #22
def label_jobseekers(min_freq=1):
    init_user_bag_of_words_table()
    with MySQLWrapper() as db:
        sql = """select j.user_id, j.personal_summary
                    from jobseeker as j
                             inner join user u
                                        on j.user_id = u.user_id
                    where j.personal_summary is not null
                      and j.personal_summary <> 'none'"""
        result: List[Tuple[int, str]] = db.query_all(sql)

    for one_row in result:
        user_id, personal_summary = one_row
        terms = get_all_terms_in_doc_with_frequency(reg_exp, personal_summary,
                                                    min_freq)
        insert_one_bag_of_word_repr_of_user(user_id, terms)
Example #23
def job2job_n_closest_neighbors(job_id: int, n: int) -> List[int]:
    with MySQLWrapper() as db:
        sql = f"""select j.job_id, j.bag_of_words
                    from job_bag_of_words_repr j
                    where j.job_id = {job_id}"""
        result: Tuple[int, str] = db.query_one(sql)
    _, bag_of_words = result
    # bag_of_words is stored as one "term\tfrequency" pair per line (see
    # insert_one_bag_of_word_repr_of_job); expand it back into a
    # frequency-weighted term list.
    query_list = []
    for one_row in bag_of_words.split("\n"):
        word, fre = one_row.split("\t")
        query_list.extend([word] * int(fre))
    closest_neighbors = top_n_neighbors(n, query_list)
    print(closest_neighbors)
    return closest_neighbors
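Examples #23 and #27 share an ad-hoc text format for the bag_of_words columns: one term per line, tab-separated from its frequency. A small round-trip pair written here only to document that format (neither helper exists in the original code):

from typing import Dict

def serialize_bag(term_counts: Dict[str, int]) -> str:
    # Mirrors insert_one_bag_of_word_repr_of_job: "term\tcount" per line.
    return "\n".join(f"{t}\t{c}" for t, c in term_counts.items())

def deserialize_bag(bag_of_words: str) -> Dict[str, int]:
    # Mirrors the parsing in job2job_n_closest_neighbors.
    counts: Dict[str, int] = {}
    for line in bag_of_words.split("\n"):
        term, count = line.split("\t")
        counts[term] = int(count)
    return counts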
Example #24
def get_user_agents():
    init_user_agent_database()
    with MySQLWrapper() as db:
        url = 'https://developers.whatismybrowser.com/useragents/explore/software_name/chrome/?order_by=-times_seen'
        response = requests.get(url)
        parser = fromstring(response.text)
        query1 = "DELETE FROM user_agents where 1=1"
        db.execute(query1)
        db.commit()
        for i in parser.xpath('//tbody/tr')[:200]:
            agent = i.xpath('.//td[1]/a[1]/text()')[0]
            # os_name avoids shadowing the os module; bind both strings as
            # parameters since user-agent text may contain quotes.
            os_name = i.xpath('.//td[3]/text()')[0]
            query2 = """insert into user_agents values(%s, %s)"""
            db.execute(query2, params=(agent, os_name))
        db.commit()  # one commit for the whole batch
    print("=====user agents crawler finished=====")
Example #25
def init_table(category_list: List[str]) -> None:
    with MySQLWrapper() as db:
        sql = """CREATE TABLE IF NOT EXISTS job_categories
            (
              category VARCHAR(100) NOT NULL,
              crawled  BOOL DEFAULT FALSE,
              PRIMARY KEY (category)
            )"""
        db.execute(sql)

        for category in category_list:
            sql = """insert into job_categories(category) values(%s)"""
            try:
                db.execute(sql, params=(category,))
            except IntegrityError:
                # Category already present from an earlier run.
                continue
        db.commit()
Example #26
def try_proxy():
    with MySQLWrapper() as db:
        while True:
            tuple_pack = ip_queue.get()
            if tuple_pack is None:
                break
            ip, port, location, last_checked = tuple_pack
            proxy = ":".join([ip, port])
            try_url = "https://media.glassdoor.com/sqlm/1152051/cascade-management-squarelogo-1469741510397.png"
            content = get_picture_content(try_url, proxy, anonymous=True)
            if content is None:
                print(proxy, "failed")
            else:
                query2 = """insert into proxies values(%s, %s, %s, %s)"""
                db.execute(query2, params=(ip, port, location, last_checked))
                db.commit()
                print(proxy, "success", "let's add it to database")
Example #27
def insert_one_bag_of_word_repr_of_job(job_id: int, job_title: str,
                                       terms_generator: Generator[str, None,
                                                                  None]):
    term_frequency_repr = [f"{t}\t{c}" for t, c in terms_generator]
    term_list_str = "\n".join(term_frequency_repr)
    with MySQLWrapper() as db:
        sql = f"""insert into job_bag_of_words_repr(job_id, job_title, bag_of_words) 
        values(%s, %s ,%s) """
        vals = (job_id, job_title, term_list_str)
        try:
            db.insert_one(sql, vals)
        except IntegrityError as e:
            sql = f"""update job_bag_of_words_repr as jbwr
                                set jbwr.bag_of_words = %s
                                where jbwr.job_id = %s"""
            val = (term_list_str, job_id)
            db.execute(sql, params=val)
            db.commit()
Example #28
def init_user_agent_database():
    # Note: this also (re)creates the proxies table, duplicating the DDL
    # in init_proxy_database (Example #19).
    with MySQLWrapper() as db:
        sql = """CREATE TABLE IF NOT EXISTS proxies
                    (
                      ip VARCHAR(100),
                      port VARCHAR(10),
                      location VARCHAR(5),
                      last_checked VARCHAR(50)
                    );"""
        db.execute(sql)

        sql = """CREATE TABLE IF NOT EXISTS user_agents
                (
                  agent VARCHAR(300),
                  os VARCHAR(20)
                );"""
        db.execute(sql)
        db.commit()
Example #29
def get_a_proxy():
    with MySQLWrapper() as db:
        query0 = "SELECT count(*) FROM proxies"
        total = int(db.query_one(query0)[0])
        random_line = randint(0, total - 1)
        # Fetch ip and port from the same row in one query. The original
        # issued two separate LIMIT queries (which need not agree on row
        # order) and parsed values out of each tuple's string repr.
        query1 = f"""SELECT ip, port FROM proxies LIMIT {random_line}, 1;"""
        ip, port = db.query_one(query1)
        return ":".join([ip, port])
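get_a_proxy and get_a_useragent (Example #14) both sample a random row by counting and then offsetting with LIMIT. For tables of a few hundred rows, MySQL can do the sampling itself; a sketch of that variant against the same proxies table (this helper is not in the original code):

def get_a_proxy_single_query() -> str:
    # Hypothetical variant: ORDER BY RAND() samples one row server-side.
    # A full-table sort is acceptable at this scale and avoids any race
    # between the count query and the offset query.
    with MySQLWrapper() as db:
        ip, port = db.query_one("SELECT ip, port FROM proxies ORDER BY RAND() LIMIT 1")
        return ":".join([ip, port])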
Example #30
def store_behavior_job():
    with MySQLWrapper() as cursor:
        query = "DELETE FROM behavior_job"
        cursor.execute(query)
        cursor.commit()
        query1 = f"""SELECT user_id FROM jobseeker WHERE personal_summary IS NOT NULL"""
        cursor.execute(query1)
        result1 = cursor.fetchall()
        for row in result1:
            uid = row[0]
            behavior_job_list = get_list_by_history(uid)
            job_id_list = top_n_neighbors(6, behavior_job_list)
            similar_list = manyjobs2job_n_closest_neighbors(
                get_by_browse_time(20000, uid), 3)
            job_id_list.extend(similar_list)
            for job_id in job_id_list:
                query2 = f"""INSERT INTO behavior_job (user_id, job_id) VALUES ({uid}, {job_id})"""
                cursor.execute(query2)
                cursor.commit()