Code Example #1
File: main.py  Project: oreoTaste/python-job-scrapper
def search():
    # Flask view (excerpt): read the two region filters from the query string
    region1 = request.args.get("region1")
    region2 = request.args.get("region2")
    items = get_danggn_items(region1=region1, region2=region2)

    # Persist the scraped items, then render them alongside the CSV file name
    save_to_csv(items, filename="danggn.csv")
    return render_template("danggn.html", items=items, filename="danggn.csv")
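Every example on this page imports save_to_csv from the project's own save module, which is not shown here. A minimal sketch of what such a helper could look like, assuming the items are dicts with a uniform set of keys (the default file name and the csv.DictWriter approach are assumptions, not any project's actual code):

import csv


def save_to_csv(items, filename="jobs.csv"):
    # Hypothetical helper for illustration: write a list of dicts to a CSV file.
    if not items:
        return
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(items[0].keys()))
        writer.writeheader()
        writer.writerows(items)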
Code Example #2
def save_file():
    try:
        # Look up the previously scraped jobs for the requested keyword
        word = request.args.get("word")
        if not word:
            raise Exception()
        word = word.lower()
        jobs = db.get(word)
        if not jobs:
            raise Exception()
        # Write the jobs to jobs.csv, then stream the file back as a download
        save.save_to_csv(jobs)
        return send_file("jobs.csv", mimetype="text/csv", attachment_filename=f"{word}.csv", as_attachment=True)
    except Exception:
        # A missing keyword or missing data sends the user back to the start page
        return redirect("/")
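The attachment_filename parameter only works on older Flask releases; Flask 2.0 renamed it to download_name and later releases dropped the old name entirely. On a current Flask the return line would read roughly:

return send_file("jobs.csv", mimetype="text/csv", download_name=f"{word}.csv", as_attachment=True)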
Code Example #3
    # (excerpt) the --filename option added to the parser built by create_argparser()
    new_parser.add_argument(
        '-f',
        '--filename',
        type=str,
        default=FILE_DEFAULT,
        help=f"File to save to. Default name is {FILE_DEFAULT}",
    )
    return new_parser


if __name__ == '__main__':
    parser = create_argparser()
    args = parser.parse_args()

    file_path = os.path.abspath(args.filename)

    logger.info(f"Starting poller version {__version__}.")
    logger.info(f"Time between polls: {args.time} seconds")
    logger.info(f"Saving to file: {file_path}")

    while True:
        logger.info("Scraping data...")
        scraped_data = scrape.scrape_inara_cgs()
        ongoing_cgs = [
            data for data in scraped_data if data['status'] == 'Ongoing'
        ]
        logger.info("Saving data...")
        save.save_to_csv(file_path, ongoing_cgs)

        logger.info(f"Complete. Next poll in {args.time} seconds.")

        time.sleep(int(args.time))
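The excerpt above starts partway through create_argparser() and relies on module-level names that are not shown. A minimal sketch of the missing setup, assuming names consistent with the snippet (the --time option, the FILE_DEFAULT value, and the logging configuration are assumptions, not the project's actual code):

import argparse
import logging
import os
import time

import save    # project module providing save_to_csv (assumed)
import scrape  # project module providing scrape_inara_cgs (assumed)

__version__ = "0.0.0"                  # placeholder version for illustration
FILE_DEFAULT = "community_goals.csv"   # assumed default output file

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def create_argparser():
    new_parser = argparse.ArgumentParser(description="Inara community goal poller")
    new_parser.add_argument(
        '-t',
        '--time',
        type=int,
        default=3600,
        help="Seconds to wait between polls.",
    )
    # ...followed by the -f/--filename argument shown in the excerpt above
    return new_parser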
Code Example #4
File: main.py  Project: Polinavas95/vacancies_parser
from head_hunter import get_all_pages_hh, get_elements_from_hh
from so import get_all_pages_so, get_elements_from_so
from save import save_to_csv

# Collect all vacancies from every page of each source
hh_jobs = get_elements_from_hh(get_all_pages_hh())
so_jobs = get_elements_from_so(get_all_pages_so())

all_jobs = hh_jobs + so_jobs

save_to_csv(all_jobs)
Code Example #5
from base import extract_pages, extract_jobs
from save import save_to_csv

last_page = extract_pages()

jobs = extract_jobs(last_page)

save_to_csv(jobs)
Code Example #6
from webdev import get_posts
from save import save_to_csv

posts = get_posts()
save_to_csv(posts)
Code Example #7
File: main.py  Project: jvictorss/scrapingmp
import gspread
from indeed import search_ideed
from stackoverflow import search_so
from save import save_to_csv

# search term
search = 'python'

# collect the Indeed results
result_indeed = search_ideed(search)
# collect the Stack Overflow results
result_so = search_so(search)

# merge both result lists into all_results
all_results = result_indeed + result_so

# write everything to jobs.csv
save_to_csv(all_results)

# upload the data to the Google Sheets spreadsheet
spreadsheetId = '1uRDAuGudRxYx77JH4b1wnYNflaFo0cemk3bG3ocTh_A'
gc = gspread.service_account(filename='credentials.json')
sh = gc.open_by_key(spreadsheetId)
worksheet = sh.sheet1

csvFile = 'jobs.csv'
sheetName = 'CSV'

# import_csv replaces the spreadsheet's entire contents with the CSV data
with open(csvFile, 'r') as f:
    content = f.read()
gc.import_csv(spreadsheetId, content.encode('utf-8'))
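Because import_csv wipes and replaces the whole spreadsheet, the sh.sheet1 handle opened above is never actually used. If the intent were to append the rows to the existing first worksheet instead, a sketch along these lines would do it (the append_rows approach is an assumption about intent, not the project's code):

import csv

# hypothetical alternative: append the CSV rows to the existing first worksheet
with open('jobs.csv', newline='') as f:
    rows = list(csv.reader(f))
worksheet.append_rows(rows)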
Code Example #8
# (excerpt) ranklist is a BeautifulSoup tag holding the <li> ranking entries
links = ranklist.find_all('li')

# one dict per ranked company, collected into a list
totalRankList = []

for link in links[:-1]:
    rankDict = {}
    rankDict['rank'] = link.find("span", {"class": "num"}).contents[0]
    rankDict['name'] = link.find("a", {
        "class": "coLink"
    }).find("b").contents[0]
    rankDict['url'] = 'www.jobkorea.co.kr' + link.find("a", {
        "class": "link"
    }).attrs['href']
    rankDict['title'] = link.find("a", {
        "class": "link"
    }).find("span").contents[0]
    categoryLists = link.find("div", {"class": "sTit"}).find_all("span")
    categoryContents = []
    for categoryList in categoryLists:
        categoryContents.append(categoryList.contents[0])
    rankDict['category'] = categoryContents
    # rankDict['detail']

    rankDict['endDay'] = link.find("span", {"class": "day"}).contents[0]
    totalRankList.append(rankDict)

save_to_csv(totalRankList)
save_to_json(totalRankList)
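save_to_json, like save_to_csv, comes from the project's own helper module and is not shown. A minimal sketch assuming a list of dicts (the file name and formatting options are assumptions):

import json


def save_to_json(items, filename="rank.json"):
    # Hypothetical helper for illustration: dump a list of dicts to a JSON file.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)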
Code Example #9
def get_jobs(page, city, job_type):
    # (excerpt) options, city_code, config, rege and save_to_csv come from the
    # surrounding module of the project.
    Chrome_driver = webdriver.Chrome(options=options)
    c_code = city_code[city]
    for i in range(1, page + 1):
        try:
            print("Scraping data for page %s" % i)
            uri = '/%s/?query=%s&page=%s' % (c_code, job_type, i)
            Chrome_driver.get(config.url + uri)
            time.sleep(2)
            job_dict = {}
            # The first results page nests the job <ul> under div[3];
            # every later page uses div[2]. Everything else is identical.
            div = 3 if i == 1 else 2
            jobs = Chrome_driver.find_element_by_xpath(
                '//*[@id="main"]/div/div[%i]/ul' % div)
            jobs_list = jobs.find_elements_by_tag_name('li')
            for job in range(1, len(jobs_list) + 1):
                item = '//*[@id="main"]/div/div[%i]/ul/li[%i]' % (div, job)
                job_details = Chrome_driver.find_element_by_xpath(
                    item + '/div/div[1]/h3')
                job_details_uri = job_details.find_element_by_tag_name(
                    'a').get_attribute('href')
                job_details_name = job_details.find_element_by_xpath(
                    item + '/div/div[1]/h3/a/div[1]').text
                job_details_salary = job_details.find_element_by_xpath(
                    item + '/div/div[1]/h3/a/span').text

                job_company = Chrome_driver.find_element_by_xpath(
                    item + '/div/div[2]/div/h3').text
                details = Chrome_driver.find_element_by_xpath(
                    item + '/div/div[1]/p').get_attribute('outerHTML')
                job_rege = re.match(rege, details)
                job_dict['company_name'] = job_company
                job_dict['uri'] = job_details_uri
                job_dict['salary'] = job_details_salary
                try:
                    job_dict['site'] = job_rege.group(1)
                    job_dict['year'] = job_rege.group(2)
                    job_dict['edu'] = job_rege.group(3)
                except AttributeError:
                    # Skip listings whose description does not match the regex
                    continue
                job_dict['job_name'] = job_details_name
                job_dict['city'] = city
                job_dict['job_type'] = job_type

                # save data
                save_to_csv(job_dict, city)
                time.sleep(1)
                print(job_dict)
        except Exception:
            raise
    Chrome_driver.close()
    time.sleep(3)
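The find_element_by_xpath and find_elements_by_tag_name helpers used above belong to the Selenium 3 API and were removed in Selenium 4. On a current Selenium the same lookups go through By locators, roughly:

from selenium.webdriver.common.by import By

# Selenium 4 equivalents of the legacy element lookups used above
jobs = Chrome_driver.find_element(By.XPATH, '//*[@id="main"]/div/div[%i]/ul' % div)
jobs_list = jobs.find_elements(By.TAG_NAME, 'li')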