def parse_category(url, category):
    """Scrape freelancehunt.com job rows from *url* and store unseen ones.

    url -- listing page for one category
    category -- label saved with each Job row
    """
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'html.parser')
    for job in soup.findAll('tr'):
        link = job.find('a')
        # Header/filler <tr> rows have no link or no publication stamp; the
        # original crashed (AttributeError/KeyError) on them — skip instead.
        if link is None or 'data-published' not in job.attrs:
            continue
        job_url = 'https://freelancehunt.com' + link.attrs['href']
        title = link.text
        # The tooltip text may be absent on some links; store None then.
        text = link.attrs.get('title')
        date = job.attrs['data-published']
        try:
            price = job.find('div', class_='price').text.split('\n')[1]
        except AttributeError:
            # No price block on the row: unpriced job.
            price = None
        print('\nDate:', date,
              '\nTitle:', title,
              '\nText:', text,
              '\nPrice:', price,
              '\nURL:', job_url, '\n\n')
        if not job_exist(job_url):
            job_row = Job(title=title, date=date, price=price, url=job_url,
                          category=category, parse_date=datetime.now())
            session.add(job_row)
            session.commit()
def parse_category(url, category):
    """Scrape freelance.com search results from *url* and store unseen jobs.

    url -- listing page for one category
    category -- label saved with each Job row
    """
    # NOTE(review): ported from Python 2 (urllib2 / print statements /
    # unicode) to Python 3 to match the other parsers in this file.
    from urllib.request import urlopen
    page = urlopen(url, context=ctx)  # ctx: module-level SSL context
    soup = BeautifulSoup(page, 'html.parser')
    for job in soup.findAll('div', {'class': 'jobsearch-result-list'}):
        # The title link doubles as the source of the job URL.
        link = job.find('a', {'style': 'color: #000;'})
        title = link.text
        print(title)
        date_raw = job.find('div',
                            {'class': 'col-xs-6 col-md-2 col-lg-2 lefttop'})
        date = date_raw.find('b').text.split()[0]
        print(date)
        price = job.find('div', {
            'class': 'col-xs-6 col-md-2 col-lg-2 leftbottom'
        }).text.split()[2]
        print(price)
        job_url = 'http://www.freelance.com' + link.get('href')
        print(job_url)
        if not job_exist(job_url):
            job_row = Job(title=str(title), date=str(date), price=price,
                          url=job_url, category=category,
                          parse_date=datetime.now())
            session.add(job_row)
            session.commit()
def parse_category(url, category):
    """Scrape freelansim.ru tasks from *url*; store unseen jobs with a
    description snippet fetched from each task's own page.

    url -- listing page for one category
    category -- label saved with each Job row
    """
    # NOTE(review): ported from Python 2 (urllib2 / print statements /
    # unicode) to Python 3 to match the other parsers in this file.
    from urllib.request import urlopen
    page = urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')
    for job in soup.findAll('article', {'class': 'task task_list'}):
        title = job.find('div', 'task__title').text
        print('Title:\t', title)
        job_url = ('http://freelansim.ru'
                   + job.find('div', 'task__title').find('a').get('href'))
        print('Url:\t', job_url)
        if not job_exist(job_url):
            # First line of the published-at span is the date.
            date = str(job.find('span',
                                'params__published-at').text.splitlines()[0])
            print('Date:\t', date)
            price_raw = job.find('div', 'task__price')
            price = price_raw.find('span', 'count')
            if price:
                price = price.text
            else:
                # Fixed price absent: fall back to the "negotiated" marker.
                price = price_raw.find('span', 'negotiated_price').text
            print('Price:\t', price)
            # Fetch the task page for a short description (320 chars max).
            text_page = urlopen(job_url)
            text_soup = BeautifulSoup(text_page, 'html.parser')
            text = text_soup.find('div', {'class': 'task__description'}).text
            text_length = 320
            if len(text) > text_length:
                text = text[:text_length] + '..'
            print(text, '\n')
            job_row = Job(title=str(title), date=str(date), price=price,
                          url=job_url, category=category,
                          parse_date=datetime.now(), description=text)
            session.add(job_row)
            session.commit()
def parse_category(url, category):
    """Scrape freelance.com results from *url*; store unseen jobs with a
    description snippet fetched from each job's own page.

    url -- listing page for one category
    category -- label saved with each Job row
    """
    # NOTE(review): ported from Python 2 (urllib2 / print statements /
    # unicode) to Python 3 to match the other parsers in this file.
    from urllib.request import urlopen
    page = urlopen(url, context=ctx)  # ctx: module-level SSL context
    soup = BeautifulSoup(page, 'html.parser')
    for job in soup.findAll('div', {'class': 'jobsearch-result-list'}):
        # The title link doubles as the source of the job URL.
        link = job.find('a', {'style': 'color: #000;'})
        title = link.text
        print(title)
        job_url = 'http://www.freelance.com' + link.get('href')
        print(job_url)
        if not job_exist(job_url):
            date_raw = job.find(
                'div', {'class': 'col-xs-6 col-md-2 col-lg-2 lefttop'})
            date = date_raw.find('b').text.split()[0]
            print(date)
            price = job.find('div', {
                'class': 'col-xs-6 col-md-2 col-lg-2 leftbottom'
            }).text.split()[2]
            print(price)
            # Fetch the job page for a short description (320 chars max);
            # [11:] drops the leading boilerplate of the description div.
            text_page = urlopen(job_url, context=ctx)
            text_soup = BeautifulSoup(text_page, 'html.parser')
            text = text_soup.find('div', {
                'class': 'col-md-9 col-lg-9 description'
            }).text[11:]
            text_length = 320
            if len(text) > text_length:
                text = text[:text_length] + '..'
            print(text)
            print('=========\n\n')
            job_row = Job(title=str(title), date=str(date), price=price,
                          url=job_url, category=category,
                          parse_date=datetime.now(), description=text)
            session.add(job_row)
            session.commit()
def parse_category(url, category):
    """Scrape weblancer.net project rows from *url* and store unseen ones.

    url -- listing page for one category
    category -- label saved with each Job row
    """
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')
    for job in soup.findAll('div', {'class': 'row'}):
        title_raw = job.find('div', class_='col-sm-10')
        # Non-project rows lack the title column/heading/link; the original
        # hid this (and every other bug) behind a bare `except: pass` —
        # skip only the rows that genuinely aren't listings.
        try:
            title = title_raw.find('h2').text
            job_url = ('https://www.weblancer.net'
                       + title_raw.find('a').attrs['href'])
        except (AttributeError, KeyError):
            continue
        if job_exist(job_url):
            print(title)
            continue
        text = title_raw.find('p').text
        try:
            date = job.find('span',
                            class_='time_ago').attrs['data-timestamp']
        except AttributeError:
            # No timestamp span on the row.  (The original fallback used
            # `.attr` — a typo that always raised and silently dropped
            # the job.)
            date = None
        print(date)
        try:
            price = job.find('div', class_='amount').text
        except AttributeError:
            price = None
        print('\nDate:', date,
              '\nTitle:', title,
              '\nText:', text,
              '\nPrice:', price,
              '\nURL:', job_url)
        job_row = Job(title=title, date=date, price=price, url=job_url,
                      category=category, parse_date=datetime.now(),
                      description=text)
        session.add(job_row)
        session.commit()
def parse_category(url, category):
    """Scrape freelance.habr.com tasks from *url*; store unseen jobs with a
    description snippet fetched from each task's own page.

    url -- listing page for one category
    category -- label saved with each Job row
    """
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'html.parser')
    for job in soup.findAll('article', {'class': 'task task_list'}):
        title = job.find('div', 'task__title').text
        job_url = ('http://freelance.habr.com'
                   + job.find('div', 'task__title').find('a').get('href'))
        if job_exist(job_url):
            continue
        # First line of the published-at span is the date.
        date = str(job.find('span',
                            'params__published-at').text.splitlines()[0])
        price_raw = job.find('div', 'task__price')
        # Fixed-price tasks carry a "count" span; otherwise the price is
        # negotiable.  Explicit None-check replaces the bare `except:`.
        price_tag = price_raw.find('span', 'count')
        if price_tag is None:
            price_tag = price_raw.find('span', 'negotiated_price')
        price = price_tag.text
        # Fetch the task page for a short description (320 chars max).
        text_page = requests.get(job_url).content
        text_soup = BeautifulSoup(text_page, 'html.parser')
        text = text_soup.find('div', {'class': 'task__description'}).text
        text_length = 320
        if len(text) > text_length:
            text = text[:text_length] + '..'
        job_row = Job(title=title, date=date, price=price, url=job_url,
                      category=category, parse_date=datetime.now(),
                      description=text)
        session.add(job_row)
        session.commit()
def parse_category(url, category):
    """Scrape freelancehunt.com listing rows from *url* and store unseen jobs.

    url -- listing page for one category
    category -- label saved with each Job row
    """
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'html.parser')
    for job in soup.find_all('tr'):
        a = job.find('a')
        # Header/filler <tr> rows have no link or no publication stamp; the
        # original crashed (AttributeError/KeyError) on them — skip instead.
        if a is None or 'data-published' not in job.attrs:
            continue
        title = a.text.strip()
        job_url = 'https://freelancehunt.com' + a.attrs['href']
        if job_exist(job_url):
            continue
        text = job.find('p',
                        {'style': 'word-break: break-word'}).text.strip()
        date = int(job.attrs['data-published'])
        try:
            price = job.find(
                'div', class_='text-green price with-tooltip').text.strip()
        except AttributeError:
            # No price block on the row: unpriced job.
            price = None
        job_row = Job(title=title, date=date, price=price, url=job_url,
                      category=category, parse_date=datetime.now(),
                      description=text)
        session.add(job_row)
        session.commit()
def parse_category(url, category):
    """Scrape freelansim.ru tasks from *url* and store unseen jobs
    (no description is fetched in this variant).

    url -- listing page for one category
    category -- label saved with each Job row
    """
    # NOTE(review): ported from Python 2 (urllib2 / print statements /
    # unicode) to Python 3 to match the other parsers in this file.
    from urllib.request import urlopen
    page = urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')
    for job in soup.findAll('article', {'class': 'task task_list'}):
        title = job.find('div', 'task__title').text
        print('Title:\t', title)
        job_url = ('http://freelansim.ru'
                   + job.find('div', 'task__title').find('a').get('href'))
        print('Url:\t', job_url)
        # First two lines of the published-at span form the date.
        lines = job.find('span', 'params__published-at').text.splitlines()
        date = str(lines[0] + ' ' + lines[1])
        print('Date:\t', date)
        price_raw = job.find('div', 'task__price')
        price = price_raw.find('span', 'count')
        if price:
            price = price.text
        else:
            # Fixed price absent: fall back to the "negotiated" marker.
            price = price_raw.find('span', 'negotiated_price').text
        print('Price:\t', price, '\n')
        if not job_exist(job_url):
            job_row = Job(title=str(title), date=str(date), price=price,
                          url=job_url, category=category,
                          parse_date=datetime.now())
            session.add(job_row)
            session.commit()
def parse_category(url, category):
    """Scrape weblancer.net listing rows from *url* and store unseen
    open projects.

    url -- listing page for one category
    category -- label saved with each Job row
    """
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'html.parser')
    for job in soup.find_all('div',
                             class_='row click_container-link set_href'):
        right = job.find('div', class_='col-sm-4 text-sm-right').find('span')
        # Skip closed projects; rows without the status span fall through.
        # Explicit None-check replaces the original bare `except: pass`.
        if right is not None and right.text.startswith('Закрыт'):
            continue
        a = job.find('div', class_='title').find('a')
        title = a.text.strip()
        job_url = 'https://www.weblancer.net' + a.attrs['href']
        if job_exist(job_url):
            continue
        try:
            date = int(
                right.find('span', class_='time_ago').attrs['data-timestamp'])
        except (AttributeError, KeyError, ValueError):
            # Missing status block / timestamp span, or a non-numeric stamp.
            date = ''
        try:
            # Expanded description; drop the trailing "collapse" word.
            text = ' '.join(
                job.find('div', class_='collapse').text.strip().split(' ')[:-1])
        except AttributeError:
            text = job.find('div',
                            class_='text_field text-inline').text.strip()
        text = text.replace('\n', ' ')
        try:
            price = job.find(
                'div',
                class_='float-right float-sm-none title amount indent-xs-b0'
            ).find('span').text.strip()
        except AttributeError:
            price = None
        job_row = Job(title=title, date=date, price=price, url=job_url,
                      category=category, parse_date=datetime.now(),
                      description=text)
        session.add(job_row)
        session.commit()