# Shared imports for the snippets below; the Flask / SQLAlchemy endpoints
# (prepare_request_fields, fetch_requests) additionally rely on their
# application's own modules (app, db, models, helpers, anyjson).
import json
import os
import re
from datetime import datetime
from urllib.parse import urlsplit

import bs4
import nltk
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager


def crawlLinks(links):
    rows = []  # collect dicts and build the DataFrame once at the end
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                meta = page.select('.head')[0]
                headline = meta.h1.text.strip()
                # e.g. "Вижте 50-те най-четени мнения в сайта ни за годината"
                # ("See the 50 most-read opinions on our site this year")
                if headline == '':
                    continue
                info = clean_text(meta.select('.article-date')[0].text.split('(')[0]) \
                    if len(meta.select('.article-date')) > 0 else ''
                # e.g. "30.12.2019 10:33"
                articleDate = info.split(';')[0] if info != '' else ''
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%d.%m.%Y %H:%M')
                author = info.split(';')[1] if ';' in info else None
                views = requests.get(
                    'https://www.24chasa.bg/Article/{id}/4'.format(
                        id=re.search(r'(\d+)$', link).group(1))).text
                article_text = ' '.join([
                    clean_text(par.text)
                    for par in page.select('.content')[0].select('p')
                ]).split('Tweet')[0] if len(page.select('.content')) > 0 else ''
                # shares would need selenium:
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'author': clean_text(author),
                    'date': articleDate,
                    # 'shares': shares,
                    'views': clean_text(views),
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
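# The crawlers in this file all call `clean_text` and `replace_month_with_digit`,
# which are not defined in this section. The sketches below are assumptions
# about their behavior, inferred from the call sites (whitespace normalization,
# and mapping Bulgarian month names to digits for pd.to_datetime); they are not
# the original implementations.

BG_MONTHS = {  # assumed mapping: Bulgarian month name -> zero-padded digit
    'януари': '01', 'февруари': '02', 'март': '03', 'април': '04',
    'май': '05', 'юни': '06', 'юли': '07', 'август': '08',
    'септември': '09', 'октомври': '10', 'ноември': '11', 'декември': '12'
}


def replace_month_with_digit(month_name):
    # lower-case so "Декември" matches "декември"; abbreviated forms like
    # "дек" are matched by prefix
    name = month_name.lower()
    for full, digit in BG_MONTHS.items():
        if full.startswith(name):
            return digit
    return month_name  # leave unknown tokens untouched


def clean_text(text):
    # collapse whitespace runs and strip; tolerate None (some callers pass it)
    if text is None:
        return None
    return re.sub(r'\s+', ' ', str(text)).strip()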
def prepare_request_fields(results):
    # List comprehensions rather than lazy map objects, so the result
    # serializes directly to JSON under Python 3.
    if current_user.is_anonymous():
        return [{
            "id": r.id,
            "text": helpers.clean_text(r.text),
            "date_received": helpers.format_datetime(
                r.date_received or r.date_created, '%b %d, %Y at %-I:%M %p'),
            "department": r.department_name(),
            "status": r.status,
            # The following two attributes are defined as model methods,
            # and not regular SQLAlchemy attributes.
            "contact_name": r.point_person_name(),
            "solid_status": r.solid_status()
        } for r in results]
    else:
        return [{
            "id": r.id,
            "text": helpers.clean_text(r.text),
            "date_received": helpers.date(r.date_received or r.date_created),
            "department": r.department_name(),
            "requester": r.requester_name(),
            "due_date": format_date(r.due_date),
            "status": r.status,
            # The following two attributes are defined as model methods,
            # and not regular SQLAlchemy attributes.
            "contact_name": r.point_person_name(),
            "solid_status": r.solid_status()
        } for r in results]
def gather_new_articles(site):
    request = requests.get(site)
    soup = bs4.BeautifulSoup(request.text, features="html.parser")
    all_articles = list(
        set(soup.findAll('a', attrs={'href': re.compile(r'^/.*\.html$')})))
    rows = []
    for a in all_articles:
        try:
            title = a['title']
            link = site + a['href']
            comments = a.find('span', attrs={'class': 'cmc'}).text \
                if a.find('span', attrs={'class': 'cmc'}) else ''
            views = a.find('span', attrs={'class': 'cmv'}).text \
                if a.find('span', attrs={'class': 'cmv'}) else ''
            date = a.find('span', attrs={'class': 'cmd'}).text \
                if a.find('span', attrs={'class': 'cmd'}) else ''
            desc = a.find('span', attrs={'class': 'short-desc'}).text \
                if a.find('span', attrs={'class': 'short-desc'}) else ''
            rows.append({
                'link': link,
                'title': clean_text(title),
                'comments': clean_text(comments),
                'views': clean_text(views),
                'category': re.search(r'frognews\.bg//(\w+)', link).group(1)
                            if re.search(r'frognews\.bg//(\w+)', link) else '',
                'date': clean_text(date),
                'subtitle': clean_text(desc)
            })
        except Exception:
            continue
    return pd.DataFrame(rows)
def get_desc(page):
    # The description sits between a fixed Bulgarian marker
    # ("Допълнителна информация:" = "Additional information:") and the next tag.
    desc_start_phrase = '<b>Допълнителна информация:</b><br/>'
    desc_end_phrase = '<'
    desc_start_ind = str(page).find(desc_start_phrase) + len(desc_start_phrase)
    desc_end_ind = str(page).find(desc_end_phrase, desc_start_ind)
    # find() returns -1 when the marker is missing, which leaves a small start
    # index; the > 100 guard treats that as "not found"
    desc = str(page)[desc_start_ind:desc_end_ind] if desc_start_ind > 100 else ''
    return clean_text(desc)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                headline = page.select('.content')[0].h1.text.strip()
                meta = clean_text(page.select('.article-tools')[0].text) \
                    if len(page.select('.article-tools')) > 0 else ''
                # e.g. "14:41, 30 дек 19"
                articleDate = re.search('(.*),', meta).group(1) \
                    if re.search('(.*),', meta) is not None else ''
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%H:%M, %d %m %y')
                views = re.search(r'(\d+)$', meta).group(1) \
                    if re.search(r'(\d+)$', meta) is not None else ''
                comments = page.select('.comments')[0].text.strip() \
                    if len(page.select('.comments')) > 0 else ''
                article_body = page.select('.article-content')[0].select('p') \
                    if len(page.select('.article-content')) > 0 else ''
                if article_body != '':
                    author = article_body[0].text
                    article_text = ' '.join([clean_text(par.text)
                                             for par in article_body[1:]
                                             if '<' not in par.text])
                    # drop anything up to a stray '}' (leftover inline CSS/JS);
                    # find() returns -1 when absent, so this is a no-op then
                    article_text = article_text[article_text.find('}') + 1:].strip()
                else:
                    article_text = ''
                    author = ''
                tags = ' - '.join([clean_text(tag.text)
                                   for tag in page.select('.tags')[0].select('li')
                                   if tag != ',' and tag != "\n"]) \
                    if len(page.select('.tags')) > 0 else None
                tags = clean_text(tags)
                rows.append({'link': link,
                             'title': clean_text(headline),
                             'author': clean_text(author),
                             'date': articleDate,
                             'category': category,
                             'comments': clean_text(comments),
                             'views': clean_text(views),
                             'tags': tags,
                             'article_text': article_text})
        except Exception:
            continue
    return pd.DataFrame(rows)
def __iter__(self):
    for fname in self.fnames:
        with open('text/hemingway/{}'.format(fname), 'rb') as f:
            # use 'rb' to avoid a decode attempt on read, which would stop
            # at unrecognizable characters
            raw = f.read().decode('utf-8', 'ignore')  # immediately decode to string
            word_count = 0
            for sentence in raw.split('.'):  # split on periods between sentences
                word_count += len(sentence.split())
                words = nltk.word_tokenize(clean_text(sentence))
                yield words  # yield each sentence as a list of words
            print('TRAINING ON A CORPUS OF {} WORDS'.format(word_count))
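# The iterator above yields one token list per sentence, which is the shape
# gensim's Word2Vec expects (a re-iterable stream of token lists). A minimal
# usage sketch; `HemingwayCorpus` and its constructor argument are assumed
# names for the class this __iter__ belongs to, not confirmed by the source:
from gensim.models import Word2Vec

corpus = HemingwayCorpus(fnames=['old_man_and_the_sea.txt'])  # hypothetical file name
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=2)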
def crawlLinks(links):
    rows = []
    for link, section in tqdm(list(links)):
        try:
            rq = requests.get(link)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                articleTitle = page.select('h1')[0].text \
                    if len(page.select('h1')) > 0 else ''
                articleSubtitle = page.select('h2.subtitle')[0].text \
                    if len(page.select('h2.subtitle')) > 0 else ''
                # strip the ", oбновена ..." ("updated ...") suffix from the timestamp
                articleDate = page.select('.article-time')[0].text.split(', oбновена')[0] \
                    if len(page.select('.article-time')) > 0 else ''
                articleDate = clean_text(articleDate)
                month_name = re.search(r'([а-яА-Я]+)', articleDate)
                if month_name is not None:
                    month_name = month_name.group(1)
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name))
                    articleDate = pd.to_datetime(articleDate, format='%d %m %Y, %H:%M')
                category = page.select('div.article-category')[0].a.text \
                    if len(page.select('div.article-category')) > 0 else ''
                comments = page.select('.commentsButtonNumber')[0].text \
                    if len(page.select('.commentsButtonNumber')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text)
                    for par in page.select('.article-text')[0].select('p')
                ])
                # article-tags; guard on the list being non-empty (select()
                # never returns None, so the original None check could not fire)
                tags = page.select('.article-tags')
                tags = ' - '.join([clean_text(tag.text) for tag in tags[0].select('a')]) \
                    if len(tags) > 0 else None
                rows.append({
                    'link': link,
                    'section': section,
                    'comments': clean_text(comments),
                    'title': clean_text(articleTitle),
                    'subtitle': clean_text(articleSubtitle),
                    'date': articleDate,
                    'category': category,
                    'tags': tags,
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
data = []
if YOUTUBE_CHANNEL_ID:
    data = get_comments(YOUTUBE_CHANNEL_ID)
if TWITTER_HASHTAG:
    data.extend(get_tweets(TWITTER_HASHTAG, 'hashtag_search'))
if TWITTER_REPLIES:
    data.extend(get_tweets('to:' + TWITTER_REPLIES, 'replies'))
# else:
#     sys.exit('Provide exactly one search parameter')

comment_type, links, posts, sentiment, magnitude, date, likes, shares, \
    author_follower_count, author_country, topic = ([] for _ in range(11))

for index, tweet in enumerate(data):
    res = analyze_post(clean_text(tweet['text']))
    comment_type.append(tweet['type'])
    shares.append(tweet['shares'])
    posts.append(tweet['text'])
    date.append(tweet['created_at'])
    likes.append(tweet['likes'])
    links.append(tweet['link'])
    author_country.append(tweet['country'])
    author_follower_count.append(tweet['author_follower_count'])
    try:
        sentiment.append(round(res[0], 2))
        magnitude.append(round(res[1], 2))
        topic.append(res[2])
        print(index)
    except Exception as e:
        print('append sentiment to list', e)
def crawlLinks(links):
    rows = []
    for link in tqdm(list(links)):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                # find() takes the class via attrs=; passing a bare dict as the
                # tag name never matches
                if page.find(attrs={'class': 'article-post'}):
                    body = page.select('.article-post')[0]
                    headline = body.select('h1')[0].text if len(body.select('h1')) else ''
                    subtitle = None
                    # metadata
                    location = body.select('.location')[0].text if len(body.select('.location')) else ''
                    articleDate = body.select('.fa-calendar')[0].text if len(body.select('.fa-calendar')) else ''
                    views = body.select('.fa-eye')[0].text if len(body.select('.fa-eye')) else ''
                    comments = body.select('.fa-comments-o')[0].text if len(body.select('.fa-comments-o')) else ''
                    comments = comments.split(" ")[0] if comments != '' else ''
                    tags = ' - '.join([tag.a.text for tag in body.select('.tags')[0].select('li')]) \
                        if len(body.select('.tags')) > 0 else ''
                else:
                    headline = page.select('.post-title')[0].text if len(page.select('.post-title')) else ''
                    subtitle = page.select('.post-subtitle')[0].text if len(page.select('.post-subtitle')) else ''
                    # metadata
                    simpleShare = page.select('.simple-share')[0] if len(page.select('.simple-share')) > 0 else ''
                    li = simpleShare.find_all('li')
                    location = li[0].text if len(li) > 0 else ''
                    articleDate = li[1].text if len(li) > 1 else ''
                    views = li[2].text if len(li) > 2 else ''
                    views = views.split(" ")[0] if views != '' else ''
                    comments = li[3].text if len(li) > 3 else ''
                    comments = comments.split(" ")[0] if comments != '' else ''
                    tags = ' - '.join([tag.a.text for tag in page.select('.tags-widget')[0].select('li')[1:]]) \
                        if len(page.select('.tags-widget')) > 0 else ''
                # e.g. "30 Дек. 2019, 16:13"
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    if month_name is not None:
                        month_name = month_name.group(1)
                        # also drop the abbreviation dot so the format below matches
                        articleDate = articleDate.replace(
                            month_name, replace_month_with_digit(month_name)).replace('.', '')
                    articleDate = pd.to_datetime(articleDate, format='%d %m %Y, %H:%M')
                article_text = clean_text(page.select('.post-content')[0].select('div')[2].text) \
                    if len(page.select('.post-content')) > 0 else ''
                rows.append({'link': link,
                             'title': clean_text(headline),
                             'subtitle': clean_text(subtitle),
                             'location': clean_text(location),
                             'comments': clean_text(comments),
                             'date': articleDate,
                             'views': clean_text(views),
                             'category': category,
                             'tags': clean_text(tags),
                             'article_text': article_text})
        except Exception:
            continue
    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                category = page.find('div', attrs={'class': 'printing_large_text_toolbar'}).text \
                    if page.find('div', attrs={'class': 'printing_large_text_toolbar'}) is not None else ''
                headline = page.select('#news_heading')[0].h1.text.strip() \
                    if len(page.select('#news_heading')) > 0 else ''
                shares = page.select('.social_count')[0].text.strip() \
                    if len(page.select('.social_count')) > 0 else ''
                comments = page.select('.comments')[0].text.strip() \
                    if len(page.select('.comments')) else ''
                views = page.select('.btn_reads')[0].text.split('Прочетена')[1].strip() \
                    if len(page.select('.btn_reads')) > 0 else ''
                article_text = clean_text(page.select('#news_content')[0].text) \
                    if len(page.select('#news_content')) > 0 else ''
                # e.g. "01 януари 2020 | 16:26 - Обновена"
                # ("01 January 2020 | 16:26 - Updated")
                articleDate = page.find('td', attrs={'id': 'news_heading'})
                articleDate = articleDate.find('span', attrs={'class': 'dark_text'}).text \
                    if articleDate is not None \
                    and articleDate.find('span', attrs={'class': 'dark_text'}) is not None else ''
                articleDate = articleDate.split('- Обновена')[0].strip() if articleDate != '' else ''
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%d %m %Y | %H:%M')
                author = page.select('#author_box')[0].select('h5')[0].a.text \
                    if len(page.select('#author_box')) > 0 else ''
                tags = " - ".join([
                    clean_text(i.text)
                    for i in page.find('div', attrs={'class': 'news_tags'}).findAll('span')
                ])
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'author': clean_text(author),
                    'date': articleDate,
                    'category': clean_text(category),
                    'tags': tags,
                    'comments': clean_text(comments),
                    'views': clean_text(views),
                    'shares': clean_text(shares),
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                info = page.select('#news_details')[0] \
                    if len(page.select('#news_details')) > 0 else ''
                headline = info.h1.text.strip()
                subtitle = info.h2.text.strip()
                meta = info.select('.info')[0].select('div')
                # e.g. "Публикувана: 30 Декември, 2019 15:26"
                # ("Published: 30 December, 2019 15:26")
                articleDate = meta[0].text.split('Публикувана:')[1].strip()
                month_name = re.search(r'([а-яА-Я]+)', articleDate)
                month_name = month_name.group(1) if month_name is not None else None
                articleDate = articleDate.replace(
                    month_name, replace_month_with_digit(month_name)
                ) if month_name is not None else articleDate
                articleDate = pd.to_datetime(articleDate, format='%d %m, %Y %H:%M')
                # the second .info div holds "<comments> ... <views>"; the
                # original len(meta) > 0 guard was off by one
                meta = meta[1].text.strip() if len(meta) > 1 else ''
                comments = views = ''  # stay defined even when meta is empty
                if meta != '':
                    comments = re.search(r'(^\d+)', meta).group(1)
                    views = re.search(r'(\d+)$', meta).group(1)
                author = page.select('.linksProfile')[0].text \
                    if len(page.select('.linksProfile')) > 0 else ''
                article_body = page.select('#news_content')[0].select('p') \
                    if len(page.select('#news_content')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text) for par in article_body if '<' not in par.text
                ]) if article_body != '' else ''
                tags = ' - '.join([clean_text(tag.text)
                                   for tag in page.select('.tags')[0].select('a')
                                   if tag != ',' and tag != "\n"]) \
                    if len(page.select('.tags')) > 0 else None
                # shares would need selenium:
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'subtitle': clean_text(subtitle),
                    'author': clean_text(author),
                    'date': articleDate,
                    'category': category,
                    'comments': clean_text(comments),
                    # 'shares': shares,
                    'views': clean_text(views),
                    'tags': tags,
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
def get_all_offers(search_pages):
    rows = []
    options = Options()
    options.headless = True
    options.add_argument('log-level=3')
    browser = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    for p in tqdm(search_pages):
        browser.get(p)
        page = bs4.BeautifulSoup(browser.page_source, features='html.parser')
        boxes = page.find_all('div', attrs={'class': 'items'})[0].findAll('item')
        for b in boxes:
            try:
                link = b.find_all('text')[0].find_all('div', attrs={'class': 'title'})[0].a['href']
                title = b.find_all('text')[0].find_all('div', attrs={'class': 'title'})[0]
                data = b.find_all('text')[0].find_all('div', attrs={'class': 'data'})[0].text
                info = b.find_all('text')[0].find_all('div', attrs={'class': 'info'})[0]
                id = re.search('adv=(.*)$', link).group(1)
                # strip the "град София," ("city of Sofia,") prefix before parsing
                place, labels = get_place_and_labels(
                    clean_text(title.a.text.replace('град София,', '')))
                area = re.search(r'(^[^А-Яа-я\.]*)', data.split(',')[1].replace(' ', '')).group(1) \
                    if len(data.split(',')) > 1 \
                    and re.search(r'(^[^А-Яа-я\.]*)', data.split(',')[1].replace(' ', '')) else '0'
                price = clean_text(title.find_all('span')[0].text)
                price_orig = price
                price = re.search(r'([\d\s]+)', price).group(1).replace(' ', '') \
                    if re.search(r'([\d\s]+)', price) else '0'
                currency = ''  # stays defined when no currency marker is present
                if 'Цена при запитване' in price_orig:  # "price on request"
                    price = '0'
                elif 'eur' in price_orig.lower():
                    currency = 'EUR'
                elif 'лв' in price_orig.lower():
                    # convert leva to euro at the pegged rate of 1.9558
                    price = str(round(float(price) / 1.9558)) if price != '0' else '0'
                    currency = 'BGN'
                if 'на кв.м' in price_orig:  # price quoted per square metre
                    price = round(float(price) * float(area), 0)
                typ = clean_text(data.split(',')[0])
                agency = clean_text(info.a['href']) if len(info.find_all('a')) > 0 else ''
                rows.append({
                    'link': sale_url + link,
                    'id': id,
                    'type': typ,
                    'place': place,
                    'price': price,
                    'area': area,
                    'labels': labels,
                    'description': clean_text(info.text),
                    'currency': currency,
                    'agency': agency
                })
            except Exception as e:
                print(e)
    return pd.DataFrame(rows)
def crawlLinks():
    rows = []
    for city in cities:
        resp = requests.get(search_url.format(city, str(1)))
        page = bs4.BeautifulSoup(resp.text, features='html.parser')
        page_count = get_page_count(page)
        for page_n in tqdm(range(1, page_count + 1)):
            resp = requests.get(search_url.format(city, str(page_n)))
            page = bs4.BeautifulSoup(resp.text, features='html.parser')
            boxes = page.findAll('div', attrs={'class': 'list-item-container'})
            for b in boxes:
                try:
                    link = b.find('a', attrs={'class': 'list-item-link'})['href']
                    # e.g. rental-apartment/espoo/suurpelto/block+of+flats/722129?entryPoint=fromSearch&rentalIndex=1
                    id = re.search(r'([\d]+?)\?', link).group(1) \
                        if re.search(r'([\d]+?)\?', link) is not None else ''
                    available_from = clean_text(
                        b.find('span', attrs={'class': 'showing-lease-container'}).li.text) \
                        if len(b.find('span', attrs={'class': 'showing-lease-container'}).findAll('li')) > 0 else ''
                    address = clean_text(b.find('span', attrs={'class': 'address'}).text) \
                        if len(b.findAll('span', attrs={'class': 'address'})) > 0 else ''
                    meta = b.find('ul', attrs={'class': 'list-unstyled'})
                    price = clean_text(meta.find('span', attrs={'class': 'price'}).text) \
                        if len(b.findAll('span', attrs={'class': 'price'})) > 0 else '0'
                    # "1 234,56 €/kk" -> "1234" (kk = per month)
                    price = re.search(r'([\d ]+)(?:[\d,]+)? €\/kk$', price).group(1).replace(' ', '') \
                        if re.search(r'([\d ]+)(?:[\d,]+)? €\/kk$', price) is not None else '0'
                    typ_and_area = meta.find('li').text if len(meta.findAll('li')) > 0 else ''
                    typ = typ_and_area.split(',')[0].strip() if len(typ_and_area) > 0 else ''
                    area = typ_and_area.split(',')[1].replace('m²', '').strip() \
                        if len(typ_and_area) > 0 else ''
                    details = meta.findAll('li')[1].text.strip() if len(meta.findAll('li')) > 1 else ''
                    '''
                    company = b.find('div', attrs={'class': 'hidden-xs col-sm-3 col-4'}).a.img['alt'] if \
                        len(b.findAll('div', attrs={'class': 'hidden-xs col-sm-3 col-4'})) > 0 \
                        and len(b.find('div', attrs={'class': 'hidden-xs col-sm-3 col-4'}).findAll('a')) > 0 else ''
                    '''
                    rows.append({
                        'link': base_url + link[1:],
                        'id': id,
                        'available_from': available_from,
                        'details': details,
                        'type': typ,
                        'city': city,
                        'place': address,
                        'price': price,
                        # 'company': company,
                        'area': area
                    })
                except Exception as e:
                    print(e)
                    continue
    return pd.DataFrame(rows)
def fetch_requests():
    """
    Ultra-custom API endpoint for serving up requests.
    Supports limit, search, and page parameters and returns JSON with an
    object that has a list of results in the 'objects' field.
    """
    user_id = get_user_id()
    results = db.session.query(Request)

    # Filters!
    results = filter_department(department_name=request.args.get('department'), results=results)
    results = filter_search_term(search_input=request.args.get('search_term'), results=results)

    # Accumulate status filters
    status_filters = []
    if str(request.args.get('open')).lower() == 'true':
        status_filters.append(Request.open)
    if str(request.args.get('closed')).lower() == 'true':
        status_filters.append(Request.closed)

    date_format = '%m/%d/%Y'
    min_request_date = request.args.get('min_request_date')
    max_request_date = request.args.get('max_request_date')
    if min_request_date and max_request_date:
        min_request_date = datetime.strptime(min_request_date, date_format)
        max_request_date = datetime.strptime(max_request_date, date_format)
        results = results.filter(and_(Request.date_created >= min_request_date,
                                      Request.date_created <= max_request_date))
        app.logger.info('Request Date Bounding. Min: {0}, Max: {1}'.format(min_request_date, max_request_date))

    min_due_date = request.args.get('min_due_date')
    max_due_date = request.args.get('max_due_date')
    if min_due_date and max_due_date:
        min_due_date = datetime.strptime(min_due_date, date_format)
        max_due_date = datetime.strptime(max_due_date, date_format)
        results = results.filter(and_(Request.due_date >= min_due_date,
                                      Request.due_date <= max_due_date))
        app.logger.info('Due Date Bounding. Min: {0}, Max: {1}'.format(min_due_date, max_due_date))

    # Filters for agency staff only:
    if user_id:
        if str(request.args.get('due_soon')).lower() == 'true':
            status_filters.append(Request.due_soon)
        if str(request.args.get('overdue')).lower() == 'true':
            status_filters.append(Request.overdue)

        # Where am I the Point of Contact?
        if str(request.args.get('mine_as_poc')).lower() == 'true':
            results = results.filter(Request.id == Owner.request_id) \
                             .filter(Owner.user_id == user_id) \
                             .filter(Owner.is_point_person == True)

        # Where am I just a Helper?
        if str(request.args.get('mine_as_helper')).lower() == 'true':
            results = results.filter(Request.id == Owner.request_id) \
                             .filter(Owner.user_id == user_id) \
                             .filter(Owner.active == True)

    # Filter based on requester name
    requester_name = request.args.get('requester_name')
    if requester_name and requester_name != "":
        results = results.join(Subscriber, Request.subscribers).join(User) \
                         .filter(func.lower(User.alias).like("%%%s%%" % requester_name.lower()))

    # Apply the set of status filters to the query.
    # Using 'or', they're non-exclusive!
    results = results.filter(or_(*status_filters))
    app.logger.info(status_filters)
    app.logger.info(str(results.statement.compile(dialect=postgresql.dialect())))

    sort_by = request.args.get('sort_column')
    if sort_by and sort_by != '':
        ascending = request.args.get('sort_direction')
        app.logger.info("Sort Direction: %s" % ascending)
        app.logger.info("Sort Column: %s" % sort_by)
        if ascending == "asc":
            results = results.order_by((getattr(Request, sort_by)).asc())
        else:
            results = results.order_by((getattr(Request, sort_by)).desc())

    results = results.order_by(Request.id.desc())

    page_number = int(request.args.get('page_number') or 1)
    limit = int(request.args.get('limit') or 15)
    offset = limit * (page_number - 1)
    app.logger.info("Page Number: {0}, Limit: {1}, Offset: {2}".format(page_number, limit, offset))

    # Execute query
    more_results = False
    num_results = results.count()
    start_index = 0
    end_index = 0
    if num_results != 0:
        start_index = (page_number - 1) * limit
        if start_index == 0:
            start_index = 1
        if num_results > (limit * page_number):
            more_results = True
            # was hardcoded to "start_index + 14", which only matched the
            # default limit of 15
            end_index = start_index + limit - 1
        else:
            end_index = num_results
    results = results.limit(limit).offset(offset).all()

    # TODO([email protected]): This map is pretty kludgy, we should be detecting columns and auto
    # magically making them fields in the JSON objects we return.
    results = [{
        "id": r.id,
        "text": helpers.clean_text(r.text),
        "date_created": helpers.date(r.date_received or r.date_created),
        "department": r.department_name(),
        "requester": r.requester_name(),
        "due_date": format_date(r.due_date),
        "status": r.status,
        # The following two attributes are defined as model methods,
        # and not regular SQLAlchemy attributes.
        "contact_name": r.point_person_name(),
        "solid_status": r.solid_status()
    } for r in results]

    matches = {
        "objects": results,
        "num_results": num_results,
        "more_results": more_results,
        "start_index": start_index,
        "end_index": end_index
    }
    response = anyjson.serialize(matches)
    return Response(response, mimetype="application/json")
def render_highlights(self):
    print("Rendering highlight pages")
    tags_highlight = self.get_tags_highlight()
    for tag in tags_highlight:
        chat_messages = calls = smss = contacts = images = videos = audios = None

        query = db_session.query(Message).join(Chat).filter(
            Message.tags.any(Tag.id == tag.id)).order_by(Chat.id.asc(), Message.timestamp.asc())
        chat_messages = self.report_bundle.filter(Message, query).all()

        query = db_session.query(Call).filter(Call.tags.any(Tag.id == tag.id))
        calls = self.report_bundle.filter(Call, query).all()

        query = db_session.query(Sms).filter(Sms.tags.any(Tag.id == tag.id))
        smss = self.report_bundle.filter(Sms, query).all()

        query = db_session.query(Contact).filter(Contact.tags.any(Tag.id == tag.id))
        contacts = self.report_bundle.filter(Contact, query).all()

        query = db_session.query(File).filter(
            File.type_ == 'image', File.tags.any(Tag.id == tag.id))
        images = self.report_bundle.filter(File, query).all()

        query = db_session.query(File).filter(
            File.type_ == 'audio', File.tags.any(Tag.id == tag.id))
        audios = self.report_bundle.filter(File, query).all()

        query = db_session.query(File).filter(
            File.type_ == 'video', File.tags.any(Tag.id == tag.id))
        videos = self.report_bundle.filter(File, query).all()

        context = {'chat_messages': chat_messages, 'calls': calls, 'smss': smss,
                   'contacts': contacts, 'images': images, 'videos': videos,
                   'audios': audios, 'title': tag.name, 'description': tag.description}
        dest_file = os.path.join(self.report_bundle.report_folder, 'html_files',
                                 "highlights_{}.html".format(clean_text(tag.name)))
        self.renderizer.render_template('highlights.html', dest_file, context)
def main():
    links = pd.read_csv('etuovi_links.csv')['link'].values
    rows = []
    for l in tqdm(links):
        resp = requests.get(l)
        page = bs4.BeautifulSoup(resp.text, 'lxml')
        keys = page.findAll('div', attrs={'class': 'ItemHeader__itemHeader__32xAv'})
        values = page.findAll('div', attrs={'class': 'CompactInfoRow__content__3jGt4'})
        details = {}
        for i in range(len(keys)):
            if len(values[i].findAll('ul')) > 0:
                resp_value = clean_text(' '.join(
                    [li.text for li in values[i].find('ul').findAll('li')]))
            else:
                resp_value = clean_text(values[i].text.strip())
            details[keys[i].text.strip()] = resp_value
        # Finnish field names: Myyntihinta = selling price, Velkaosuus = debt
        # share, Velaton hinta = debt-free price, Yhtiövastike = housing-company
        # fee, Hoitovastike = maintenance fee, Rahoitusvastike = financing fee,
        # Kerrokset = floors, Liikenneyhteydet = transport connections
        selling_price = convert_price(details['Myyntihinta']) if 'Myyntihinta' in details else ''
        debt_component = convert_price(details['Velkaosuus']) if 'Velkaosuus' in details else ''
        # guard the extraction itself (the original tested a different regex
        # than it applied, which could raise on an unmatched value)
        total_price = ''
        if 'Velaton hinta' in details:
            m = re.search(r'^([\d]+)', convert_price(details['Velaton hinta']))
            total_price = convert_price(m.group(1)) if m is not None else ''
        total_monthly_fee = details['Yhtiövastike'] if 'Yhtiövastike' in details else ''

        def fee(pattern):
            # extract a numeric fee from total_monthly_fee and normalize
            # "1 234,56" -> "1234.56"
            m = re.search(pattern, total_monthly_fee)
            return m.group(1).replace(',', '.').replace(' ', '') if m is not None else ''

        monthly_fee = fee(r'^([\d,\s]+)')
        maintainance_fee = fee(r'Hoitovastike ([\d,\s]+)')
        financial_fee = fee(r'Rahoitusvastike ([\d,\s]+)')
        floor = details['Kerrokset'] if 'Kerrokset' in details else ''
        communications = details['Liikenneyhteydet'] if 'Liikenneyhteydet' in details else ''
        rows.append({
            'link': l,
            'total_price': total_price,
            'selling_price': selling_price,
            'debt_component': debt_component,
            'total_monthly_fee': monthly_fee,
            'maintainance_fee': maintainance_fee,
            'financial_fee': financial_fee,
            'floor': floor,
            'communications': communications,
            'details': str(details)
        })
    pd.DataFrame(rows).to_csv('etuovi_details.tsv', sep='\t', index=False)
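# `convert_price` is used above but not defined in this section. A minimal
# sketch of an assumed implementation, inferred from the call sites (Finnish
# prices look like "123 456,78 €" and the callers expect a digit string back);
# this is not the original helper:
def convert_price(text):
    # keep the leading digits, dropping thousands spaces, the decimal part,
    # and the currency symbol, e.g. "123 456,78 €" -> "123456"
    match = re.search(r'^([\d\s]+)', text.strip())
    return match.group(1).replace(' ', '') if match else ''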
def crawlLinks(page_count):
    rows = []
    for page_n in tqdm(range(1, page_count + 1)):
        resp = requests.get(search_url.format(str(page_n)))
        page = bs4.BeautifulSoup(resp.text, features='html.parser')
        boxes = page.findAll('div', attrs={'class': 'property_holder'})
        for b in boxes:
            try:
                id = b.findAll('input', attrs={'id': 'estateId'})[0]['value']
                link = clean_text(b.findAll('a', attrs={'class': 'detail'})[0]['href'])
                link = re.search(r'^(.*)\?', link).group(1) \
                    if re.search(r'^(.*)\?', link) is not None else ''
                city = b.findAll('input', attrs={'id': 'cityName'})[0]['value']
                nbhd = b.findAll('input', attrs={'id': 'quarterName'})[0]['value']
                typ = b.findAll('img', attrs={'class': 'estate_image'})[0]['alt']
                labels = ', '.join([
                    l['alt']
                    for l in b.findAll('div', attrs={'class': 'estate-labels'})[0].findAll('img')
                ])
                desc = b.findAll('div', attrs={'class': 'description'})[0].text
                broker_info = b.findAll('div', attrs={'class': 'broker-info'})[0].text
                price = b.findAll('input', attrs={'id': 'formattedPrice'})[0]['value']
                currency = ''  # stays defined when the price has no known currency marker
                if 'EUR' in price:
                    price = price.replace('EUR', '').replace(' ', '')
                    currency = 'EUR'
                elif 'BGN' in price:
                    # convert leva to euro at the pegged rate of 1.9558
                    price = str(round(float(price.replace('BGN', '').replace(' ', '')) / 1.9558))
                    currency = 'EUR'
                rows.append({
                    'link': clean_text(link),
                    'id': id,
                    'type': clean_text(typ),
                    'labels': clean_text(labels),
                    'city': clean_text(city),
                    'place': clean_text(nbhd),
                    'price': clean_text(price),
                    'currency': clean_text(currency),
                    'broker_info': clean_text(broker_info),
                    'description': clean_text(desc)
                })
            except Exception as e:
                print(e)
                continue
    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                category = page.select('.gtm-ArticleBreadcrumb-click')[0].text \
                    if len(page.select('.gtm-ArticleBreadcrumb-click')) > 0 else ''
                headline = page.select('.title-wrap-roboto')[0].h1.text.strip() \
                    if len(page.select('.title-wrap-roboto')) > 0 else ''
                # skip video-only pages, e.g. "Гледайте цялата емисия"
                # ("Watch the full broadcast")
                if headline == '':
                    continue
                subtitle = page.select('.article-sub-title')[0].text.strip() \
                    if len(page.select('.article-sub-title')) > 0 else ''
                # author = page.select('.author-name')
                # author = author[0].text if author is not None else None
                # e.g. "21 ноември 2019 19:42"
                articleDate = page.select('.date-time')[0].text \
                    if len(page.select('.date-time')) > 0 else ''
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%d %m %Y %H:%M')
                article_body = page.select('.article-body')[0].find_all('p', a=False) \
                    if len(page.select('.article-body')) > 0 else ''
                # skip gallery / photo / video teaser paragraphs
                article_text = ' '.join([
                    clean_text(par.text) for par in article_body
                    if 'ГАЛЕРИЯ' not in par and 'СНИМКИ' not in par and 'ВИДЕО' not in par
                ])
                # tags live in an inline JS targeting object rather than in the
                # markup; a trimmed sample of that script:
                #
                #   var w2g = w2g || {};
                #   w2g.targeting = {
                #       cid: 'news', bid: 'view', aid: '273680', catid: '12',
                #       subcatid: '4', procatid: '1', prpage: '0', safe: '1',
                #       tag: 'тенис', tag: 'джейми', tag: 'мъри', tag: 'григор',
                #       tag: 'димитров', tag: 'александър', tag: 'лазаров',
                #       tag: 'великобритания', tag: 'българия'
                #   };
                tags_start_phrase = 'w2g.targeting = '
                start_ind = rq.text.find(tags_start_phrase)
                end_ind = rq.text.find(';', start_ind)
                aoi = rq.text[start_ind + len(tags_start_phrase):end_ind].strip()
                tags = re.findall(r'([а-яА-Я]+)', aoi)
                tags = ' - '.join(clean_text(tag.replace("'", '').strip())
                                  for tag in tags) if len(tags) > 0 else None
                # shares would need selenium:
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'subtitle': clean_text(subtitle),
                    # 'author': clean_text(author),
                    'date': articleDate,
                    'tags': tags,
                    # 'shares': shares,
                    'category': category,
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
def crawlLinks(links, nbbhds, current_date):
    offers = pd.DataFrame(data={'link': []})
    visited_offers = []
    for ind in tqdm(range(len(links))):
        link = links[ind]
        if link in visited_offers:
            continue
        visited_offers.append(link)  # was never populated, making the check a no-op
        try:
            resp = requests.get(link)
            page = bs4.BeautifulSoup(resp.content.decode('cp1251'), 'html.parser')
            page = page.find_all('div', attrs={'class': 'content'})[0]
            id = re.search(r'=([\d\w]+)$', link).group(1)
            lon = page.find_all('input', attrs={'name': 'mapn', 'type': 'hidden'})[0]['value'].split(',')[0] \
                if len(page.find_all('input', attrs={'name': 'mapn', 'type': 'hidden'})) > 0 else ''
            lat = page.find_all('input', attrs={'name': 'mapn', 'type': 'hidden'})[0]['value'].split(',')[1] \
                if len(page.find_all('input', attrs={'name': 'mapn', 'type': 'hidden'})) > 0 else ''
            # strip the "Виж на картата" ("See on the map") widget label
            address = clean_text(page.find_all('div', attrs={'class': 'title'})[0]
                                 .find_all('span')[0].text.replace('Виж на картата', '')) \
                if len(page.find_all('div', attrs={'class': 'title'})[0].find_all('span')) > 0 else ''
            poly = clean_text(page.find_all('input', attrs={'name': 'p', 'type': 'hidden'})[0]['value']) \
                if len(page.find_all('input', attrs={'name': 'p', 'type': 'hidden'})) > 0 else ''
            details_li = page.find_all('ul', attrs={'class': 'param'})[0].find_all('li')
            details = get_details(details_li)
            price = clean_text(page.find_all('div', {'id': re.compile('^price$')})[0].text)
            price_sq = clean_text(page.find_all('em', {'id': re.compile('^price_kv$')})[0].text)
            agency = get_agency(page)
            views = page.find_all('span', {'class': 'num'})[0].text.replace(' ', '')
            date = page.find_all('span', {'class': 'date'})[0].text
            date = get_date(date)
            desc = get_desc(page)
            # details keys are Bulgarian: Квадратура = floor area, Етаж = floor
            area = details['Квадратура'] if 'Квадратура' in details else ''
            floor = details['Етаж'] if 'Етаж' in details else ''
            title = clean_text(page.find_all('div', attrs={'class': 'title'})[0].text) \
                if len(page.find_all('div', attrs={'class': 'title'})) > 0 else ''
            current_offer = pd.DataFrame(data={'link': link,
                                               'title': title,
                                               'address': address,
                                               'details': json.dumps(details, ensure_ascii=False),
                                               'neighbourhood': nbbhds[ind].split(',')[0],
                                               'lon': lon,
                                               'lat': lat,
                                               'id': id,
                                               'price': price,
                                               'price_sqm': price_sq,
                                               'area': area,
                                               'floor': floor,
                                               'description': desc,
                                               'views': views,
                                               'date': date,
                                               'agency': agency,
                                               'poly': poly}, index=[0])
            offers = pd.concat([offers, current_offer], ignore_index=True)
        except Exception as e:
            print(e)
            continue
    return offers
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                titles = page.select('.text-wrapper')[0]
                headline = titles.h2.text
                subtitle = page.select('.text-wrapper')[0].p.text
                meta = page.select('.additional-info')[0] \
                    if len(page.select('.additional-info')) > 0 else ''
                date_author_info = clean_text(meta.select('.timestamp')[0].text) \
                    if len(meta.select('.timestamp')) > 0 else ''
                author = re.search(r':([А-Яа-я\s]+$)', date_author_info)
                author = author.group(1).strip() if author is not None else None
                # e.g. "10:21 27 декември 2019"
                articleDate = ' '.join(date_author_info.split('|')[0:2]).strip() \
                    if date_author_info != '' else ''
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%H:%M %d %m %Y')
                views = meta.select('#articleViews')[0].text \
                    if len(meta.select('#articleViews')) > 0 else ''
                comments = meta.select('.comments')[0].text \
                    if len(meta.select('.comments')) > 0 else ''
                article_text = ' '.join([
                    par.text.strip()
                    for par in page.select('.article-body')[0].select('p')
                ]) if len(page.select('.article-body')) > 0 else ''
                # NB: the page also embeds article metadata (canonical URL,
                # title, authors, categories, type, word count, publication
                # date) in an inline window._io_config script; it is not
                # parsed here.
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'subtitle': clean_text(subtitle),
                    'comments': clean_text(comments),
                    'author': clean_text(author),
                    'date': articleDate,
                    'views': clean_text(views),
                    'category': category,
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                headline = page.select('h1')[0].text if len(page.select('h1')) > 0 else ''
                # select() returns a list, never None; guard on its length
                author = page.select('.author')
                author = author[0].select('a')[0].text if len(author) > 0 else None
                # e.g. "30.12.2019 13:02:31"
                articleDate = clean_text(page.select('.article-info')[0].select('p')[0].text) \
                    if len(page.select('.article-info')) > 0 else ''
                articleDate = pd.to_datetime(articleDate, format='%d.%m.%Y %H:%M:%S') \
                    if articleDate != '' else ''
                views = page.select('.article-info')[0].div.p.text \
                    if len(page.select('.article-info')) > 0 else ''
                views = views.split(" ")[1] if views != '' else ''
                comments = page.select('.comments')[0].span.text \
                    if len(page.select('.comments')) > 0 else ''
                tags = ' - '.join([clean_text(tag.text)
                                   for tag in page.select('.tags')[0].select('a')
                                   if tag != ',' and tag != "\n"]) \
                    if len(page.select('.tags')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text)
                    for par in page.select('.article-text')[0].select('p')
                ])
                thumbs = page.select('.rate')[0].select('a') if len(page.select('.rate')) else ''
                thumbs_up = clean_text(thumbs[0].text) if thumbs != '' else ''
                thumbs_down = clean_text(thumbs[1].text) if thumbs != '' else ''
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'comments': clean_text(comments),
                    'author': clean_text(author),
                    'date': articleDate,
                    'views': clean_text(views),
                    'category': category,
                    'tags': tags,
                    'article_text': article_text,
                    'thumbs_up': thumbs_up,
                    'thumbs_down': thumbs_down
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
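# Minimal usage sketch for the crawlers above, mirroring the read-links /
# write-TSV pattern used in main() for etuovi. The file names and the 'link'
# column are assumptions, not part of the original code:
if __name__ == '__main__':
    links = pd.read_csv('links.csv')['link'].values
    articles = crawlLinks(links)
    articles.to_csv('articles.tsv', sep='\t', index=False)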