# Shared imports for the snippets below; the Flask / SQLAlchemy endpoints
# (prepare_request_fields, fetch_requests) additionally rely on their
# application's own modules (app, db, models, helpers, anyjson).
import json
import os
import re
from datetime import datetime
from urllib.parse import urlsplit

import bs4
import nltk
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager


def crawlLinks(links):
    rows = []  # collect dicts and build the DataFrame once at the end
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                meta = page.select('.head')[0]
                headline = meta.h1.text.strip()
                # e.g. "Вижте 50-те най-четени мнения в сайта ни за годината"
                # ("See the 50 most-read opinions on our site this year")
                if headline == '':
                    continue
                info = clean_text(meta.select('.article-date')[0].text.split('(')[0]) \
                    if len(meta.select('.article-date')) > 0 else ''
                # e.g. "30.12.2019 10:33"
                articleDate = info.split(';')[0] if info != '' else ''
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%d.%m.%Y %H:%M')
                author = info.split(';')[1] if ';' in info else None
                views = requests.get(
                    'https://www.24chasa.bg/Article/{id}/4'.format(
                        id=re.search(r'(\d+)$', link).group(1))).text
                article_text = ' '.join([
                    clean_text(par.text)
                    for par in page.select('.content')[0].select('p')
                ]).split('Tweet')[0] if len(page.select('.content')) > 0 else ''
                # shares would need selenium:
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'author': clean_text(author),
                    'date': articleDate,
                    # 'shares': shares,
                    'views': clean_text(views),
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
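# The crawlers in this file all call `clean_text` and `replace_month_with_digit`,
# which are not defined in this section. The sketches below are assumptions
# about their behavior, inferred from the call sites (whitespace normalization,
# and mapping Bulgarian month names to digits for pd.to_datetime); they are not
# the original implementations.

BG_MONTHS = {  # assumed mapping: Bulgarian month name -> zero-padded digit
    'януари': '01', 'февруари': '02', 'март': '03', 'април': '04',
    'май': '05', 'юни': '06', 'юли': '07', 'август': '08',
    'септември': '09', 'октомври': '10', 'ноември': '11', 'декември': '12'
}


def replace_month_with_digit(month_name):
    # lower-case so "Декември" matches "декември"; abbreviated forms like
    # "дек" are matched by prefix
    name = month_name.lower()
    for full, digit in BG_MONTHS.items():
        if full.startswith(name):
            return digit
    return month_name  # leave unknown tokens untouched


def clean_text(text):
    # collapse whitespace runs and strip; tolerate None (some callers pass it)
    if text is None:
        return None
    return re.sub(r'\s+', ' ', str(text)).strip()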
def prepare_request_fields(results):
    # List comprehensions rather than lazy map objects, so the result
    # serializes directly to JSON under Python 3.
    if current_user.is_anonymous():
        return [{
            "id": r.id,
            "text": helpers.clean_text(r.text),
            "date_received": helpers.format_datetime(
                r.date_received or r.date_created, '%b %d, %Y at %-I:%M %p'),
            "department": r.department_name(),
            "status": r.status,
            # The following two attributes are defined as model methods,
            # and not regular SQLAlchemy attributes.
            "contact_name": r.point_person_name(),
            "solid_status": r.solid_status()
        } for r in results]
    else:
        return [{
            "id": r.id,
            "text": helpers.clean_text(r.text),
            "date_received": helpers.date(r.date_received or r.date_created),
            "department": r.department_name(),
            "requester": r.requester_name(),
            "due_date": format_date(r.due_date),
            "status": r.status,
            # The following two attributes are defined as model methods,
            # and not regular SQLAlchemy attributes.
            "contact_name": r.point_person_name(),
            "solid_status": r.solid_status()
        } for r in results]
def gather_new_articles(site):
    request = requests.get(site)
    soup = bs4.BeautifulSoup(request.text, features="html.parser")
    all_articles = list(
        set(soup.findAll('a', attrs={'href': re.compile(r'^/.*\.html$')})))
    rows = []
    for a in all_articles:
        try:
            title = a['title']
            link = site + a['href']
            comments = a.find('span', attrs={'class': 'cmc'}).text \
                if a.find('span', attrs={'class': 'cmc'}) else ''
            views = a.find('span', attrs={'class': 'cmv'}).text \
                if a.find('span', attrs={'class': 'cmv'}) else ''
            date = a.find('span', attrs={'class': 'cmd'}).text \
                if a.find('span', attrs={'class': 'cmd'}) else ''
            desc = a.find('span', attrs={'class': 'short-desc'}).text \
                if a.find('span', attrs={'class': 'short-desc'}) else ''
            rows.append({
                'link': link,
                'title': clean_text(title),
                'comments': clean_text(comments),
                'views': clean_text(views),
                'category': re.search(r'frognews\.bg//(\w+)', link).group(1)
                            if re.search(r'frognews\.bg//(\w+)', link) else '',
                'date': clean_text(date),
                'subtitle': clean_text(desc)
            })
        except Exception:
            continue
    return pd.DataFrame(rows)
def get_desc(page):
    # The description sits between a fixed Bulgarian marker
    # ("Допълнителна информация:" = "Additional information:") and the next tag.
    desc_start_phrase = '<b>Допълнителна информация:</b><br/>'
    desc_end_phrase = '<'
    desc_start_ind = str(page).find(desc_start_phrase) + len(desc_start_phrase)
    desc_end_ind = str(page).find(desc_end_phrase, desc_start_ind)
    # find() returns -1 when the marker is missing, which leaves a small start
    # index; the > 100 guard treats that as "not found"
    desc = str(page)[desc_start_ind:desc_end_ind] if desc_start_ind > 100 else ''
    return clean_text(desc)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                headline = page.select('.content')[0].h1.text.strip()
                meta = clean_text(page.select('.article-tools')[0].text) \
                    if len(page.select('.article-tools')) > 0 else ''
                # e.g. "14:41, 30 дек 19"
                articleDate = re.search('(.*),', meta).group(1) \
                    if re.search('(.*),', meta) is not None else ''
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%H:%M, %d %m %y')
                views = re.search(r'(\d+)$', meta).group(1) \
                    if re.search(r'(\d+)$', meta) is not None else ''
                comments = page.select('.comments')[0].text.strip() \
                    if len(page.select('.comments')) > 0 else ''
                article_body = page.select('.article-content')[0].select('p') \
                    if len(page.select('.article-content')) > 0 else ''
                if article_body != '':
                    author = article_body[0].text
                    article_text = ' '.join([clean_text(par.text)
                                             for par in article_body[1:]
                                             if '<' not in par.text])
                    # drop anything up to a stray '}' (leftover inline CSS/JS);
                    # find() returns -1 when absent, so this is a no-op then
                    article_text = article_text[article_text.find('}') + 1:].strip()
                else:
                    article_text = ''
                    author = ''
                tags = ' - '.join([clean_text(tag.text)
                                   for tag in page.select('.tags')[0].select('li')
                                   if tag != ',' and tag != "\n"]) \
                    if len(page.select('.tags')) > 0 else None
                tags = clean_text(tags)
                rows.append({'link': link,
                             'title': clean_text(headline),
                             'author': clean_text(author),
                             'date': articleDate,
                             'category': category,
                             'comments': clean_text(comments),
                             'views': clean_text(views),
                             'tags': tags,
                             'article_text': article_text})
        except Exception:
            continue
    return pd.DataFrame(rows)
def __iter__(self):
    for fname in self.fnames:
        with open('text/hemingway/{}'.format(fname), 'rb') as f:
            # use 'rb' to avoid a decode attempt on read, which would stop
            # at unrecognizable characters
            raw = f.read().decode('utf-8', 'ignore')  # immediately decode to string
            word_count = 0
            for sentence in raw.split('.'):  # split on periods between sentences
                word_count += len(sentence.split())
                words = nltk.word_tokenize(clean_text(sentence))
                yield words  # yield each sentence as a list of words
            print('TRAINING ON A CORPUS OF {} WORDS'.format(word_count))
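# The iterator above yields one token list per sentence, which is the shape
# gensim's Word2Vec expects (a re-iterable stream of token lists). A minimal
# usage sketch; `HemingwayCorpus` and its constructor argument are assumed
# names for the class this __iter__ belongs to, not confirmed by the source:
from gensim.models import Word2Vec

corpus = HemingwayCorpus(fnames=['old_man_and_the_sea.txt'])  # hypothetical file name
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=2)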
def crawlLinks(links):
    rows = []
    for link, section in tqdm(list(links)):
        try:
            rq = requests.get(link)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                articleTitle = page.select('h1')[0].text \
                    if len(page.select('h1')) > 0 else ''
                articleSubtitle = page.select('h2.subtitle')[0].text \
                    if len(page.select('h2.subtitle')) > 0 else ''
                # strip the ", oбновена ..." ("updated ...") suffix from the timestamp
                articleDate = page.select('.article-time')[0].text.split(', oбновена')[0] \
                    if len(page.select('.article-time')) > 0 else ''
                articleDate = clean_text(articleDate)
                month_name = re.search(r'([а-яА-Я]+)', articleDate)
                if month_name is not None:
                    month_name = month_name.group(1)
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name))
                    articleDate = pd.to_datetime(articleDate, format='%d %m %Y, %H:%M')
                category = page.select('div.article-category')[0].a.text \
                    if len(page.select('div.article-category')) > 0 else ''
                comments = page.select('.commentsButtonNumber')[0].text \
                    if len(page.select('.commentsButtonNumber')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text)
                    for par in page.select('.article-text')[0].select('p')
                ])
                # article-tags; guard on the list being non-empty (select()
                # never returns None, so the original None check could not fire)
                tags = page.select('.article-tags')
                tags = ' - '.join([clean_text(tag.text) for tag in tags[0].select('a')]) \
                    if len(tags) > 0 else None
                rows.append({
                    'link': link,
                    'section': section,
                    'comments': clean_text(comments),
                    'title': clean_text(articleTitle),
                    'subtitle': clean_text(articleSubtitle),
                    'date': articleDate,
                    'category': category,
                    'tags': tags,
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
data = []
if YOUTUBE_CHANNEL_ID:
    data = get_comments(YOUTUBE_CHANNEL_ID)
if TWITTER_HASHTAG:
    data.extend(get_tweets(TWITTER_HASHTAG, 'hashtag_search'))
if TWITTER_REPLIES:
    data.extend(get_tweets('to:' + TWITTER_REPLIES, 'replies'))
# else:
#     sys.exit('Provide exactly one search parameter')

comment_type, links, posts, sentiment, magnitude, date, likes, shares, \
    author_follower_count, author_country, topic = ([] for _ in range(11))

for index, tweet in enumerate(data):
    res = analyze_post(clean_text(tweet['text']))
    comment_type.append(tweet['type'])
    shares.append(tweet['shares'])
    posts.append(tweet['text'])
    date.append(tweet['created_at'])
    likes.append(tweet['likes'])
    links.append(tweet['link'])
    author_country.append(tweet['country'])
    author_follower_count.append(tweet['author_follower_count'])
    try:
        sentiment.append(round(res[0], 2))
        magnitude.append(round(res[1], 2))
        topic.append(res[2])
        print(index)
    except Exception as e:
        print('append sentiment to list', e)
def crawlLinks(links):
    rows = []
    for link in tqdm(list(links)):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                # find() takes the class via attrs=; passing a bare dict as the
                # tag name never matches
                if page.find(attrs={'class': 'article-post'}):
                    body = page.select('.article-post')[0]
                    headline = body.select('h1')[0].text if len(body.select('h1')) else ''
                    subtitle = None
                    # metadata
                    location = body.select('.location')[0].text if len(body.select('.location')) else ''
                    articleDate = body.select('.fa-calendar')[0].text if len(body.select('.fa-calendar')) else ''
                    views = body.select('.fa-eye')[0].text if len(body.select('.fa-eye')) else ''
                    comments = body.select('.fa-comments-o')[0].text if len(body.select('.fa-comments-o')) else ''
                    comments = comments.split(" ")[0] if comments != '' else ''
                    tags = ' - '.join([tag.a.text for tag in body.select('.tags')[0].select('li')]) \
                        if len(body.select('.tags')) > 0 else ''
                else:
                    headline = page.select('.post-title')[0].text if len(page.select('.post-title')) else ''
                    subtitle = page.select('.post-subtitle')[0].text if len(page.select('.post-subtitle')) else ''
                    # metadata
                    simpleShare = page.select('.simple-share')[0] if len(page.select('.simple-share')) > 0 else ''
                    li = simpleShare.find_all('li')
                    location = li[0].text if len(li) > 0 else ''
                    articleDate = li[1].text if len(li) > 1 else ''
                    views = li[2].text if len(li) > 2 else ''
                    views = views.split(" ")[0] if views != '' else ''
                    comments = li[3].text if len(li) > 3 else ''
                    comments = comments.split(" ")[0] if comments != '' else ''
                    tags = ' - '.join([tag.a.text for tag in page.select('.tags-widget')[0].select('li')[1:]]) \
                        if len(page.select('.tags-widget')) > 0 else ''
                # e.g. "30 Дек. 2019, 16:13"
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    if month_name is not None:
                        month_name = month_name.group(1)
                        # also drop the abbreviation dot so the format below matches
                        articleDate = articleDate.replace(
                            month_name, replace_month_with_digit(month_name)).replace('.', '')
                    articleDate = pd.to_datetime(articleDate, format='%d %m %Y, %H:%M')
                article_text = clean_text(page.select('.post-content')[0].select('div')[2].text) \
                    if len(page.select('.post-content')) > 0 else ''
                rows.append({'link': link,
                             'title': clean_text(headline),
                             'subtitle': clean_text(subtitle),
                             'location': clean_text(location),
                             'comments': clean_text(comments),
                             'date': articleDate,
                             'views': clean_text(views),
                             'category': category,
                             'tags': clean_text(tags),
                             'article_text': article_text})
        except Exception:
            continue
    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                category = page.find('div', attrs={'class': 'printing_large_text_toolbar'}).text \
                    if page.find('div', attrs={'class': 'printing_large_text_toolbar'}) is not None else ''
                headline = page.select('#news_heading')[0].h1.text.strip() \
                    if len(page.select('#news_heading')) > 0 else ''
                shares = page.select('.social_count')[0].text.strip() \
                    if len(page.select('.social_count')) > 0 else ''
                comments = page.select('.comments')[0].text.strip() \
                    if len(page.select('.comments')) else ''
                views = page.select('.btn_reads')[0].text.split('Прочетена')[1].strip() \
                    if len(page.select('.btn_reads')) > 0 else ''
                article_text = clean_text(page.select('#news_content')[0].text) \
                    if len(page.select('#news_content')) > 0 else ''
                # e.g. "01 януари 2020 | 16:26 - Обновена"
                # ("01 January 2020 | 16:26 - Updated")
                articleDate = page.find('td', attrs={'id': 'news_heading'})
                articleDate = articleDate.find('span', attrs={'class': 'dark_text'}).text \
                    if articleDate is not None \
                    and articleDate.find('span', attrs={'class': 'dark_text'}) is not None else ''
                articleDate = articleDate.split('- Обновена')[0].strip() if articleDate != '' else ''
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%d %m %Y | %H:%M')
                author = page.select('#author_box')[0].select('h5')[0].a.text \
                    if len(page.select('#author_box')) > 0 else ''
                tags = " - ".join([
                    clean_text(i.text)
                    for i in page.find('div', attrs={'class': 'news_tags'}).findAll('span')
                ])
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'author': clean_text(author),
                    'date': articleDate,
                    'category': clean_text(category),
                    'tags': tags,
                    'comments': clean_text(comments),
                    'views': clean_text(views),
                    'shares': clean_text(shares),
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                info = page.select('#news_details')[0] \
                    if len(page.select('#news_details')) > 0 else ''
                headline = info.h1.text.strip()
                subtitle = info.h2.text.strip()
                meta = info.select('.info')[0].select('div')
                # e.g. "Публикувана: 30 Декември, 2019 15:26"
                # ("Published: 30 December, 2019 15:26")
                articleDate = meta[0].text.split('Публикувана:')[1].strip()
                month_name = re.search(r'([а-яА-Я]+)', articleDate)
                month_name = month_name.group(1) if month_name is not None else None
                articleDate = articleDate.replace(
                    month_name, replace_month_with_digit(month_name)
                ) if month_name is not None else articleDate
                articleDate = pd.to_datetime(articleDate, format='%d %m, %Y %H:%M')
                # the second .info div holds "<comments> ... <views>"; the
                # original len(meta) > 0 guard was off by one
                meta = meta[1].text.strip() if len(meta) > 1 else ''
                comments = views = ''  # stay defined even when meta is empty
                if meta != '':
                    comments = re.search(r'(^\d+)', meta).group(1)
                    views = re.search(r'(\d+)$', meta).group(1)
                author = page.select('.linksProfile')[0].text \
                    if len(page.select('.linksProfile')) > 0 else ''
                article_body = page.select('#news_content')[0].select('p') \
                    if len(page.select('#news_content')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text) for par in article_body if '<' not in par.text
                ]) if article_body != '' else ''
                tags = ' - '.join([clean_text(tag.text)
                                   for tag in page.select('.tags')[0].select('a')
                                   if tag != ',' and tag != "\n"]) \
                    if len(page.select('.tags')) > 0 else None
                # shares would need selenium:
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'subtitle': clean_text(subtitle),
                    'author': clean_text(author),
                    'date': articleDate,
                    'category': category,
                    'comments': clean_text(comments),
                    # 'shares': shares,
                    'views': clean_text(views),
                    'tags': tags,
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
def get_all_offers(search_pages):
    rows = []
    options = Options()
    options.headless = True
    options.add_argument('log-level=3')
    browser = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    for p in tqdm(search_pages):
        browser.get(p)
        page = bs4.BeautifulSoup(browser.page_source, features='html.parser')
        boxes = page.find_all('div', attrs={'class': 'items'})[0].findAll('item')
        for b in boxes:
            try:
                link = b.find_all('text')[0].find_all('div', attrs={'class': 'title'})[0].a['href']
                title = b.find_all('text')[0].find_all('div', attrs={'class': 'title'})[0]
                data = b.find_all('text')[0].find_all('div', attrs={'class': 'data'})[0].text
                info = b.find_all('text')[0].find_all('div', attrs={'class': 'info'})[0]
                id = re.search('adv=(.*)$', link).group(1)
                # strip the "град София," ("city of Sofia,") prefix before parsing
                place, labels = get_place_and_labels(
                    clean_text(title.a.text.replace('град София,', '')))
                area = re.search(r'(^[^А-Яа-я\.]*)', data.split(',')[1].replace(' ', '')).group(1) \
                    if len(data.split(',')) > 1 \
                    and re.search(r'(^[^А-Яа-я\.]*)', data.split(',')[1].replace(' ', '')) else '0'
                price = clean_text(title.find_all('span')[0].text)
                price_orig = price
                price = re.search(r'([\d\s]+)', price).group(1).replace(' ', '') \
                    if re.search(r'([\d\s]+)', price) else '0'
                currency = ''  # stays defined when no currency marker is present
                if 'Цена при запитване' in price_orig:  # "price on request"
                    price = '0'
                elif 'eur' in price_orig.lower():
                    currency = 'EUR'
                elif 'лв' in price_orig.lower():
                    # convert leva to euro at the pegged rate of 1.9558
                    price = str(round(float(price) / 1.9558)) if price != '0' else '0'
                    currency = 'BGN'
                if 'на кв.м' in price_orig:  # price quoted per square metre
                    price = round(float(price) * float(area), 0)
                typ = clean_text(data.split(',')[0])
                agency = clean_text(info.a['href']) if len(info.find_all('a')) > 0 else ''
                rows.append({
                    'link': sale_url + link,
                    'id': id,
                    'type': typ,
                    'place': place,
                    'price': price,
                    'area': area,
                    'labels': labels,
                    'description': clean_text(info.text),
                    'currency': currency,
                    'agency': agency
                })
            except Exception as e:
                print(e)
    return pd.DataFrame(rows)
def crawlLinks():
    rows = []
    for city in cities:
        resp = requests.get(search_url.format(city, str(1)))
        page = bs4.BeautifulSoup(resp.text, features='html.parser')
        page_count = get_page_count(page)
        for page_n in tqdm(range(1, page_count + 1)):
            resp = requests.get(search_url.format(city, str(page_n)))
            page = bs4.BeautifulSoup(resp.text, features='html.parser')
            boxes = page.findAll('div', attrs={'class': 'list-item-container'})
            for b in boxes:
                try:
                    link = b.find('a', attrs={'class': 'list-item-link'})['href']
                    # e.g. rental-apartment/espoo/suurpelto/block+of+flats/722129?entryPoint=fromSearch&rentalIndex=1
                    id = re.search(r'([\d]+?)\?', link).group(1) \
                        if re.search(r'([\d]+?)\?', link) is not None else ''
                    available_from = clean_text(
                        b.find('span', attrs={'class': 'showing-lease-container'}).li.text) \
                        if len(b.find('span', attrs={'class': 'showing-lease-container'}).findAll('li')) > 0 else ''
                    address = clean_text(b.find('span', attrs={'class': 'address'}).text) \
                        if len(b.findAll('span', attrs={'class': 'address'})) > 0 else ''
                    meta = b.find('ul', attrs={'class': 'list-unstyled'})
                    price = clean_text(meta.find('span', attrs={'class': 'price'}).text) \
                        if len(b.findAll('span', attrs={'class': 'price'})) > 0 else '0'
                    # "1 234,56 €/kk" -> "1234" (kk = per month)
                    price = re.search(r'([\d ]+)(?:[\d,]+)? €\/kk$', price).group(1).replace(' ', '') \
                        if re.search(r'([\d ]+)(?:[\d,]+)? €\/kk$', price) is not None else '0'
                    typ_and_area = meta.find('li').text if len(meta.findAll('li')) > 0 else ''
                    typ = typ_and_area.split(',')[0].strip() if len(typ_and_area) > 0 else ''
                    area = typ_and_area.split(',')[1].replace('m²', '').strip() \
                        if len(typ_and_area) > 0 else ''
                    details = meta.findAll('li')[1].text.strip() if len(meta.findAll('li')) > 1 else ''
                    '''
                    company = b.find('div', attrs={'class': 'hidden-xs col-sm-3 col-4'}).a.img['alt'] if \
                        len(b.findAll('div', attrs={'class': 'hidden-xs col-sm-3 col-4'})) > 0 \
                        and len(b.find('div', attrs={'class': 'hidden-xs col-sm-3 col-4'}).findAll('a')) > 0 else ''
                    '''
                    rows.append({
                        'link': base_url + link[1:],
                        'id': id,
                        'available_from': available_from,
                        'details': details,
                        'type': typ,
                        'city': city,
                        'place': address,
                        'price': price,
                        # 'company': company,
                        'area': area
                    })
                except Exception as e:
                    print(e)
                    continue
    return pd.DataFrame(rows)
def fetch_requests():
    """
    Ultra-custom API endpoint for serving up requests.
    Supports limit, search, and page parameters and returns JSON with an
    object that has a list of results in the 'objects' field.
    """
    user_id = get_user_id()
    results = db.session.query(Request)

    # Filters!
    results = filter_department(department_name=request.args.get('department'), results=results)
    results = filter_search_term(search_input=request.args.get('search_term'), results=results)

    # Accumulate status filters
    status_filters = []
    if str(request.args.get('open')).lower() == 'true':
        status_filters.append(Request.open)
    if str(request.args.get('closed')).lower() == 'true':
        status_filters.append(Request.closed)

    date_format = '%m/%d/%Y'
    min_request_date = request.args.get('min_request_date')
    max_request_date = request.args.get('max_request_date')
    if min_request_date and max_request_date:
        min_request_date = datetime.strptime(min_request_date, date_format)
        max_request_date = datetime.strptime(max_request_date, date_format)
        results = results.filter(and_(Request.date_created >= min_request_date,
                                      Request.date_created <= max_request_date))
        app.logger.info('Request Date Bounding. Min: {0}, Max: {1}'.format(min_request_date, max_request_date))

    min_due_date = request.args.get('min_due_date')
    max_due_date = request.args.get('max_due_date')
    if min_due_date and max_due_date:
        min_due_date = datetime.strptime(min_due_date, date_format)
        max_due_date = datetime.strptime(max_due_date, date_format)
        results = results.filter(and_(Request.due_date >= min_due_date,
                                      Request.due_date <= max_due_date))
        app.logger.info('Due Date Bounding. Min: {0}, Max: {1}'.format(min_due_date, max_due_date))

    # Filters for agency staff only:
    if user_id:
        if str(request.args.get('due_soon')).lower() == 'true':
            status_filters.append(Request.due_soon)
        if str(request.args.get('overdue')).lower() == 'true':
            status_filters.append(Request.overdue)

        # Where am I the Point of Contact?
        if str(request.args.get('mine_as_poc')).lower() == 'true':
            results = results.filter(Request.id == Owner.request_id) \
                             .filter(Owner.user_id == user_id) \
                             .filter(Owner.is_point_person == True)

        # Where am I just a Helper?
        if str(request.args.get('mine_as_helper')).lower() == 'true':
            results = results.filter(Request.id == Owner.request_id) \
                             .filter(Owner.user_id == user_id) \
                             .filter(Owner.active == True)

    # Filter based on requester name
    requester_name = request.args.get('requester_name')
    if requester_name and requester_name != "":
        results = results.join(Subscriber, Request.subscribers).join(User) \
                         .filter(func.lower(User.alias).like("%%%s%%" % requester_name.lower()))

    # Apply the set of status filters to the query.
    # Using 'or', they're non-exclusive!
    results = results.filter(or_(*status_filters))
    app.logger.info(status_filters)
    app.logger.info(str(results.statement.compile(dialect=postgresql.dialect())))

    sort_by = request.args.get('sort_column')
    if sort_by and sort_by != '':
        ascending = request.args.get('sort_direction')
        app.logger.info("Sort Direction: %s" % ascending)
        app.logger.info("Sort Column: %s" % sort_by)
        if ascending == "asc":
            results = results.order_by((getattr(Request, sort_by)).asc())
        else:
            results = results.order_by((getattr(Request, sort_by)).desc())

    results = results.order_by(Request.id.desc())

    page_number = int(request.args.get('page_number') or 1)
    limit = int(request.args.get('limit') or 15)
    offset = limit * (page_number - 1)
    app.logger.info("Page Number: {0}, Limit: {1}, Offset: {2}".format(page_number, limit, offset))

    # Execute query
    more_results = False
    num_results = results.count()
    start_index = 0
    end_index = 0
    if num_results != 0:
        start_index = (page_number - 1) * limit
        if start_index == 0:
            start_index = 1
        if num_results > (limit * page_number):
            more_results = True
            # was hardcoded to "start_index + 14", which only matched the
            # default limit of 15
            end_index = start_index + limit - 1
        else:
            end_index = num_results
    results = results.limit(limit).offset(offset).all()

    # TODO([email protected]): This map is pretty kludgy, we should be detecting columns and auto
    # magically making them fields in the JSON objects we return.
    results = [{
        "id": r.id,
        "text": helpers.clean_text(r.text),
        "date_created": helpers.date(r.date_received or r.date_created),
        "department": r.department_name(),
        "requester": r.requester_name(),
        "due_date": format_date(r.due_date),
        "status": r.status,
        # The following two attributes are defined as model methods,
        # and not regular SQLAlchemy attributes.
        "contact_name": r.point_person_name(),
        "solid_status": r.solid_status()
    } for r in results]

    matches = {
        "objects": results,
        "num_results": num_results,
        "more_results": more_results,
        "start_index": start_index,
        "end_index": end_index
    }
    response = anyjson.serialize(matches)
    return Response(response, mimetype="application/json")
def render_highlights(self):
    print("Rendering highlight pages")
    tags_highlight = self.get_tags_highlight()
    for tag in tags_highlight:
        chat_messages = calls = smss = contacts = images = videos = audios = None

        query = db_session.query(Message).join(Chat).filter(
            Message.tags.any(Tag.id == tag.id)).order_by(Chat.id.asc(), Message.timestamp.asc())
        chat_messages = self.report_bundle.filter(Message, query).all()

        query = db_session.query(Call).filter(Call.tags.any(Tag.id == tag.id))
        calls = self.report_bundle.filter(Call, query).all()

        query = db_session.query(Sms).filter(Sms.tags.any(Tag.id == tag.id))
        smss = self.report_bundle.filter(Sms, query).all()

        query = db_session.query(Contact).filter(Contact.tags.any(Tag.id == tag.id))
        contacts = self.report_bundle.filter(Contact, query).all()

        query = db_session.query(File).filter(
            File.type_ == 'image', File.tags.any(Tag.id == tag.id))
        images = self.report_bundle.filter(File, query).all()

        query = db_session.query(File).filter(
            File.type_ == 'audio', File.tags.any(Tag.id == tag.id))
        audios = self.report_bundle.filter(File, query).all()

        query = db_session.query(File).filter(
            File.type_ == 'video', File.tags.any(Tag.id == tag.id))
        videos = self.report_bundle.filter(File, query).all()

        context = {'chat_messages': chat_messages, 'calls': calls, 'smss': smss,
                   'contacts': contacts, 'images': images, 'videos': videos,
                   'audios': audios, 'title': tag.name, 'description': tag.description}
        dest_file = os.path.join(self.report_bundle.report_folder, 'html_files',
                                 "highlights_{}.html".format(clean_text(tag.name)))
        self.renderizer.render_template('highlights.html', dest_file, context)
def main():
    links = pd.read_csv('etuovi_links.csv')['link'].values
    rows = []
    for l in tqdm(links):
        resp = requests.get(l)
        page = bs4.BeautifulSoup(resp.text, 'lxml')
        keys = page.findAll('div', attrs={'class': 'ItemHeader__itemHeader__32xAv'})
        values = page.findAll('div', attrs={'class': 'CompactInfoRow__content__3jGt4'})
        details = {}
        for i in range(len(keys)):
            if len(values[i].findAll('ul')) > 0:
                resp_value = clean_text(' '.join(
                    [li.text for li in values[i].find('ul').findAll('li')]))
            else:
                resp_value = clean_text(values[i].text.strip())
            details[keys[i].text.strip()] = resp_value
        # Finnish field names: Myyntihinta = selling price, Velkaosuus = debt
        # share, Velaton hinta = debt-free price, Yhtiövastike = housing-company
        # fee, Hoitovastike = maintenance fee, Rahoitusvastike = financing fee,
        # Kerrokset = floors, Liikenneyhteydet = transport connections
        selling_price = convert_price(details['Myyntihinta']) if 'Myyntihinta' in details else ''
        debt_component = convert_price(details['Velkaosuus']) if 'Velkaosuus' in details else ''
        # guard the extraction itself (the original tested a different regex
        # than it applied, which could raise on an unmatched value)
        total_price = ''
        if 'Velaton hinta' in details:
            m = re.search(r'^([\d]+)', convert_price(details['Velaton hinta']))
            total_price = convert_price(m.group(1)) if m is not None else ''
        total_monthly_fee = details['Yhtiövastike'] if 'Yhtiövastike' in details else ''

        def fee(pattern):
            # extract a numeric fee from total_monthly_fee and normalize
            # "1 234,56" -> "1234.56"
            m = re.search(pattern, total_monthly_fee)
            return m.group(1).replace(',', '.').replace(' ', '') if m is not None else ''

        monthly_fee = fee(r'^([\d,\s]+)')
        maintainance_fee = fee(r'Hoitovastike ([\d,\s]+)')
        financial_fee = fee(r'Rahoitusvastike ([\d,\s]+)')
        floor = details['Kerrokset'] if 'Kerrokset' in details else ''
        communications = details['Liikenneyhteydet'] if 'Liikenneyhteydet' in details else ''
        rows.append({
            'link': l,
            'total_price': total_price,
            'selling_price': selling_price,
            'debt_component': debt_component,
            'total_monthly_fee': monthly_fee,
            'maintainance_fee': maintainance_fee,
            'financial_fee': financial_fee,
            'floor': floor,
            'communications': communications,
            'details': str(details)
        })
    pd.DataFrame(rows).to_csv('etuovi_details.tsv', sep='\t', index=False)
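# `convert_price` is used above but not defined in this section. A minimal
# sketch of an assumed implementation, inferred from the call sites (Finnish
# prices look like "123 456,78 €" and the callers expect a digit string back);
# this is not the original helper:
def convert_price(text):
    # keep the leading digits, dropping thousands spaces, the decimal part,
    # and the currency symbol, e.g. "123 456,78 €" -> "123456"
    match = re.search(r'^([\d\s]+)', text.strip())
    return match.group(1).replace(' ', '') if match else ''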
def crawlLinks(page_count):
    rows = []
    for page_n in tqdm(range(1, page_count + 1)):
        resp = requests.get(search_url.format(str(page_n)))
        page = bs4.BeautifulSoup(resp.text, features='html.parser')
        boxes = page.findAll('div', attrs={'class': 'property_holder'})
        for b in boxes:
            try:
                id = b.findAll('input', attrs={'id': 'estateId'})[0]['value']
                link = clean_text(b.findAll('a', attrs={'class': 'detail'})[0]['href'])
                link = re.search(r'^(.*)\?', link).group(1) \
                    if re.search(r'^(.*)\?', link) is not None else ''
                city = b.findAll('input', attrs={'id': 'cityName'})[0]['value']
                nbhd = b.findAll('input', attrs={'id': 'quarterName'})[0]['value']
                typ = b.findAll('img', attrs={'class': 'estate_image'})[0]['alt']
                labels = ', '.join([
                    l['alt']
                    for l in b.findAll('div', attrs={'class': 'estate-labels'})[0].findAll('img')
                ])
                desc = b.findAll('div', attrs={'class': 'description'})[0].text
                broker_info = b.findAll('div', attrs={'class': 'broker-info'})[0].text
                price = b.findAll('input', attrs={'id': 'formattedPrice'})[0]['value']
                currency = ''  # stays defined when the price has no known currency marker
                if 'EUR' in price:
                    price = price.replace('EUR', '').replace(' ', '')
                    currency = 'EUR'
                elif 'BGN' in price:
                    # convert leva to euro at the pegged rate of 1.9558
                    price = str(round(float(price.replace('BGN', '').replace(' ', '')) / 1.9558))
                    currency = 'EUR'
                rows.append({
                    'link': clean_text(link),
                    'id': id,
                    'type': clean_text(typ),
                    'labels': clean_text(labels),
                    'city': clean_text(city),
                    'place': clean_text(nbhd),
                    'price': clean_text(price),
                    'currency': clean_text(currency),
                    'broker_info': clean_text(broker_info),
                    'description': clean_text(desc)
                })
            except Exception as e:
                print(e)
                continue
    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                category = page.select('.gtm-ArticleBreadcrumb-click')[0].text \
                    if len(page.select('.gtm-ArticleBreadcrumb-click')) > 0 else ''
                headline = page.select('.title-wrap-roboto')[0].h1.text.strip() \
                    if len(page.select('.title-wrap-roboto')) > 0 else ''
                # skip video-only pages, e.g. "Гледайте цялата емисия"
                # ("Watch the full broadcast")
                if headline == '':
                    continue
                subtitle = page.select('.article-sub-title')[0].text.strip() \
                    if len(page.select('.article-sub-title')) > 0 else ''
                # author = page.select('.author-name')
                # author = author[0].text if author is not None else None
                # e.g. "21 ноември 2019 19:42"
                articleDate = page.select('.date-time')[0].text \
                    if len(page.select('.date-time')) > 0 else ''
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%d %m %Y %H:%M')
                article_body = page.select('.article-body')[0].find_all('p', a=False) \
                    if len(page.select('.article-body')) > 0 else ''
                # skip gallery / photo / video teaser paragraphs
                article_text = ' '.join([
                    clean_text(par.text) for par in article_body
                    if 'ГАЛЕРИЯ' not in par and 'СНИМКИ' not in par and 'ВИДЕО' not in par
                ])
                # tags live in an inline JS targeting object rather than in the
                # markup; a trimmed sample of that script:
                #
                #   var w2g = w2g || {};
                #   w2g.targeting = {
                #       cid: 'news', bid: 'view', aid: '273680', catid: '12',
                #       subcatid: '4', procatid: '1', prpage: '0', safe: '1',
                #       tag: 'тенис', tag: 'джейми', tag: 'мъри', tag: 'григор',
                #       tag: 'димитров', tag: 'александър', tag: 'лазаров',
                #       tag: 'великобритания', tag: 'българия'
                #   };
                tags_start_phrase = 'w2g.targeting = '
                start_ind = rq.text.find(tags_start_phrase)
                end_ind = rq.text.find(';', start_ind)
                aoi = rq.text[start_ind + len(tags_start_phrase):end_ind].strip()
                tags = re.findall(r'([а-яА-Я]+)', aoi)
                tags = ' - '.join(clean_text(tag.replace("'", '').strip())
                                  for tag in tags) if len(tags) > 0 else None
                # shares would need selenium:
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'subtitle': clean_text(subtitle),
                    # 'author': clean_text(author),
                    'date': articleDate,
                    'tags': tags,
                    # 'shares': shares,
                    'category': category,
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
def crawlLinks(links, nbbhds, current_date):
    offers = pd.DataFrame(data={'link': []})
    visited_offers = []
    for ind in tqdm(range(len(links))):
        link = links[ind]
        if link in visited_offers:
            continue
        visited_offers.append(link)  # was never populated, making the check a no-op
        try:
            resp = requests.get(link)
            page = bs4.BeautifulSoup(resp.content.decode('cp1251'), 'html.parser')
            page = page.find_all('div', attrs={'class': 'content'})[0]
            id = re.search(r'=([\d\w]+)$', link).group(1)
            lon = page.find_all('input', attrs={'name': 'mapn', 'type': 'hidden'})[0]['value'].split(',')[0] \
                if len(page.find_all('input', attrs={'name': 'mapn', 'type': 'hidden'})) > 0 else ''
            lat = page.find_all('input', attrs={'name': 'mapn', 'type': 'hidden'})[0]['value'].split(',')[1] \
                if len(page.find_all('input', attrs={'name': 'mapn', 'type': 'hidden'})) > 0 else ''
            # strip the "Виж на картата" ("See on the map") widget label
            address = clean_text(page.find_all('div', attrs={'class': 'title'})[0]
                                 .find_all('span')[0].text.replace('Виж на картата', '')) \
                if len(page.find_all('div', attrs={'class': 'title'})[0].find_all('span')) > 0 else ''
            poly = clean_text(page.find_all('input', attrs={'name': 'p', 'type': 'hidden'})[0]['value']) \
                if len(page.find_all('input', attrs={'name': 'p', 'type': 'hidden'})) > 0 else ''
            details_li = page.find_all('ul', attrs={'class': 'param'})[0].find_all('li')
            details = get_details(details_li)
            price = clean_text(page.find_all('div', {'id': re.compile('^price$')})[0].text)
            price_sq = clean_text(page.find_all('em', {'id': re.compile('^price_kv$')})[0].text)
            agency = get_agency(page)
            views = page.find_all('span', {'class': 'num'})[0].text.replace(' ', '')
            date = page.find_all('span', {'class': 'date'})[0].text
            date = get_date(date)
            desc = get_desc(page)
            # details keys are Bulgarian: Квадратура = floor area, Етаж = floor
            area = details['Квадратура'] if 'Квадратура' in details else ''
            floor = details['Етаж'] if 'Етаж' in details else ''
            title = clean_text(page.find_all('div', attrs={'class': 'title'})[0].text) \
                if len(page.find_all('div', attrs={'class': 'title'})) > 0 else ''
            current_offer = pd.DataFrame(data={'link': link,
                                               'title': title,
                                               'address': address,
                                               'details': json.dumps(details, ensure_ascii=False),
                                               'neighbourhood': nbbhds[ind].split(',')[0],
                                               'lon': lon,
                                               'lat': lat,
                                               'id': id,
                                               'price': price,
                                               'price_sqm': price_sq,
                                               'area': area,
                                               'floor': floor,
                                               'description': desc,
                                               'views': views,
                                               'date': date,
                                               'agency': agency,
                                               'poly': poly}, index=[0])
            offers = pd.concat([offers, current_offer], ignore_index=True)
        except Exception as e:
            print(e)
            continue
    return offers
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                titles = page.select('.text-wrapper')[0]
                headline = titles.h2.text
                subtitle = page.select('.text-wrapper')[0].p.text
                meta = page.select('.additional-info')[0] \
                    if len(page.select('.additional-info')) > 0 else ''
                date_author_info = clean_text(meta.select('.timestamp')[0].text) \
                    if len(meta.select('.timestamp')) > 0 else ''
                author = re.search(r':([А-Яа-я\s]+$)', date_author_info)
                author = author.group(1).strip() if author is not None else None
                # e.g. "10:21 27 декември 2019"
                articleDate = ' '.join(date_author_info.split('|')[0:2]).strip() \
                    if date_author_info != '' else ''
                if articleDate != '':
                    month_name = re.search(r'([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%H:%M %d %m %Y')
                views = meta.select('#articleViews')[0].text \
                    if len(meta.select('#articleViews')) > 0 else ''
                comments = meta.select('.comments')[0].text \
                    if len(meta.select('.comments')) > 0 else ''
                article_text = ' '.join([
                    par.text.strip()
                    for par in page.select('.article-body')[0].select('p')
                ]) if len(page.select('.article-body')) > 0 else ''
                # NB: the page also embeds article metadata (canonical URL,
                # title, authors, categories, type, word count, publication
                # date) in an inline window._io_config script; it is not
                # parsed here.
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'subtitle': clean_text(subtitle),
                    'comments': clean_text(comments),
                    'author': clean_text(author),
                    'date': articleDate,
                    'views': clean_text(views),
                    'category': category,
                    'article_text': article_text
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                headline = page.select('h1')[0].text if len(page.select('h1')) > 0 else ''
                # select() returns a list, never None; guard on its length
                author = page.select('.author')
                author = author[0].select('a')[0].text if len(author) > 0 else None
                # e.g. "30.12.2019 13:02:31"
                articleDate = clean_text(page.select('.article-info')[0].select('p')[0].text) \
                    if len(page.select('.article-info')) > 0 else ''
                articleDate = pd.to_datetime(articleDate, format='%d.%m.%Y %H:%M:%S') \
                    if articleDate != '' else ''
                views = page.select('.article-info')[0].div.p.text \
                    if len(page.select('.article-info')) > 0 else ''
                views = views.split(" ")[1] if views != '' else ''
                comments = page.select('.comments')[0].span.text \
                    if len(page.select('.comments')) > 0 else ''
                tags = ' - '.join([clean_text(tag.text)
                                   for tag in page.select('.tags')[0].select('a')
                                   if tag != ',' and tag != "\n"]) \
                    if len(page.select('.tags')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text)
                    for par in page.select('.article-text')[0].select('p')
                ])
                thumbs = page.select('.rate')[0].select('a') if len(page.select('.rate')) else ''
                thumbs_up = clean_text(thumbs[0].text) if thumbs != '' else ''
                thumbs_down = clean_text(thumbs[1].text) if thumbs != '' else ''
                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'comments': clean_text(comments),
                    'author': clean_text(author),
                    'date': articleDate,
                    'views': clean_text(views),
                    'category': category,
                    'tags': tags,
                    'article_text': article_text,
                    'thumbs_up': thumbs_up,
                    'thumbs_down': thumbs_down
                })
        except Exception:
            continue
    return pd.DataFrame(rows)
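# Minimal usage sketch for the crawlers above, mirroring the read-links /
# write-TSV pattern used in main() for etuovi. The file names and the 'link'
# column are assumptions, not part of the original code:
if __name__ == '__main__':
    links = pd.read_csv('links.csv')['link'].values
    articles = crawlLinks(links)
    articles.to_csv('articles.tsv', sep='\t', index=False)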