Example #1
def index():
    '''
    Return the probability for each genre.
    '''
    if not (request.query.text or request.query.url):
        return 'invoke with ?text= or ?url='

    if request.query.text:
        art = article.Article(text=request.query.text)
    elif request.query.url:
        art = article.Article(url=request.query.url)

    example = [art.features[f] for f in utilities.features]

    abs_path = os.path.dirname(os.path.realpath(__file__))
    clf = joblib.load(abs_path + os.sep + 'model.pkl')
    proba = clf.predict_proba([example])[0]

    resp = {}
    for i, p in enumerate(proba):
        resp[utilities.genres[i + 1][0].split('/')[0]] = str(proba[i])[:6]
    resp = json.dumps(resp)

    if request.query.callback:
        resp = request.query.callback + '(' + resp + ')'

    return resp
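
The handler above reads like a Bottle route: it pulls parameters from request.query and returns a plain string or JSONP. A minimal sketch of serving it, assuming Bottle and that article, utilities and model.pkl are importable from the working directory (host and port are arbitrary examples):

# Hedged sketch: exposing index() via Bottle is an assumption, not part of
# the original example.
from bottle import route, run

route('/')(index)                 # equivalent to decorating index with @route('/')
run(host='localhost', port=8080)  # then query /?text=...&callback=myCallback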
Example #2
 def __update_wiki(self, link_list):
     """
      Gets a list of (article, link) tuples and builds
      the network from the given values.
     """
     for tuple_art in link_list:
         if tuple_art[0] not in self.__articles_dict:
             self.__articles_dict[tuple_art[0]] = ar.Article(tuple_art[0])
         if tuple_art[1] not in self.__articles_dict:
             self.__articles_dict[tuple_art[1]] = ar.Article(tuple_art[1])
         self.__articles_dict[tuple_art[0]].add_neighbor(
             self.__articles_dict[tuple_art[1]])
Example #3
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = soup.find('h1').string

    # TOPIC
    topic = ''
    if len(soup.find_all('div', class_='category')) > 0:
        topic = soup.find_all('div', class_='category')[0].find('span').get_text()

    # AUTHOR
    author = ''
    if len(soup.find_all('span', class_='author')) > 0:
        author = soup.find_all('span', class_='author')[0].get_text()

    # TEXT_BODY
    text_body = soup.find_all('article', 'fullarticle')[0].get_text()
    text_body = ' '.join(text_body.split())

    # CREATION_DATE
    creation_date = ''
    if soup.find('time'):
        creation_date = soup.find('time').get('datetime')

    return article.Article(headline, link, text_body, 'http://www.german-times.com', 'german-times', author, topic,
                           date.today(), creation_date)
Example #4
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags
    # HEADLINE
    headline = soup.find('h1', class_='entry-title title post_title').string

    # TOPIC
    topic = soup.find('span', class_='article_dots cat').string

    # AUTHOR
    author = soup.find('div', class_='von').contents[2][1:]

    # TEXT_BODY
    text_body = soup.find(
        'div', 'post_content_inner_wrapper content_inner_wrapper entry-content'
    ).get_text()

    # CREATION_DATE
    creation_date = soup.find('div',
                              class_='von').find('span',
                                                 class_='article_dots').string

    return article.Article(headline, link, text_body,
                           'https://www.theeuropean.de', 'theeuropean', author,
                           topic, date.today(), creation_date)
Example #5
def predict(input_dir):
    '''
    Get genre probabilities for each text document in input directory.
    '''
    clf = joblib.load('model.pkl')

    with open('results.csv', 'wb') as fh:
        writer = csv.writer(fh, delimiter='\t')
        writer.writerow(['Filename'] + [utilities.genres[g][0].split('/')[0]
            for g in utilities.genres])

        for filename in [f for f in os.listdir(input_dir) if f.endswith('.txt')]:
            with open(input_dir + os.sep + filename) as ifh:
                print('Processing file: ' + filename)

                row = []
                row.append(filename)

                # Read input file
                doc = ifh.read().decode('utf-8')

                # Create article object and calculate features
                art = article.Article(text=doc)
                features = [art.features[f] for f in utilities.features]

                # Get probability for each genre
                proba = clf.predict_proba([features])[0]

                # Save results
                for g in utilities.genres:
                    row.append(str(proba[g - 1])[:6])
                writer.writerow(row)
                print(row[1:])
Example #6
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = soup.find('h1').string

    # TOPIC
    topic = ''
    if len(soup.find_all('span', class_='item-containers')) > 0:
        topic = soup.find_all('span',
                              class_='item-containers')[0].find('a').get_text()

    topic = "" if topic is None else topic

    # AUTHOR
    author = ''
    if len(soup.find_all('span', class_='source')) > 0:
        author = soup.find_all('span', class_='source')[0].get_text()

    # TEXT_BODY
    if len(soup.find_all('div', 'art-text-inner')) > 0:
        text_body = soup.find_all('div', 'art-text-inner')[0].get_text()
        text_body = ' '.join(text_body.split())
    else:
        text_body = ''

    # CREATION_DATE
    creation_date = ''
    if soup.find('time'):
        creation_date = soup.find('time').get('datetime')

    return article.Article(headline.strip(), link, text_body,
                           'https://www.wprost.pl', 'wprost', author.strip(),
                           topic.strip(), date.today(), creation_date)
Example #7
    def test_parse_smh_article(self):
        with open('testdata/smh_article.html', 'r') as f:
            html = f.read()

        a = article.Article(None, None, 'Sydney Morning Herald')
        a.parse(html)

        print(a.text)
Example #8
def articleFromResult(item):
    try:
        articleDate = dateutil.parser.parse(
            item['pagemap']['metatags'][0]['article:published_time']).strftime(
                '%m-%d-%Y')
    except:
        articleDate = ""
    return article.Article(item['title'], item['link'], articleDate,
                           item['snippet'])
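
For reference, dateutil.parser.parse accepts the ISO-8601 strings typically found in the article:published_time metatag; a small illustration with an invented timestamp:

# Invented timestamp, only to show the parse-and-reformat step used above.
import dateutil.parser

print(dateutil.parser.parse('2021-03-05T12:30:00+00:00').strftime('%m-%d-%Y'))  # 03-05-2021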
Example #9
 def get_next_article(self):
     if not self.buffer:
         self._add_to_buffer()
     try:
         entry = self.buffer.pop(0)
     except IndexError:
         # we have gone through the entire file
         return None
     return article.Article(id_=entry['id'],
                            title=entry['title'],
                            text=entry['text'])
Example #10
 def get_next_article(self):
     self.cursor.execute('SELECT title, text FROM {0} WHERE id={1}' \
                         .format(self.table, self.current_pos))
     articles = self.cursor.fetchall()
     if not articles or len(articles) != 1:
         return None
     art = article.Article(id_=self.current_pos,
                           title=articles[0][0],
                           text=articles[0][1])
     self.current_pos += 1
     return art
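
Both get_next_article variants above signal exhaustion by returning None, so a caller can drain them with a simple loop. A hypothetical helper (the reader and handle names are placeholders, not part of the original code):

def drain(reader, handle):
    """Call handle() on every article until the reader reports exhaustion (None)."""
    art = reader.get_next_article()
    while art is not None:
        handle(art)
        art = reader.get_next_article()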
Example #11
    def test_parse_bbc_article(self):
        with open('testdata/bbc_article.html', 'r') as f:
            html = f.read()

        a = article.Article(None, None, 'BBC')
        a.parse(html)

        # Make sure that some text was found
        self.assertGreater(len(a.text), 500)

        # Make sure the ads / JS functions are removed
        self.assertNotIn('/**/', a.text)
Example #12
def marketwatch(comp_n, comp_t):

    URL = 'https://www.marketwatch.com/trading-deck/stories'
    page = requests.get(URL)

    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find('ol', class_='headlines block')
    if results is None:
        return

    comp_name = comp_n
    comp_tick = comp_t

    article_elems = results.find_all('li')

    for article_elem in article_elems:
        if None is article_elem:
            continue

        URL = 'https://www.marketwatch.com' + article_elem.find(
            'a')['href'].strip()
        page = requests.get(URL)

        soup = BeautifulSoup(page.content, 'html.parser')

        title_elem = article_elem.find('a', class_='bighead')

        results = soup.find(id='article-body')
        if None is results:
            continue

        para_elems = results.find_all('p')
        relevant_article = False
        # check the company name and the ticker independently
        if comp_name in title_elem.text or comp_tick in title_elem.text:
            relevant_article = True
        else:
            for para_elem in para_elems:
                if para_elem is None:
                    continue
                if comp_name in para_elem.text or comp_tick in para_elem.text:
                    relevant_article = True
                    break
        if relevant_article:
            body_builder = ''
            for para_elem in para_elems:
                if None is para_elem:
                    continue
                body_builder += para_elem.text.strip() + ' '
            body_builder = ' '.join(body_builder.split())
            new_article = article.Article(title_elem.text.strip(), URL,
                                          body_builder.strip())
            print(new_article, end='\n' * 2)
Example #13
def handler(title, string):
    a = article.Article(title)
    a.fetch()

    if a.has_error():
        response.status = 400
        return

    count = a.get_string_count_in_text(string)

    response.headers['Content-Type'] = 'application/json'
    response.status = 200
    return json.dumps({'count': count})
Example #14
def seekingalpha(comp_n, comp_t):

    URL = 'https://seekingalpha.com/market-news'
    page = requests.get(URL)

    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(id='latest-news-list')
    if results is None:
        return

    comp_name = comp_n
    comp_tick = comp_t

    article_elems = results.find_all('li', class_='item')

    for article_elem in article_elems:
        if None is article_elem:
            continue

        URL = 'https://seekingalpha.com' + article_elem.find(
            'a')['href'].strip()
        page = requests.get(URL)

        soup = BeautifulSoup(page.content, 'html.parser')

        title_elem = article_elem.find('h4')

        results = soup.find(id='bullets_ul')
        if None is results:
            continue

        bullet_elems = results.find_all('p', class_='bullets_li')
        relevant_article = False
        # check the company name and the ticker independently
        if comp_name in title_elem.text or comp_tick in title_elem.text:
            relevant_article = True
        else:
            for bullet_elem in bullet_elems:
                if bullet_elem is None:
                    continue
                if comp_name in bullet_elem.text or comp_tick in bullet_elem.text:
                    relevant_article = True
                    break
        if relevant_article:
            body_builder = ''
            for bullet_elem in bullet_elems:
                if None is bullet_elem:
                    continue
                body_builder += bullet_elem.text.strip() + ' '
            new_article = article.Article(title_elem.text.strip(), URL,
                                          body_builder.strip())
            print(new_article, end='\n' * 2)
Example #15
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = soup.find('h1').string

    # TOPIC
    topic = ''
    if soup.find("a",
                 class_="navigation-wide-list__link navigation-arrow--open"):
        menuActive = soup.find(
            "a", class_="navigation-wide-list__link navigation-arrow--open")
        topic = menuActive.find("span").get_text()

    # AUTHOR
    author = ''
    if soup.find('span', class_='byline__name'):
        author = soup.find('span', class_='byline__name').get_text()

    # TEXT_BODY
    if soup.find('div', class_='story-body__inner'):
        innerArticle = soup.find('div', class_='story-body__inner')
    elif soup.find('div', class_='vxp-media__summary'):
        innerArticle = soup.find('div', class_='vxp-media__summary')
    else:
        print("no content found on " + link)
        return

    pList = innerArticle.find_all('p')

    text_body = ''
    for p in pList:
        text_body += p.get_text() + ' '

    # CREATION_DATE
    creation_date = ''

    #soup.find('div', class_='date date--v2 relative-time').get('data-datetime')
    if soup.find('div', class_='date date--v2 relative-time'):
        timeStamp = soup.find(
            'div', class_='date date--v2 relative-time').get('data-seconds')
        # the attribute value is a string, so cast it before converting
        creation_date = datetime.fromtimestamp(int(timeStamp), timezone.utc)
        #creation_date = datetime.fromtimestamp(timeStamp).strftime("%A, %B %d, %Y %I:%M:%S")

    # CRAWL_DATE
    crawl_date = datetime.now()

    return article.Article(headline, link, text_body, 'https://www.bbc.com',
                           'bbc', author.replace("By ", ""), topic, crawl_date,
                           creation_date)
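
A quick sanity check of the timestamp handling above: BeautifulSoup returns the data-seconds attribute as a string, hence the int() cast before datetime.fromtimestamp (the epoch value below is an arbitrary example):

from datetime import datetime, timezone

print(datetime.fromtimestamp(int("1577836800"), timezone.utc))  # 2020-01-01 00:00:00+00:00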
Example #16
    def __init__(self, parsed, args):
        """create feed with fixed number of articles """
        logging.info('Started creating feed')

        articles_list = []
        cashed_news_number = 0
        if args.date:
            logging.info('Started extracting data from cache')
            self.link = args.source
            self.feed_name = f'Feeds from {args.source}'
            with shelve.open('cashed_feeds') as database:
                if not database:
                    raise ex.EmptyDataBase('Local feed storage is empty')
                for date in database:
                    if args.date in date and database[
                            date].feed_link == args.source:
                        articles_list.append(database[date])
                        cashed_news_number += 1
            if cashed_news_number == 0:
                raise ex.DateNotInDatabase(
                    'There are no feeds with this date and source in local storage'
                )
            logging.info('Finished extracting data from cache')

            if args.limit:
                if args.limit > cashed_news_number and args.date:
                    print(f'Only {cashed_news_number} feeds cached')
                    number_of_articles = cashed_news_number
                elif not args.date and args.limit > len(parsed.entries):
                    print(f'Only {len(parsed.entries)} feeds cached')
                    number_of_articles = len(parsed.entries)
                else:
                    number_of_articles = args.limit
                articles_list = articles_list[:number_of_articles]
        else:
            if args.limit:
                if args.limit > len(parsed.entries):
                    print(f'Only {len(parsed.entries)} feeds available')
                    number_of_articles = len(parsed.entries)
                else:
                    number_of_articles = args.limit
            else:
                number_of_articles = len(parsed.entries)
            for i in range(number_of_articles):
                articles_list.append(
                    article.Article(parsed.entries[i], args.source))

            self.feed_name = make_string_readable(parsed.feed.title)
            self.link = parsed.feed.link
        self.articles = articles_list
Example #17
 def update_network(self, link_list):
     """
     Adds any links or articles not currently in
     the network.
     :param link_list:
     :return:
     """
     for link in link_list:
         for title in link:
             if title not in self._articles:
                 self._articles[title] = article.Article(title)
         if self._articles[link[1]] not in self._articles[link[0]]:
             self._articles[link[0]].\
                     add_neighbor(self._articles[link[1]])
Example #18
def get_articles():
    print("Getting articles from: " + source_url)
    html = make_request()
    soup = BeautifulSoup(html, "html.parser")
    lis = soup.find_all("ul", class_="reactiesList")

    arts = []

    for eles in lis:
        art = article.Article()
        art.title = eles.a.text
        art.source = "voetbalzone.nl"
        art.url = eles.a.get('href')
        art.full_url = root_url + art.url
        arts.append(art)
    return arts
Example #19
    def generate_training(self, path):
        '''
        Generate training data from a list of labeled articles.
        '''
        # keep the input file open while the DictReader below is consumed
        with open(path, 'rU') as ifh, open('data/training.txt', 'wb') as fh:
            db = csv.DictReader(ifh, delimiter='\t')

            fieldnames = ['url', 'label'] + utilities.features
            writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
            writer.writeheader()

            for i, row in enumerate(db):

                # Get url
                url = None
                if row['Identifier']:
                    url = row['Identifier']
                elif (row['Prediction'] != 'None'
                      and float(row['Confidence']) > 0.675):
                    url = row['Prediction']
                else:
                    continue
                if not url.endswith(':ocr'):
                    url += ':ocr'

                # Get label
                label = None
                for g in utilities.genres:
                    if row['Genre'] in utilities.genres[g]:
                        label = g
                        break
                if not label:
                    continue

                # If valid training instance found, create new article
                try:
                    art = article.Article(url=url)

                    # Save results
                    fields = {'label': label, 'url': url}
                    for f in utilities.features:
                        fields[f] = art.features[f]
                    writer.writerow(fields)

                except (IOError, AssertionError) as e:
                    print('Error processing article ' + url + ': ' + repr(e))
Example #20
def arxiv(ax_id):
    ''' Ask for arXiv identifier and return corresponding Article class. '''
    # Python 3 truncates leading zeros, but these can occur in arXiv identifiers
    # TODO: check!
    ax_id = str(ax_id).zfill(9)
    article_year = get_year(ax_id)
    abs_url = 'https://arxiv.org/abs/{}'.format(ax_id)
    src_abs = requests.get(abs_url)

    # obtain a _structured_ document ("tree") of source of abs_url
    page_tree = html.fromstring(src_abs.content)

    # extract title and abstract from page tree
    title = ' '.join(
        page_tree.xpath('//meta[@name="citation_title"]/@content'))
    abstract = ' '.join(
        page_tree.xpath('//meta[@property="og:description"]/@content'))
    # get main subject from page tree
    main_subject = page_tree.xpath(
        '//span [@class="primary-subject"]')[0].text_content()
    # first get all authors (format compatible with BibTeX)
    all_authors = page_tree.xpath('//meta[@name="citation_author"]/@content')
    if len(all_authors) > 1:
        authors_name = ' and '.join(all_authors)
    else:
        authors_name = all_authors[0]
    # second, create a short, 'contracted' authors' name, e.g. for a file name or BibTeX key
    authors_short_list = [a.split(', ')[0] for a in all_authors[:3]]
    if len(all_authors) > 3:
        authors_short = authors_short_list[0] + ' et al'
        authors_contracted = authors_short_list[0] + 'EtAl'
    elif 1 < len(all_authors) <= 3:
        authors_short = ', '.join(authors_short_list[:-1])
        authors_short += ' and ' + authors_short_list[-1]
        authors_contracted = ''.join(authors_short_list)
    else:
        authors_short = authors_short_list[0]  # TODO: IMPROVE!?!?
        authors_contracted = authors_short

    return article.Article(title=title,
                           authors=authors_name,
                           authors_short=authors_short,
                           authors_contracted=authors_contracted,
                           abstract=abstract,
                           ax_id=ax_id,
                           year=article_year,
                           main_subject=main_subject)
Example #21
def get_articles():
    print("Getting articles from: " + source_url)
    html = make_request()
    soup = BeautifulSoup(html, "html.parser")
    lis = soup.find_all("div", class_="item")

    arts = []

    for eles in lis:
        print()
        art = article.Article()
        art.title = eles.a.get("title")
        art.url = eles.a.get("href")
        art.full_url = root_url + art.url
        art.source = source_url
        arts.append(art)
    return arts
Example #22
def get_article_list():
    url = "http://www.chong4.com.cn"
    response = urllib.request.urlopen(url)
    line = response.read()
    soup = BeautifulSoup(line, "html.parser")
    nodes = soup.select(".textbox-title a")

    for node in nodes:
        ar = article.Article()
        ar.title = node.string
        ar.link = url + node.get("href")
        ar.id = ar.link.split("?")[1]
        re.hmset(ar.id, ar.__dict__)
        get_article_detail(ar.link, ar.id)

    re.save()
    return "ok"
Example #23
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # CREATION_DATE
    creation_date = '' if soup.find(
        'span', class_='stand') is None else soup.find('span',
                                                       class_='stand').string

    # HEADLINE
    dachzeile = '' if soup.find('span',
                                class_='dachzeile') is None else soup.find(
                                    'span', class_='dachzeile').string
    title = '' if soup.find('span', class_='headline') is None else soup.find(
        'span', class_='headline').string
    headline = dachzeile + ' - ' + title

    # TOPIC
    topic = '' if len(link.split("/")) < 3 else link.split("/")[3]

    # AUTHOR
    author = '' if soup.find('p',
                             class_='autorenzeile') is None else soup.find(
                                 'p', class_='autorenzeile').string
    author = '' if author is None else author.replace("Von ", "")
    author = '' if author is None else author.replace(",", "")

    # TEXT_BODY
    text_body = ''
    text_body_tag = soup.find_all('p', 'text')
    for ptag in text_body_tag:
        text_body = text_body + ptag.get_text()
    if text_body != '':
        text_body = ' '.join(text_body.split())
        text_body = text_body.replace(
            creation_date, "")  # remove the timestamp from the text

    # CLEAN TIME
    creation_date = creation_date.replace("Stand: ", "")
    creation_date = creation_date.replace(" Uhr", "")

    return article.Article(headline, link, text_body,
                           'https://www.tagesschau.de', 'tagesschau', author,
                           topic, date.today(), creation_date)
Example #24
 def __get_all_article(self):
     articles = []
     links = list(map(lambda x: re.sub("http://\\w+\\.10jqka.com\\.cn/",
                                       self.__m_url, x.get('href')), self.__soup.select(".arc-title > a")))
     i = 0
     while i < len(links):
         r = requests.get(links[i], headers=headers, allow_redirects=False)
         if r.status_code == 403:
             print("Too fast, Forbidden!")
             time.sleep(1234)
             continue
         try:
             articles.append(article.Article(self.type, r.content, r.encoding))
         except IndexError:
             pass
         finally:
             time.sleep(random.uniform(0.57, 1.08))
             i += 1
     return articles
Example #25
    def _init_article_(self, next_file, article_meta, body):
        id_ = self._get_ids(article_meta)
        pmc_tag = self._get_PMC_ids(article_meta)
        pmid = self._get_ids(article_meta)
        title = self._get_title(article_meta)
        try:
            temp = article_meta.find('abstract')
            if (temp is None):
                abstract = []
            else:
                abstract_sections = self._get_sections(temp)
                abstract = []
                for part in abstract_sections:
                    abstract.append([part[0], part[1]])
        except:
            lop = article_meta.find('abstract').findall('p')
            abstract = reduce(
                (lambda x, y: ''.join([x, ET.tostring(y).decode('utf-8')])),
                lop, "")
            if abstract == '':
                abstract = ET.tostring(
                    article_meta.find('abstract')).decode('utf-8')

        if not (body is None):
            text = self._get_sections(body)  #self._get_full_text(body)
            text.insert(0, ['Abstract', abstract])
        else:
            text = [['Abstract', abstract]]

        # store the path of this file
        art = article.Article(id_=id_, title=title, text=text)
        art.get_extra()['path'] = next_file
        art.get_extra()['PMC'] = pmc_tag
        text.insert(1, [
            "Title",
            [['Article Title', title], ['PubMed Id', pmid], ['PMC', pmc_tag]]
        ])

        # only get the abstract if the next_file is None or it doesn't exist
        if (not (abstract is None) and not (next_file is None)):
            art.get_extra()['abstract'] = abstract  # add the abstract in

        return art
Example #26
    def test_article_serialization(self):
        # Given
        name1 = 'mlotek'
        name2 = 'hammer'
        obj = article.Article('1', [name1, name2], 2, 2, True)
        expected = '''
        {
            "id": "1",
            "is_available": true,
            "name": ["mlotek", "hammer"],
            "quantity": 2,
            "total_quantity": 2
        }
        '''

        # When
        actual = str(obj)

        # Then
        self.assertEqual(''.join(expected.split()), ''.join(actual.split()))
Example #27
    def test_article_serialization2(self):
        # Given
        name1 = ''
        name2 = 'hammer'
        obj = article.Article('4', [name1, name2], 6, 5, False)
        expected = '''
        {
            "id": "4",
            "is_available": false,
            "name": ["", "hammer"],
            "quantity": 5,
            "total_quantity": 6
        }
        '''

        # When
        actual = str(obj)

        # Then
        self.assertEqual(''.join(expected.split()), ''.join(actual.split()))
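
The two tests above only pin down a whitespace-insensitive JSON form of the object. A minimal Article.__str__ that would satisfy them might look like the sketch below; the field names and constructor order are inferred from the expected JSON, not taken from the project:

import json

class Article:
    # hypothetical reconstruction; the argument order follows the calls in the tests
    def __init__(self, id, name, total_quantity, quantity, is_available):
        self.id = id
        self.name = name
        self.total_quantity = total_quantity
        self.quantity = quantity
        self.is_available = is_available

    def __str__(self):
        # sort_keys yields the alphabetical key order shown in the expected JSON
        return json.dumps(self.__dict__, sort_keys=True)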
Example #28
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = ''
    hl = soup.find('div', class_='artTitle')
    if hl is not None:
        headline = hl.get_text().strip()
    else:
        print(link)

    # TOPIC
    topic = ''
    t = soup.find("div", class_="sciezka")
    if t is not None:
        topic = t.findAll("a")[1].get_text()
    # AUTHOR
    author = ''

    # CREATION_DATE
    creation_date = ''
    d = soup.find('div', class_='artDate')
    if d is not None:
        creation_date = d.get_text().strip()

    # TEXT_BODY
    text_body = ''
    tb = soup.find('div', class_='artFull')

    if tb is None:
        print(link)
        return None

    for div in tb.find_all('div'):
        div.clear()

    text_body = tb.get_text().strip()

    return article.Article(headline, link, text_body,
                           'http://www.warsawvoice.pl', 'warsawvoice', author,
                           topic, date.today(), creation_date)
Example #29
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = soup.find('h1').string
    #headline = soup.find('div', class_='title').string

    # TOPIC
    topic = ''
    #if len(soup.find_all('div', class_='breadcrumbs')) > 0:
    #topic = soup.find_all('div', class_='breadcrumbs')[0].find_all('a')[1].get('title')
    #topic = soup.find("a", class_="active").get_text()

    # AUTHOR
    author = ''
    if soup.find('meta', itemprop='name'):
        author = soup.find('meta', itemprop='name').get('content')

    # TEXT_BODY
    text_body = soup.find_all('div', 'gl_plugin article')[0].get_text()
    text_body = ' '.join(text_body.split())

    # CREATION_DATE
    creation_date = ''
    #if soup.find('time'):
    #    creation_date = soup.find('time').get('datetime')

    if soup.find('meta', itemprop='datePublished'):
        creation_date = soup.find('meta',
                                  itemprop='datePublished').get('content')

    # CRAWL_DATE
    crawl_date = datetime.now()

    return article.Article(headline, link, text_body, 'https://www.se.pl',
                           'super-express', author, topic, crawl_date,
                           creation_date)
Example #30
    def _read_rss(self):
        """ Fetch and parse the site's RSS page.

        Builds Article objects from the feed entries and appends them
        to self.articles.
        """
        d = feedparser.parse(self.rss_url)

        data = {
            'title': d.feed.get("title"),
            'published': d.feed.get("published_parsed"),
            'updated': d.feed.get("updated_parsed")
        }

        # Create articles from the RSS feed
        for entry in d.entries:
            article_data = {
                'published_time': entry['published_parsed'],
                'title': entry['title'],
                'summary': entry['summary'],
                'url': entry['link'],
                'host_site': self.name
            }
            a = article.Article(**article_data)
            self.articles.append(a)