Esempio n. 1
0
def addNewsArticle(companyModel, newsSourceModel, url):
    """Scrape a news article page and persist it as a News row.

    Args:
        companyModel: company model instance the article is linked to.
        newsSourceModel: news-source model instance the article came from.
        url: URL of the article page to fetch and parse.
    """
    tree = html.fromstring(requests.get(url).text)

    # Article title: first <h1> text node on the page.
    title = tree.xpath('//h1/text()')[0]

    # Article body: concatenate every text node under #articleText.
    # str.join is linear; the previous `text += t` loop was quadratic.
    text = "".join(tree.xpath("//*[@id='articleText']//*/text()"))

    # Timestamp text matches "%a %b %d, %Y %I:%M%p" after dropping the
    # last 4 characters — presumably a timezone suffix like " EST";
    # TODO confirm the suffix is always exactly 4 characters.
    dateText = tree.xpath("//div[@id='articleInfo']//span[@class='timestamp']/text()")[0]
    date = datetime.datetime.strptime(dateText[:-4], "%a %b %d, %Y %I:%M%p").date()

    News.create(
        company=companyModel,
        newsSource=newsSourceModel,
        title=title,
        text=text,
        url=url,
        date=date,
    )
    # print() call form works on both Python 2 and 3; the original used
    # the Python-2-only print statement.
    print("Added: " + title)
Esempio n. 2
0
 def post(self, *args):
     """Persist a news item submitted from the form, write a confirmation,
     and redirect to the category listing.

     args[0] is the category id taken from the URL route.
     """
     category_id = args[0]
     # Keyword arguments evaluate left-to-right, so the form fields are
     # still read before the author lookup, exactly as before.
     News.create(
         title=self.get_argument("news-title"),
         body=self.get_argument("news-body"),
         date=self.get_argument("news-date"),
         author=self.GetAuthorIdByEmail(self.current_user),
         category=category_id,
     )
     self.write("Item saved...")
     self.redirect("/category")
Esempio n. 3
0
def get_news():
    """Search Google for each configured keyword and collect fresh results.

    For every keyword, scrape up to three organic results from a random
    results page, skip references already stored in the database, score
    each new result by how often the keyword appears in its title+excerpt,
    persist it, and return all new results as JSON sorted by frequency
    (highest first).

    Returns:
        A Flask JSON response: a list of dicts with keys keyword, title,
        reference, excerpt and frequency.
    """
    results = []

    for keyword in keywords:
        # Random page offset (0, 10, ..., 170) so repeated runs don't
        # always scrape the same first results page.
        start = random.randrange(0, 180, 10)
        url = "https://google.com/search?q=" + keyword + "&start=" + str(start)
        search = requests.get(url, headers=HEADERS)

        # Only parse successful responses; skip the keyword otherwise.
        if search.status_code != 200:
            continue

        response = BeautifulSoup(search.content, "html.parser")

        # Google wraps each organic result in a div with class 'rc'.
        for result in response.find_all('div', class_='rc', limit=3):
            anchors = result.find_all('a')
            if not anchors:
                continue

            # The first anchor's href is the result's target URL.
            reference = anchors[0]['href']

            # Skip references already saved in the database.
            if News.query.filter_by(reference=reference).first():
                continue

            # Bug fix: find() returns None when the tag/class is missing,
            # and the original then crashed on `.text` — skip such results.
            title_tag = result.find('h3')
            excerpt_tag = result.find('span', class_='st')
            if title_tag is None or excerpt_tag is None:
                continue
            title = title_tag.text
            excerpt = excerpt_tag.text

            # Lower-case excerpt+title (original concatenation order kept)
            # and count the keyword's word frequency.
            full_text = excerpt.lower() + title.lower()
            frequency = FreqDist(full_text.split(" "))[keyword]

            news = {
                "keyword": keyword,
                "title": title,
                "reference": reference,
                "excerpt": excerpt,
                "frequency": frequency,
            }
            results.append(news)

            # Persist the new result. (The original assigned this to
            # `new`, shadowing the loop variable; the return value was
            # never used, so no binding is needed.)
            News.create(news['title'], news['excerpt'],
                        news['reference'], news['keyword'],
                        news['frequency'])

    return jsonify(sorted(results, key=lambda k: k['frequency'], reverse=True))