Example #1
    def create_article_obj(self, url, feed_id, feed_content=None):
        if feed_content is not None:
            parsed_text = self.parse_feed_text(feed_content)
            article_obj = Article(url, feed_id, article_text=parsed_text)
        else:
            article_obj = Article(url, feed_id)
        return article_obj
Example #2
    def setup_test(self):
        '''
        Give ourselves a way to quickly set up article storage.
        '''

        self.article_0 = Article(0, "Some title 0", "2016-09-22",
                                 "Some body text 0",
                                 ['tag0', 'tag1', 'tag2', 'tag3'])

        self.article_1 = Article(1, "Some title 1", "2016-09-22",
                                 "Some body text 1",
                                 ['tag0', 'tag4', 'tag5', 'tag6'])

        self.article_2 = Article(2, "Some title 2", "2016-09-23",
                                 "Some body text 2",
                                 ['tag0', 'tag1', 'tag2', 'tag3'])

        self.article_3 = Article(3, "Some title 3", "2016-09-23",
                                 "Some body text 3",
                                 ['tag0', 'tag1', 'tag2', 'tag3'])

        self.article_4 = Article(4, "Some title 4", "2016-09-23",
                                 "Some body text 4",
                                 ['tag0', 'tag1', 'tag2', 'tag3'])

        self.articles = Articles()
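For reference, a minimal sketch of the Article and Articles classes this test assumes; the constructor signature is taken from the calls above, while the storage internals are hypothetical:

class Article:
    # Positional (id, title, date, body, tags) constructor, matching the
    # calls in the test above.
    def __init__(self, id, title, date, body, tags):
        self.id = id
        self.title = title
        self.date = date
        self.body = body
        self.tags = tags


class Articles:
    # Hypothetical minimal in-memory store.
    def __init__(self):
        self._by_id = {}

    def add(self, article):
        self._by_id[article.id] = article
        return article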
Example #3
def get_article_full(alias):
    '''
    Return full article contents.
    '''
    db = get_db()
    article = Article(alias)
    article.load_all_data(db)
    db.commit()
    return article
Example #4
def article(id):
    if not id or id == "undefined":
        return redirect(url_for("index"))
    else:
        try:
            article = Article(os.path.join(ARTICLES_PATH, id + ".json"))
        except FileNotFoundError:
            return redirect(url_for("index"))
        return render_template("article.html", article=article)
Example #5
def find_translation(article, lang, cur):
    src_url = find_translation_url(article, lang)
    if not src_url:
        return None
    m = id_pattern.match(src_url)
    if m:
        aid = int(m.group(1))
        cur.execute('select * from articles where id = ?', (aid, ))
    else:
        cur.execute('select * from articles where url = ?', (src_url, ))
    row = cur.fetchone()
    if row is not None:
        return Article(*row)
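This helper relies on a module-level id_pattern; a plausible definition, assuming internal links carry the numeric article id in the first capture group (the exact URL scheme is a guess), would be:

import re

# Assumed definition: the first group must capture the numeric article id.
id_pattern = re.compile(r'^article://(\d+)$')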
Example #6
    def post(self):
        '''Endpoint POST /articles handles the receipt of article data in
        JSON format and stores it within the service.
        '''
        args = self.parser.parse_args()

        self.abort_if_article_aleady_exist(args['id'])

        article = Article(args['id'], args['title'], args['date'],
                          args['body'], args['tags'])

        result = self.storage.add(article)
        return jsonpickle.encode(result, unpicklable=False)
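A client call against this endpoint might look like the following; the host and port are assumptions, while the field names match the parser args above:

import requests

# Hypothetical client call; the base URL is an assumption.
payload = {
    "id": 1,
    "title": "Some title 1",
    "date": "2016-09-22",
    "body": "Some body text 1",
    "tags": ["tag0", "tag4", "tag5", "tag6"],
}
response = requests.post("http://localhost:5000/articles", json=payload)
print(response.status_code, response.text)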
Example #7
def main():
    parser = argparse.ArgumentParser(description='Dump all text')
    parser.add_argument('database', help='database to read articles from')
    parser.add_argument('lang', help='language to get articles for')
    args = parser.parse_args()

    conn = sqlite3.connect(args.database)
    cur = conn.cursor()

    cur.execute('select * from articles where lang = ?', (args.lang,))

    for article in cur.fetchall():
        article = Article(*article)
        print(article.entry.encode('utf8'))
Example #8
def main(filename):
    Base.metadata.create_all(engine)
    session = Session()
    articles = pd.read_csv(filename)

    for _, row in articles.iterrows():
        logger.info("Loading article %s into DB", row["uid"])
        article = Article(row["uid"], row["body"], row["host"],
                          row["newspaper_uid"], row["n_tokens_title"],
                          row["title"], row["url"])

        session.add(article)
    session.commit()
    session.close()
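This loader assumes SQLAlchemy scaffolding roughly like the following; the table name, column types, and database backend are assumptions inferred from the constructor call above:

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///articles.db')  # backend is an assumption
Session = sessionmaker(bind=engine)
Base = declarative_base()


class Article(Base):
    # Columns inferred from the constructor call in main(); types are guesses.
    __tablename__ = 'articles'

    uid = Column(String, primary_key=True)
    body = Column(String)
    host = Column(String)
    newspaper_uid = Column(String)
    n_tokens_title = Column(Integer)
    title = Column(String)
    url = Column(String, unique=True)

    def __init__(self, uid, body, host, newspaper_uid, n_tokens_title,
                 title, url):
        self.uid = uid
        self.body = body
        self.host = host
        self.newspaper_uid = newspaper_uid
        self.n_tokens_title = n_tokens_title
        self.title = title
        self.url = url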
Example #9
def generate_article():
    keywords = request.form.get("topic")
    if keywords is None:
        return render_template("home.html")

    keywords = [word.lower() for word in keywords.split(" ")]

    articles = []
    for file in os.listdir("articles/"):
        if file.endswith(".txt"):
            with open(os.path.join("articles/", file), "r") as f:
                text = f.read()
            source = file[:file.index("-")]
            articles.append(Article(text, source))

    # Rank articles by similarity to the requested keywords and keep only
    # those with a positive score.
    weighted_articles = [(similarity(art.vector, keywords), art)
                         for art in articles]
    weighted_articles = sorted(weighted_articles, key=lambda x: -x[0])
    weighted_articles = [pair for pair in weighted_articles if pair[0] > 0]

    # Use at most the three best-matching articles as the model.
    model = weighted_articles[:3]
    articles = [pair[1] for pair in model]

    generated_article, sources = group_sentences(articles)
    art_text = ""
    for sentence in generated_article:
        art_text += sentence[0] + " "
    if len(generated_article) > 0:
        title = create_title(art_text)
    else:
        title = "Sorry, we couldn't find any related articles!"
    # Decode the generated text for the template (the decode calls indicate
    # this code handles byte strings, i.e. it targets Python 2).
    tit_text = title.decode('utf8')
    art_text = art_text.decode('utf8')
    return render_template("home.html", title=tit_text, article=art_text)
Example #10
    def __update_category_articles(self, category):
        response = urllib2.urlopen(self.url + category, timeout=20)
        js_data = response.read().decode("ISO-8859-1").replace("\\'", "'").replace(",\n]", "]")
        for article_data in self.json_decoder.decode(js_data):
            # Fetch the image only when the article has one and has not
            # been stored already.
            img_data = None
            if article_data['image_url'] != '' and self.manager.find(article_data['id']) is None:
                try:
                    img_response = urllib2.urlopen(article_data['image_url'], timeout=20)
                    img_data = buffer(img_response.read())
                except Exception:
                    pass
            article = Article(article_id=article_data['id'],
                              category=category,
                              title=article_data['title'],
                              timestamp=article_data['timestamp'],
                              body=article_data['body'],
                              image=img_data,
                              read=0)
            self.manager.save(article)
Example #11
import os

from articles import Article        # assumed import, mirroring Example #14
from clustering import cluster      # assumed import for the cluster() call
from utils import vectorize_text
from utils import split_sentences
from utils import self_correlate
from utils import bow_caps
from utils import average_sentiment
from utils import words_sentiment
from utils import average_relation  # assumed import for the call below
from utils import create_title

CUTOFF = 0.5

articles = []
for file in os.listdir("articles/"):
    if file.endswith(".txt"):
        with open(os.path.join("articles/", file), "r") as f:
            text = f.read()
        source = file[:file.index("-")]
        articles.append(Article(text, source))

# Cluster the articles and bucket them by group id.
groups = cluster(articles)
num_groups = max(groups) + 1
groupings = [[] for _ in range(num_groups)]
for group_num, article in zip(groups, articles):
    groupings[group_num].append(article)

correlations = []
for group in groupings:
    correlations.append((average_relation(group), len(group)))

# The original snippet is truncated here; a plausible continuation drops
# groups whose average correlation falls below CUTOFF, iterating backwards
# so that deletion while looping is safe.
i = len(groupings) - 1
while i >= 0:
    group = groupings[i]
    if correlations[i][0] < CUTOFF:
        del groupings[i]
    i -= 1
Example #12
def parseArticleHtml(articleListFilePath):
    '''
    @summary: Parse the html page for each article by using BeautifulSoup and save them into JSON format.
    @return: Return a list of Article objects that contain details of each article.
    '''
    
    articleList = None
    with open(articleListFilePath) as data_file:    
        articleList = json.load(data_file)

    results = set()
    for i in xrange(len(articleList)):    
        article = Article()
        
        article = jsonhelper.simple_dict_to_object(articleList[i], article)
                        
        fileName = article.link[20:].replace('/', '-',3).replace('/', '') + '.html'
        #fileName = '2016-02-23-teens-marijuana-photos.html'
        filePath = constants.ArticleHtmlDir + fileName
        
        #filePath = constants.ArticleHtmlDir + '2016-02-18-amazon-studios-picks-up-untitled-woody-allen-movie.html'
                
        # parse html if file exists
        if(os.path.isfile(filePath)):
            articleFile = open(filePath)

            try: 
                bs = BeautifulSoup(articleFile, 'html.parser')
            
                # get total share
                shareNode = bs.find('div', {'class': 'total-shares'})
                if(shareNode):
                    article.shares = shareNode.get_text().replace('\n', '').replace('Shares', '')
                else:
                    shareNode = bs.find(lambda tag: tag.has_attr('data-shares'))
                    article.shares = shareNode.get('data-shares')
                    
                if(article.shares.endswith('k')):
                    article.shares = int(float(article.shares[:-1]) * 1000)
                
                article.shares = int(article.shares)

                # Get Number of Links
                links = bs.find_all('a')
                article.num_hrefs =  len(links)

                # Get links to other articles
                otherArticleLinks = bs.find_all('a', {'href': lambda value: value and re.search('mashable.com/\d{4}/\d{2}/\d{2}/', value)})
                article.num_self_hrefs = len(otherArticleLinks)

                # Get content tag
                contentTag = bs.find('section', {'class': lambda value: value and value.startswith('article-content')})
                                
                #video type article is different
                if(not contentTag):
                    contentTag = bs.find('section', {'id': 'video-matting'})
                    
                # now another type, seems post photos
                if(not contentTag):
                    contentTag = bs.find('div', {'id': 'skrollr-body'})
                    
                #also some article in iframe
                if(not contentTag): 
                    iframeDivTag = bs.find(lambda tag: tag.has_attr('data-url'))
                    
                    if(iframeDivTag):
                        iframeUrl = iframeDivTag.get('data-url')
                        
                        res = requests.get(iframeUrl)
                        iframeContent = res.text
                        
                        bsIframe = BeautifulSoup(iframeContent, 'html.parser')
                        contentTag = bsIframe.find('div', {'id': 'content'})

                # Get number of images in the article
                images = contentTag.find_all('img')
                if(images):
                    article.num_imgs = len(images)

                # Get number of videos in the article
                youtubeVideos = contentTag.find_all(lambda tag: tag.has_attr('src') and 'youtube.com' in tag.get('src'))
                ooyalaVideos = contentTag.find_all(lambda tag: tag.has_attr('data-video'))
                article.num_videos = len(youtubeVideos) + len(ooyalaVideos)
                
                # get topics
                footerTopicsTag = bs.find('footer', {'class': 'article-topics'})
                
                if(footerTopicsTag):
                    article.topics = footerTopicsTag.get_text().replace("Topics:", "").replace("\n", "")
                else:
                    # assume it is from iframe if not found in footer
                    jsTag  = bs.find("script", {'type': 'application/ld+json'})
                    scriptContent = jsTag.get_text()
                    dic = json.loads(scriptContent.decode('utf-8'))
                    #print dic
                    #dic = ast.literal_eval(scriptContent)
                    article.topics = dic['metadata']['omnitureData']['topics']

                # get Days between the article publication and the dataset created
                post_date = datetime.strptime(article.post_date[0:19], '%Y-%m-%dT%H:%M:%S')                 
                article.timedelta = (datetime.now() - post_date).days
                
                # get number of keywords from meta in head
                keywords = bs.head.find('meta', {'name': 'keywords'}).get('content')
                #print 'keywords: ' + keywords
                article.num_keywords = len(keywords.split(','))

                contentBlob = TextBlob(article.content)
                                
                # Number of words in the content
                article.n_tokens_content = len(contentBlob.words)
                
                # article sentiment
                article.content_sentiment_polarity = contentBlob.sentiment.polarity
                article.content_subjectivity = contentBlob.sentiment.subjectivity
                

                titleBlob = TextBlob(article.title)

                # Number of words in the title
                article.n_tokens_title = len(titleBlob.words)

                # title sentiment
                article.title_sentiment_polarity = titleBlob.sentiment.polarity
                article.title_subjectivity = titleBlob.sentiment.subjectivity
                
                #results.add(article.to_dict())
                results.add(article)
                
                #print article
                print 'Parsed: ' + fileName
            except Exception as ex: 
                print 'Error in: ', fileName
                traceback.print_exc()
            finally:
                articleFile.close()
        else:
            print 'File not found: ' + fileName
         
        '''
        i += 1
        if(i > 3):   
            break
        '''
            
    return results
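A driver for this parser might look roughly like the following; the input and output file names are assumptions, and to_dict() is taken from the commented-out line in the function body:

# Hypothetical driver; file names are assumptions.
if __name__ == '__main__':
    articles = parseArticleHtml('articleList.json')
    with open('parsedArticles.json', 'w') as out_file:
        json.dump([article.to_dict() for article in articles], out_file)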
Example #13
    def article_pairs(trg_lang, src_lang):
        trg_cur.execute('select * from articles where lang = ?', (trg_lang, ))
        for article in trg_cur.fetchall():
            trg_article = Article(*article)
            src_article = find_translation(trg_article, src_lang, src_cur)
            yield trg_article, src_article
Example #14
#!/usr/bin/env python
from clustering import get_clusters
from articles import Article


def accuracy(actual_good, actual_bad, predicted_good, predicted_bad):
    bad_count = 0
    for bad in predicted_bad:
        if bad in actual_bad:
            bad_count += 1
    good_count = 0
    for good in predicted_good:
        if good in actual_good:
            good_count += 1
    # The original returned nothing; returning the fraction of correct
    # predictions is an assumed fix.
    return (good_count + bad_count) / float(len(actual_good) + len(actual_bad))


if __name__ == '__main__':
    # articles_good = Article.read_full('./atrinkti_saulius/geri_straipsniai')
    # articles_bad = Article.read_full('./atrinkti_saulius/blogi_straipsniai')
    articles_good = Article.read_full('./atrinkti_ginte/geri')
    articles_bad = Article.read_full('./atrinkti_ginte/blogi')
    articles = articles_good + articles_bad
    clusters = get_clusters(articles)
    import pdb
    pdb.set_trace()
Example #15
def get_articles():
    articles = []
    for article_json in os.listdir(ARTICLES_PATH):
        articles.append(Article(os.path.join(ARTICLES_PATH, article_json)))
    active_articles = [article for article in articles if article.active]
    return active_articles