Exemple #1
0
def fetch_and_save_articles_in_date_range(from_date, to_date):
    """Fetch articles for a date range and upload their content as a dataset.

    Raw articles are requested from ``RawArticle.get_raw_articles`` with both
    bounds formatted as ``YYYY-MM-DD`` and converted with
    ``Article.build_articles``. Articles that have both a title and a
    description are flattened into a DataFrame, rows with a duplicate
    ``article_url`` are dropped, and the result is written as a tab-separated
    file that is uploaded with ``datasets.put`` to
    ``/input/article_content/<from_date>.csv``.

    When no article qualifies, a header-only file is uploaded.

    Args:
        from_date: Start of the range; must support ``strftime``. It also
            determines the name of the uploaded file.
        to_date: End of the range; must support ``strftime``.
    """
    date_format = '%Y-%m-%d'
    date_str = from_date.strftime(date_format)

    # Fetch raw articles
    raw_articles = RawArticle.get_raw_articles(date_str,
                                               to_date.strftime(date_format))

    # Build articles.
    # NOTE: the Article.bulk_insert(articles) call is disabled, so nothing is
    # written to the database here.
    articles = Article.build_articles(raw_articles)

    # Flatten the articles that have both a title and a description.
    columns = [
        'article_title',
        'article_uuid',
        'article_url',
        'article_description',
        'source_id',
        'published_at',
        'named_entities',
        'raw_content',
    ]
    records = [
        {
            'article_title': x.title,
            'article_uuid': x.article_uuid,
            'article_url': x.url,
            'article_description': x.description,
            'source_id': x.source_id,
            'published_at': x.published_at,
            'named_entities': x.named_entities,
            'raw_content': x.raw_content,
        } for x in articles
        if x.title is not None and x.description is not None
    ]
    # Explicit columns keep 'article_url' present even when there are no
    # records; without them drop_duplicates raises KeyError on an empty frame.
    df = (
        pd.DataFrame.from_records(records, columns=columns)
        .drop_duplicates(subset='article_url')
        .reset_index(drop=True)
    )

    # Store raw article content in datasets for later analysis. The temporary
    # file is removed when the ``with`` block exits, so the upload has to
    # happen inside it.
    with tempfile.NamedTemporaryFile() as tmp:
        df.to_csv(tmp.name, sep='\t', encoding='utf-8', index=False)
        datasets.put(tmp.name, f'/input/article_content/{date_str}.csv')
Exemple #2
0
def fetch_articles_for_date(date_to_parse):
    """Return the built articles for one day that pass a basic quality filter.

    The fetch window runs from ``date_to_parse`` to the following day, both
    formatted as ``YYYY-MM-DD`` and passed to
    ``RawArticle.get_raw_articles``. The raw results are converted with
    ``Article.build_articles``.

    Args:
        date_to_parse: Day to fetch; must support ``strftime`` and adding a
            ``timedelta``.

    Returns:
        A list of the built articles whose ``published_at`` is not ``None``
        and which have more than one named entity.
    """
    day_format = '%Y-%m-%d'
    next_day = date_to_parse + timedelta(days=1)

    # Fetch the raw articles for the one-day window and build them.
    fetched = RawArticle.get_raw_articles(
        date_to_parse.strftime(day_format),
        next_day.strftime(day_format),
    )
    built = Article.build_articles(fetched)

    # Keep only dated articles that mention more than one named entity.
    selected = []
    for article in built:
        if article.published_at is None:
            continue
        if len(article.named_entities) > 1:
            selected.append(article)
    return selected
from faculty import datasets
from datetime import datetime, timedelta

from models.article import Article
from models.raw_article import RawArticle

# Setup dates: the window runs from yesterday to today. Today's date is read
# once and yesterday is derived from it, so the two bounds cannot drift apart
# if the script happens to run across midnight.
to_date = datetime.today().date()
from_date = to_date - timedelta(days=1)

# Fetch raw articles
raw_articles = RawArticle.get_raw_articles(from_date.strftime('%Y-%m-%d'),
                                           to_date.strftime('%Y-%m-%d'))

# Build articles.
# NOTE: the Article.bulk_insert(articles) call is disabled, so nothing is
# written to the database here.
articles = Article.build_articles(raw_articles)

# Collect the content of articles that have both a title and a description,
# one row per unique URL, for later analysis. Explicit columns keep
# 'article_url' present even when no article qualifies; without them
# drop_duplicates raises KeyError on an empty frame.
df = pd.DataFrame.from_records(
    [
        {
            'article_title': x.title,
            'article_uuid': x.article_uuid,
            'article_url': x.url,
            'article_description': x.description,
            'source_id': x.source_id,
            'published_at': x.published_at,
            'named_entities': x.named_entities,
            'raw_content': x.raw_content,
        } for x in articles if x.title is not None and x.description is not None
    ],
    columns=[
        'article_title',
        'article_uuid',
        'article_url',
        'article_description',
        'source_id',
        'published_at',
        'named_entities',
        'raw_content',
    ],
).drop_duplicates(subset='article_url').reset_index(drop=True)