Esempio n. 1
0
def main():
    """
    Command-line entry point: fetch the articles of one publication.

    The publication name is taken from the command line when exactly one
    argument is given, otherwise from config.fetching_config['publication'].
    When neither source yields a name, a usage message is printed and nothing
    is fetched. The iProfesional handlers are used for the publication named
    'iProfesional'; every other publication uses the wordpress handlers.

    Returns:
    None
    """
    if not config_found:
        print('config.py file must be present')
        return

    print('config.py found')
    create_publications()

    if len(sys.argv) == 2:
        pub_name = sys.argv[1]
    else:
        pub_name = config.fetching_config['publication']

    # Covers both "no argument given" and "several arguments given" when the
    # config has no publication either; previously the latter case fell
    # through and queried the database with name=None.
    if pub_name is None:
        print('You need to pass publication_name as argv. For example:')
        print('python fetch_wordpress_articles.py "CNN esp"')
        return

    pub = Publication.objects(name=pub_name).get()
    print(f'url ro fetch: {pub.api_url}')
    # Compare the publication *name*: comparing the Publication document
    # itself to a string is never equal, which made this branch unreachable.
    if pub_name == 'iProfesional':
        art_to_db = iProfesional_to_db
        get_url = get_iProfesional_url
        get_articles = get_iProfesional_articles
    else:
        art_to_db = wordpress_to_db
        get_url = get_wp_url
        get_articles = get_wp_articles
    fetch_articles(pub_name,
                   art_to_db=art_to_db,
                   get_url=get_url,
                   get_articles=get_articles,
                   api_url=pub.api_url,
                   date_after=config.fetching_config['date_after'],
                   date_before=config.fetching_config['date_before'])
Esempio n. 2
0
def get_articles():
    """
    Build the article listing requested through the query string.

    Reads these query-string arguments from the current flask request:
    google, aws, azure (int flags): when 1, only articles whose matching
        ner_*_id field is not None are returned
    pub (str): id of the publication to list; defaults to the id of the
        publication named config.active_publication
    page (int): page number, defaults to 1
    count (int): articles per page, defaults to 10
    from, to (str): inclusive publish date range as '%Y-%m-%d'; default to
        '2000-01-01' and today's date

    Returns:
    tuple: (articles_page, pubs, cloud_args, from_date, to_date) where
        articles_page is the paginated query result, pubs is a list of
        [publication, 'selected' or ''] pairs, cloud_args is the dict of
        filters that were applied and from_date / to_date are the date strings
        that were used
    """
    google = int(flask.request.args.get('google') or 0)
    aws = int(flask.request.args.get('aws') or 0)
    azure = int(flask.request.args.get('azure') or 0)
    # Always keep the selected id as a string: the value coming from the
    # query string is a string and it is compared with str(pub.id) below.
    # Leaving the default as a raw id meant the default publication was never
    # marked as selected.
    selected_pub = (
        flask.request.args.get('pub')
        or str(Publication.objects(name=config.active_publication).first().id))
    page_num = int(flask.request.args.get('page') or 1)
    per_page = int(flask.request.args.get('count') or 10)
    from_date = flask.request.args.get('from') or '2000-01-01'
    to_date = flask.request.args.get('to') or datetime.now().strftime(
        "%Y-%m-%d")

    cloud_args = {}
    if google == 1:
        cloud_args['ner_google_id__ne'] = None
    if aws == 1:
        cloud_args['ner_aws_id__ne'] = None
    if azure == 1:
        cloud_args['ner_azure_id__ne'] = None

    # Parse each bound once and reuse it for the trace and for the query.
    from_day = datetime.strptime(from_date, "%Y-%m-%d").date()
    print(from_day)
    to_day = datetime.strptime(to_date, "%Y-%m-%d").date()

    articles_page = Article.objects(
        publication=selected_pub,
        **cloud_args,
        publish_date__gte=from_day,
        publish_date__lte=to_day).order_by('-publish_date').paginate(
            page=page_num, per_page=per_page)

    pubs = [[pub, 'selected' if selected_pub == str(pub.id) else '']
            for pub in Publication.objects()]
    return articles_page, pubs, cloud_args, from_date, to_date
Esempio n. 3
0
def create_publications():
    """
    Synchronize the publications listed in config with the database.

    Connects to the database described by config.database and, for every
    entry of config.publications, creates a Publication when none exists with
    that name or overwrites the fields of the existing one. Finally prints the
    name of every publication stored.

    Returns:
    None
    """
    db_settings = config.database
    connect(db_settings['db_name'],
            host=db_settings['host'],
            port=db_settings['port'])
    print('Verificando publicaciones')

    field_names = ('name', 'url', 'location', 'fetch_method', 'api_url')
    for pub_dict in config.publications:
        values = {field: pub_dict.get(field) for field in field_names}
        matches = Publication.objects(name=values['name'])
        if len(matches) > 0:
            # Already stored: refresh every field from the config entry.
            existing = matches.get()
            for field in field_names:
                setattr(existing, field, values[field])
            existing.save()
            print(f'Publication Modificada: {values["name"]}')
        else:
            Publication(**values).save()
            print(f'Publication creada: {values["name"]}')

    stored = Publication.objects()
    print(f'Total de publicaciones en la db: {len(stored)}')
    for publication in stored:
        print(f'- {publication.name}')
    print('#################################')
Esempio n. 4
0
def get_article_by_cms_id(publication_name, cms_id):
    """
    Gets one article from the publication API by its cms_id

    Looks up the publication's api_url in the database, builds the request
    url with get_wp_url_by_ids and returns the first element of the JSON
    response. At the moment only wordpress is supported

    Parameters:
    publication_name (str): name of publication
    cms_id (str): id of article to fetch

    Returns:
    json: article as JSON
    """
    publication = Publication.objects(name=publication_name).get()
    request_url = get_wp_url_by_ids(publication['api_url'], [cms_id])
    payload = requests.get(request_url).json()
    return payload[0]
Esempio n. 5
0
def wordpress_to_db(art, publication_name):
    """
    Builds an (unsaved) Article from a wordpress article

    Parameters:
    art (json): wordpress article as JSON
    publication_name (str): name of publication

    Returns:
    Article: the new article, or None when an article with the same url or
        the same publication and id is already stored, or when the rendered
        content is a bool instead of text
    """
    publication = Publication.objects(name=publication_name).get()
    same_url = Q(url=art['link'])
    same_id = Q(publication=publication, pub_art_id=str(art['id']))
    if len(Article.objects(same_url | same_id)) > 0:
        # Already stored, nothing to build.
        return None

    if type(art['content']['rendered']) is bool:
        print('Article with no content')
        return None

    article = Article()
    article.title = art['title']['rendered']
    article.summary = art['excerpt']['rendered']
    article.text = art['content']['rendered']
    article.publish_date = datetime.datetime.strptime(
        art['date'], "%Y-%m-%dT%H:%M:%S")
    article.url = art['link']

    # Exact type checks on purpose: isinstance(x, int) would also accept bool.
    raw_author = art['author']
    author_type = type(raw_author)
    if author_type is str:
        article.author = [raw_author]
    elif author_type is list:
        article.author = raw_author
    elif author_type is int:
        article.author = [str(raw_author)]
    else:
        print('author error')

    article.publication = publication
    article.pub_art_id = str(art['id'])
    return article
Esempio n. 6
0
def iProfesional_to_db(art, publication_name='iProfesional'):
    """
    Builds an (unsaved) Article from an iProfesional article

    Parameters:
    art (json): iProfesional article as JSON
    publication_name (str): should be iProfesional

    Returns:
    Article: the new article, or None when an article with the same url or
        the same publication and id is already stored
    """
    publication = Publication.objects(name=publication_name).get()
    art_id = str(art['id'])
    same_url = Q(url=art['absoluteUrl'])
    same_id = Q(publication=publication, pub_art_id=art_id)
    if len(Article.objects(same_url | same_id)) > 0:
        # Already stored, nothing to build.
        return None

    article = Article()
    article.title = art['title']
    article.summary = art['summary']
    article.text = art['text']
    article.publish_date = art['publication']
    article.url = art['absoluteUrl']

    # Only plain str and list authors are accepted (exact type match).
    raw_author = art['author']
    author_type = type(raw_author)
    if author_type is str:
        article.author = [raw_author]
    elif author_type is list:
        article.author = raw_author
    else:
        print('author error')

    article.publication = publication
    article.pub_art_id = art_id
    return article
Esempio n. 7
0
    def save_training_tokens(self):
        """
        Tokenize the publication's articles and save them as .npy chunks.

        Articles of the publication named self.publication_name are read from
        newest to oldest. Articles whose id is reported by
        self.get_tokenized_articles_list() are skipped; the others are run
        through self.nlp and their tokens are buffered. Every
        self.chunk_size buffered articles, and once more at the end for the
        remainder, four files are written to the training folder:
        all_<n>.npy (title + '\\n' + text tokens), titles_<n>.npy,
        content_<n>.npy and ids_<n>.npy. Numbering continues after the number
        of files reported by self.get_tokenized_articles_list().

        Returns:
        int: 0 on success, -1 when the publication cannot be loaded or it has
            no articles
        """
        already_tokenized_ids, n_files = self.get_tokenized_articles_list()
        print(f'Found {len(already_tokenized_ids)} already tokenized articles')
        try:
            publication = Publication.objects(name=self.publication_name).get()
        except Exception:
            # Best effort lookup, but no longer a bare except: Ctrl-C and
            # SystemExit must not be reported as a missing publication.
            print(
                'Problemas buscando publicación. Creo la publicación? Tiene el mismo nombre en config y config_train'
            )
            return -1
        print(publication)
        articles = Article.objects(
            publication=publication).order_by('-publish_date')
        N = articles.count()

        if N == 0:
            print(
                f'No hay articulos en la base de datos. Correr fetch_articles')
            return -1
        print(f'Total number to tokenize: {N}')

        texts = []
        titles = []
        texts_titles = []
        ids = []
        # Start from the number of chunks that already exist so new files
        # continue the numbering at n_files + 1.
        chunk = n_files
        training_data_folder = self.get_training_folder()
        os.makedirs(training_data_folder, exist_ok=True)

        def save_chunk(chunk_number):
            # Write the four parallel buffers of one chunk and empty them.
            buffers = (('all', texts_titles),
                       ('titles', titles),
                       ('content', texts))
            for prefix, tokens in buffers:
                file_name = f'{training_data_folder}{prefix}_{chunk_number}.npy'
                np.save(file_name, tokens)
                tokens.clear()

            print()
            # Only the last file written (content) is reported, as before.
            print(f'{file_name} saved!')
            np.save(f'{training_data_folder}ids_{chunk_number}.npy', ids)
            ids.clear()

        for i, article in enumerate(articles):
            # A full buffer is flushed at the start of the next iteration.
            if len(ids) % self.chunk_size == 0 and len(ids) != 0:
                chunk += 1
                save_chunk(chunk)

            article_id = str(article['id'])
            if article_id in already_tokenized_ids:
                continue

            text, title = Train.article2text(article)
            print(f'\r{i}/{N}', end=' ')
            tokens_text = RelatedArticles.doc2tokens(self.nlp(text))
            tokens_title = RelatedArticles.doc2tokens(self.nlp(title))
            texts.append(tokens_text)
            titles.append(tokens_title)
            texts_titles.append(tokens_title + ['\n'] + tokens_text)
            ids.append(article_id)

        # Save whatever is left in the buffers after the last article.
        if len(ids) > 0:
            chunk += 1
            save_chunk(chunk)

        return 0
Esempio n. 8
0
def fetch_articles(publication_name,
                   art_to_db=wordpress_to_db,
                   get_url=get_wp_url,
                   get_articles=get_wp_articles,
                   api_url=None,
                   per_page=50,
                   starting_page=1,
                   date_after=None,
                   date_before=None):
    """
    Fetch articles from publication

    Fetch articles from publication by date and independent of publication (wordpress, iProfesional). Saves them to mongodb database.
    Pages are requested one by one until the last page is reached, the API reports zero pages, or the response carries an error 'code'.

    Parameters:
    publication_name (str): name of publication in db. Needs it to get api url from db
    art_to_db (func): iProfesional_to_db or wordpress_to_db are the only supported for the moment
    get_url (func):  get_wp_url or get_iProfesional_url are the only supported for the moment
    get_articles (func): get_iProfesional_articles or get_wp_articles are the only supported for the moment. Called with the response, must return (articles, total_pages)
    api_url (str): if not None rewrite api_url in db
    per_page (int): number of articles per page
    starting_page (int): page number - First page is 1
    date_after (str): Get articles after this date with format: '%Y-%m-%d'. If None, the publish date of the newest stored article is used, or a date far in the past when nothing is stored
    date_before (str): Get articles before this date with format: '%Y-%m-%d'. If None, the current date and time is used

    Returns:
    None
    """
    publication = Publication.objects(name=publication_name).get()
    if api_url is not None:
        publication.api_url = api_url
        publication.save()

    # Newest stored article of this publication (a single document or None).
    latest_article = Article.objects(
        publication=publication).order_by('-publish_date').limit(1).first()
    if latest_article is None or len(latest_article) == 0:
        print('No articles')
        if date_after is None:
            # Same far-past date the original obtained with
            # date.fromtimestamp(-10000000000), computed with date arithmetic
            # because negative timestamps are rejected on some platforms.
            date_after = (datetime.date(1970, 1, 1)
                          + datetime.timedelta(seconds=-10000000000))
    elif date_after is None:
        date_after = latest_article['publish_date']

    if date_before is None:
        date_before = datetime.datetime.now()

    url_endpoint = publication.api_url
    if url_endpoint is None:
        print('api_url not defined in publication')
        return

    page = starting_page
    total_pages = None
    while True:
        url = get_url(url_endpoint, page, per_page, date_after, date_before)

        if total_pages:
            print(f'\rPage: {page}/{total_pages} - {url}', end='')
        else:
            print(f'\rPage: {page} - {url}', end='')

        response = requests.get(url)
        page_articles, total_pages = get_articles(response)

        # An error payload carries a 'code' entry instead of articles
        # (assumption taken from the original check - confirm against the
        # get_articles implementations). It must be detected before the
        # payload is iterated as if it were a list of articles.
        if 'code' in page_articles:
            print()
            print(page_articles['code'])
            break

        for article in page_articles:
            art = art_to_db(article, publication_name)
            if art is not None:
                art.save()
            else:
                print('\rAlready in DB')

        # '>=' so that a starting_page beyond the last page also terminates.
        # An unknown page count (None) keeps paging, as before.
        if total_pages == 0 or (total_pages is not None
                                and page >= total_pages):
            break
        page += 1