def main():
    """Entry point: verify config, sync publications, and fetch articles.

    Resolves the target publication name from argv (preferred) or from
    ``config.fetching_config``, then dispatches to the WordPress or
    iProfesional fetch pipeline.
    """
    if not config_found:
        print('config.py file must be present')
        return
    print('config.py found')
    create_publications()
    if len(sys.argv) < 2 and config.fetching_config['publication'] is None:
        print('You need to pass publication_name as argv. For example:')
        print('python fetch_wordpress_articles.py "CNN esp"')
        return
    if len(sys.argv) == 2:
        pub_name = sys.argv[1]
    else:
        pub_name = config.fetching_config['publication']
    pub = Publication.objects(name=pub_name).get()
    # BUG FIX: typo "ro" -> "to" in the log message.
    print(f'url to fetch: {pub.api_url}')
    # BUG FIX: the original compared the Publication document itself against
    # the string 'iProfesional' (always False, so the iProfesional pipeline
    # was unreachable); compare the publication's name field instead.
    if pub.name == 'iProfesional':
        art_to_db = iProfesional_to_db
        get_url = get_iProfesional_url
        get_articles = get_iProfesional_articles
    else:
        art_to_db = wordpress_to_db
        get_url = get_wp_url
        get_articles = get_wp_articles
    fetch_articles(pub_name,
                   art_to_db=art_to_db,
                   get_url=get_url,
                   get_articles=get_articles,
                   api_url=pub.api_url,
                   date_after=config.fetching_config['date_after'],
                   date_before=config.fetching_config['date_before'])
def get_articles():
    """Read filter params from the Flask request and return a page of articles.

    Query-string parameters: ``google``/``aws``/``azure`` (1 = require that
    cloud NER id), ``pub`` (publication id; defaults to the configured active
    publication), ``page``/``count`` (pagination), ``from``/``to`` (date range,
    %Y-%m-%d; defaults to 2000-01-01 .. today).

    Returns:
        tuple: (articles_page, pubs, cloud_args, from_date, to_date) where
        pubs is a list of [Publication, 'selected' or ''] pairs for the UI.
    """
    google = int(flask.request.args.get('google') or 0)
    aws = int(flask.request.args.get('aws') or 0)
    azure = int(flask.request.args.get('azure') or 0)
    selected_pub = (
        flask.request.args.get('pub')
        or Publication.objects(name=config.active_publication).first().id)
    page_num = int(flask.request.args.get('page') or 1)
    per_page = int(flask.request.args.get('count') or 10)
    from_date = flask.request.args.get('from') or '2000-01-01'
    to_date = (flask.request.args.get('to')
               or datetime.now().strftime("%Y-%m-%d"))
    # Each enabled flag requires the corresponding cloud NER id to be set.
    cloud_args = {}
    if google == 1:
        cloud_args['ner_google_id__ne'] = None
    if aws == 1:
        cloud_args['ner_aws_id__ne'] = None
    if azure == 1:
        cloud_args['ner_azure_id__ne'] = None
    articles_page = Article.objects(
        publication=selected_pub,
        **cloud_args,
        publish_date__gte=datetime.strptime(from_date, "%Y-%m-%d").date(),
        publish_date__lte=datetime.strptime(
            to_date, "%Y-%m-%d").date()).order_by('-publish_date').paginate(
                page=page_num, per_page=per_page)
    pubs = []
    for pub in Publication.objects():
        # BUG FIX: when the default publication is used, selected_pub is an
        # ObjectId, so comparing it directly to str(pub.id) was always False
        # and no option was marked selected; normalize both sides to str.
        selected = 'selected' if str(selected_pub) == str(pub.id) else ''
        pubs.append([pub, selected])
    return articles_page, pubs, cloud_args, from_date, to_date
def create_publications():
    """Connect to the database and upsert the configured publications.

    For every entry in ``config.publications``: create a new Publication
    document when none with that name exists, otherwise refresh its stored
    fields. Finally prints a summary of all publications in the DB.
    """
    connect(config.database['db_name'],
            host=config.database['host'],
            port=config.database['port'])
    print('Verificando publicaciones')
    fields = ('name', 'url', 'location', 'fetch_method', 'api_url')
    for pub_dict in config.publications:
        values = {field: pub_dict.get(field) for field in fields}
        matches = Publication.objects(name=pub_dict.get('name'))
        if len(matches) == 0:
            Publication(**values).save()
            print(f'Publication creada: {pub_dict.get("name")}')
        else:
            existing = matches.get()
            for field, value in values.items():
                setattr(existing, field, value)
            existing.save()
            print(f'Publication Modificada: {pub_dict.get("name")}')
    pubs = Publication.objects()
    print(f'Total de publicaciones en la db: {len(pubs)}')
    for p in pubs:
        print(f'- {p.name}')
    print('#################################')
def get_article_by_cms_id(publication_name, cms_id):
    """ Gets article from publication API by cms_id

    Looks up the publication's API URL in the database and fetches a single
    article by id. At the moment only WordPress-style APIs are supported.

    Parameters:
    publication_name (str): name of publication
    cms_id (str): id of article to fetch

    Returns:
    json: article as JSON
    """
    publication = Publication.objects(name=publication_name).get()
    url = get_wp_url_by_ids(publication['api_url'], [cms_id])
    return requests.get(url).json()[0]
def wordpress_to_db(art, publication_name):
    """ Converts wordpress data to mongodb format

    Parameters:
    art (json): wordpress article as JSON
    publication_name (str): name of publication

    Returns:
    Article | None: unsaved Article instance, or None when the article is a
    duplicate or has no content
    """
    publication = Publication.objects(name=publication_name).get()
    # Duplicate detection: match either by URL or by (publication, CMS id).
    duplicates = Article.objects(
        Q(url=art['link'])
        | Q(publication=publication, pub_art_id=str(art['id'])))
    if len(duplicates) > 0:
        # Duplicate!!!
        return None
    # WordPress returns a boolean instead of HTML when there is no content.
    # FIX: isinstance() instead of type() ==, and an explicit `return None`
    # (the original mixed a bare `return` with `return None`).
    if isinstance(art['content']['rendered'], bool):
        print('Article with no content')
        return None
    article = Article()
    article.title = art['title']['rendered']
    article.summary = art['excerpt']['rendered']
    article.text = art['content']['rendered']
    article.publish_date = datetime.datetime.strptime(art['date'],
                                                      "%Y-%m-%dT%H:%M:%S")
    article.url = art['link']
    # Normalize the author field to a list of strings.
    author = art['author']
    if isinstance(author, str):
        article.author = [author]
    elif isinstance(author, list):
        article.author = author
    elif isinstance(author, int):
        article.author = [str(author)]
    else:
        print('author error')
    article.publication = publication
    article.pub_art_id = str(art['id'])
    return article
def iProfesional_to_db(art, publication_name='iProfesional'):
    """ Converts iProfesional data to mongodb format

    Parameters:
    art (json): iProfesional article as JSON
    publication_name (str): should be iProfesional

    Returns:
    Article | None: unsaved Article instance, or None when the article is
    already in the database
    """
    publication = Publication.objects(name=publication_name).get()
    publication_id = str(art['id'])
    # Duplicate detection: match either by URL or by (publication, CMS id).
    duplicates = Article.objects(
        Q(url=art['absoluteUrl'])
        | Q(publication=publication, pub_art_id=publication_id))
    if len(duplicates) > 0:
        # Duplicate!!!
        return None
    article = Article()
    article.title = art['title']
    article.summary = art['summary']
    article.text = art['text']
    article.publish_date = art['publication']
    article.url = art['absoluteUrl']
    # Normalize the author field to a list of strings.
    # FIX: isinstance() instead of the type() == anti-pattern.
    author = art['author']
    if isinstance(author, str):
        article.author = [author]
    elif isinstance(author, list):
        article.author = author
    else:
        print('author error')
    article.publication = publication
    article.pub_art_id = publication_id
    return article
def save_training_tokens(self):
    """Tokenize every article of this publication and save training chunks.

    Iterates all Article documents for ``self.publication_name`` (newest
    first), tokenizes title and body with ``self.nlp``, and writes four
    parallel .npy files per chunk of ``self.chunk_size`` articles
    (all_/titles_/content_/ids_) under the training folder. Articles whose
    ids are already tokenized are skipped; chunk numbering continues after
    the existing files.

    Returns:
        int: 0 on success, -1 when the publication is missing or the DB has
        no articles.
    """
    already_tokenized_ids, n_files = self.get_tokenized_articles_list()
    print(f'Found {len(already_tokenized_ids)} already tokenized articles')
    try:
        publication = Publication.objects(name=self.publication_name).get()
    # FIX: a bare `except:` also swallowed SystemExit/KeyboardInterrupt.
    except Exception:
        print(
            'Problemas buscando publicación. Creo la publicación? Tiene el mismo nombre en config y config_train'
        )
        return -1
    print(publication)
    articles = Article.objects(
        publication=publication).order_by('-publish_date')
    N = articles.count()
    if N == 0:
        print(f'No hay articulos en la base de datos. Correr fetch_articles')
        return -1
    print(f'Total number to tokenize: {N}')

    training_data_folder = self.get_training_folder()
    if not os.path.exists(training_data_folder):
        os.makedirs(training_data_folder)

    def save_chunk(chunk, texts_titles, titles, texts, ids):
        # One chunk = four parallel .npy files sharing the chunk index.
        # Save order and log messages preserved from the original inline code.
        np.save(f'{training_data_folder}all_{chunk}.npy', texts_titles)
        np.save(f'{training_data_folder}titles_{chunk}.npy', titles)
        file_name = f'{training_data_folder}content_{chunk}.npy'
        np.save(file_name, texts)
        print()
        print(f'{file_name} saved!')
        np.save(f'{training_data_folder}ids_{chunk}.npy', ids)

    texts = []
    titles = []
    texts_titles = []
    ids = []
    # Continue chunk numbering after the files that already exist (+1 each).
    chunk = n_files
    for i, article in enumerate(articles):
        # Flush a full chunk before appending the next article.
        if len(ids) % self.chunk_size == 0 and len(ids) != 0:
            chunk += 1
            save_chunk(chunk, texts_titles, titles, texts, ids)
            texts = []
            titles = []
            texts_titles = []
            ids = []
        if str(article['id']) not in already_tokenized_ids:
            text, title = Train.article2text(article)
            print(f'\r{i}/{N}', end=' ')
            doc_text = self.nlp(text)
            doc_title = self.nlp(title)
            tokens_text = RelatedArticles.doc2tokens(doc_text)
            tokens_title = RelatedArticles.doc2tokens(doc_title)
            texts.append(tokens_text)
            titles.append(tokens_title)
            # "all" entries are title + newline separator + body tokens.
            texts_titles.append(tokens_title + ['\n'] + tokens_text)
            ids.append(str(article['id']))
    # Flush the final, possibly partial, chunk.
    if len(ids) > 0:
        chunk += 1
        save_chunk(chunk, texts_titles, titles, texts, ids)
    return 0
def fetch_articles(publication_name,
                   art_to_db=wordpress_to_db,
                   get_url=get_wp_url,
                   get_articles=get_wp_articles,
                   api_url=None,
                   per_page=50,
                   starting_page=1,
                   date_after=None,
                   date_before=None):
    """ Fetch articles from publication

    Fetch articles from publication by date and independent of publication
    (wordpress, iProfesional). Saves them to mongodb database

    Parameters:
    publication_name (str): name of publication in db. Needs it to get api
        url from db
    art_to_db (func): iProfesional_to_db or wordpress_to_db are the only
        supported for the moment
    get_url (func): get_wp_url or get_iProfesional_url are the only supported
        for the moment
    get_articles (func): get_iProfesional_articles or get_wp_articles are the
        only supported for the moment
    api_url (str): if not None rewrite api_url in db
    per_page (int): number of articles per page
    starting_page (int): page number - First page is 1
    date_after (str): Get articles after this date with format: '%Y-%m-%d'
    date_before (str): Get articles before this date with format: '%Y-%m-%d'

    Returns:
    None
    """
    publication = Publication.objects(name=publication_name).get()
    if api_url is not None:
        publication.api_url = api_url
        publication.save()
    # Most recently published article already stored for this publication.
    latest = Article.objects(
        publication=publication).order_by('-publish_date').limit(1).first()
    # BUG FIX: .first() returns a single document or None; the original also
    # tested len(articles) == 0, which raises TypeError on a document.
    if latest is None:
        print('No articles')
        if date_after is None:
            # Far-past default so the first fetch gets everything.
            date_after = datetime.date.fromtimestamp(-10000000000)
    elif date_after is None:
        # Resume from the newest article already in the DB.
        date_after = latest['publish_date']
    if date_before is None:
        date_before = datetime.datetime.now()
    url_endpoint = publication.api_url
    if url_endpoint is None:
        print('api_url not defined in publication')
        return
    page = starting_page
    total_pages = None
    while True:
        url = get_url(url_endpoint, page, per_page, date_after, date_before)
        if total_pages:
            print(f'\rPage: {page}/{total_pages} - {url}', end='')
        else:
            print(f'\rPage: {page} - {url}', end='')
        response = requests.get(url)
        articles, total_pages = get_articles(response)
        for article in articles:
            art = art_to_db(article, publication_name)
            if art is not None:
                art.save()
            else:
                print('\rAlready in DB')
        if page == total_pages or total_pages == 0:
            break
        page += 1
        # NOTE(review): presumably get_articles returns an error dict (with a
        # 'code' key) instead of a list on API failure — confirm; for a list
        # this membership test is False unless an article equals 'code'.
        if 'code' in articles:
            print()
            print(articles['code'])
            break