def run(self, query=None, force=False, authenticate=False):
    '''Import documents from MongoDB into the document store.

    Parameters
    ----------
    query : dict (default=None)
        MongoDB query used to select documents to import; the default
        (None) is treated as an empty query, i.e. all documents.
    force : bool (default=False)
        passed through to ``_save_document`` as ``forced``
    authenticate : bool (default=False)
        whether to authenticate against MongoDB with the configured
        username/password before reading the collection
    '''
    self.version = '.1'
    self.functiontype = 'importer'
    self.date = datetime.datetime(year=2017, month=4, day=4)

    # Connection settings all come from the project-wide config file.
    databasename = config.get('mongodb', 'databasename')
    collectionname = config.get('mongodb', 'collectionname')
    username = config.get('mongodb', 'username')
    password = config.get('mongodb', 'password')

    client = pymongo.MongoClient(config.get('mongodb', 'url'))
    db = client[databasename]
    if authenticate:
        db.authenticate(username, password)
    collection = db[collectionname]

    # Destination (elasticsearch) field name -> source (mongo) key.
    # NOTE: the original literal listed 'url' twice; duplicate keys in a
    # dict literal collapse silently, so only one entry is kept here.
    mapping = {
        'doctype': 'source',
        'url': 'url',
        '_id': 'rssidentifier',
        'publication_date': 'datum',
        'text': 'text',
        'teaser': 'teaser',
        'title': 'title',
        'byline': 'byline',
        'bylinesource': 'bylinesource',
        'category': 'section',
    }

    if query is None:
        # avoid a mutable default argument; {} selects all documents
        query = {}
    input_iterable = collection.find(query)
    for num, inputdoc in enumerate(input_iterable):
        document = {}
        for k, v in mapping.items():
            try:
                document[k] = inputdoc[v]
            except KeyError:
                # missing source keys are expected for some documents;
                # the narrowed except no longer hides unrelated errors
                logger.debug('key {} not found'.format(k))
        logger.info('processing {num}'.format(num=num))
        self.doctype = document.get('doctype')
        document = self._add_metadata(document)
        self._verify(document)
        try:
            self._save_document(document, forced=force)
            logger.debug("Stored document {} in ES".format(num))
        except Exception as e:
            # best-effort import: log and continue with the next document
            logger.warning(
                "ACK, unable to import document number {num}: {e}".format(
                    num=num, e=e))
def _doctype_query_or_list(doctype_query_or_list, force=False, field=None, task=None): ''' This function helps other functions dynamically interpret the argument for document selection. It allows for either a list of documents, an elasticsearch query, a string-query or a doctype string to be provided and returns an iterable containing these documents. Parameters ---------- doctype_query_or_list: list, string or dict specification of input document, either: a list, each element should be an elasticsearch document a dict, should be an elasticsearch query a string, which is either an exact match with doctype (checked against doctype mappings) or alternatively, a query_string for the elasticsearch database force: bool (defautl=False) whether existing fields should be re-computed. Used to subset to documents were field is missing. field: string (default=None) Field on which operations are done, used to check when force=False task: string (default=None) Function for which the documents are used. Argument is used only to generate the expected outcome fieldname, i.e. 
<field>_<function> Returns ------- Iterable ''' if type(doctype_query_or_list) == list: documents = doctype_query_or_list elif type(doctype_query_or_list) == str: if doctype_query_or_list in core.database.client.indices.get_mapping()[ config.get('elasticsearch', 'document_index')]['mappings'].keys(): logger.info("assuming documents of given type should be processed") if force or not field: documents = core.database.scroll_query({ 'filter': { 'match': { 'doctype': "%s" % doctype_query_or_list } } }) elif not force and field: logger.info( "force=False, ignoring documents where the result key exists (and has non-NULL value)" ) documents = core.database.scroll_query({ 'filter': { 'and': [{ 'match': { 'doctype': doctype_query_or_list } }, { 'missing': { 'field': '%s_%s' % (field, task) } }] } }) else: logger.info("assuming input is a query_string") if force or not field: documents = core.database.scroll_query({ 'filter': { 'query_string': { 'query': doctype_query_or_list } } }) elif not force and field: logger.info( "force=False, ignoring documents where the result key exists (and has non-NULL value)" ) documents = core.database.scroll_query({ 'filter': { 'and': [{ 'missing': { 'field': '%s_%s' % (field, task) } }, { 'query_string': { 'query': doctype_query_or_list } }] } }) else: if not force and field and task and not doctype_query_or_list: field = '%s_%s' % (field, task) doctype_query_or_list.update( {'filter': { 'missing': { 'field': field } }}) documents = core.search_utils.scroll_query(doctype_query_or_list) return documents
# Standard library
import argparse
import datetime
import logging  # was missing: logging.basicConfig below requires it

# Third party
from celery import Celery, group, chain, chord
from flask import Flask

# Local
import core
import core.celerybeat_schedule  # used below; may also be exposed via core's __init__ — explicit is safer
import core.search_utils
import core.taskmanager
import configparser
import processing  # helps celery recognize the processing tasks
import scrapers  # helps celery recognize the scraping tasks
import clients  # helps celery recognize client tasks
import analysis  # helps celery recognize analysis tasks
from core.database import config

logging.basicConfig(level=config.get("inca", "loglevel"))

# When True, tasks are run in-process instead of being dispatched to workers.
LOCAL_ONLY = config.get('inca', 'local_only') == "True"

api = Flask(__name__)

# Celery app: backend/broker are selected per the configured dependency set.
taskmaster = Celery(
    backend=config.get('celery',
                       '%s.backend' % config.get('inca', 'dependencies')),
    broker=config.get('celery',
                      '%s.broker' % config.get('inca', 'dependencies')),
)
taskmaster.conf.update(
    CELERYBEAT_SCHEDULE=core.celerybeat_schedule.get_scheduler())

# Task namespaces exposed through the API.
expose = ["scrapers", "processing", "analysis", "clients", "inca"]
# -*- coding: utf-8 -*- from core.processor_class import Processer from core.database import config # from core.basic_utils import dotkeys import logging import requests from PIL import Image import imagehash import os import sys IS_PYTHON3 = sys.version_info[0] == 3 and sys.version_info[1] >= 2 logger = logging.getLogger(__name__) IMAGEPATH = config.get('imagestore', 'imagepath') def hash2filepath(myhash): ''' Returns a tuple consisting of the directory in which the image is to be stored and the filename itself. The filename is identical to the hash. ''' hashstr = str(myhash) path = os.path.join(IMAGEPATH, hashstr[:4], hashstr[4:8], hashstr[8:12], hashstr[12:]) filename = hashstr + '.jpg' return path, filename class download_images(Processer):
class Inca():
    """INCA main class for easy access to functionality methods

    ----
    Scrapers
        Retrieval methods for RSS websites. Most scrapers can run
        out-of-the-box without specifying any parameters. If no database is
        present, scrapers will return the data as a list.

        usage:
            docs = inca.scrapers.<scraper>()
    Rssscrapers
        Same as Scrapers, but based on the websites' RSS feeds.
    Clients
        API-clients to get data from various endpoints. You can start using
        client functionality by:
        1) Adding an application, using the `<service>_create_app` method
        2) Add credentials to that application, using
           `<service>_create_credentials`
        3) Then run a collection method, such as `twitter_timeline`!

        usage:
            inca.clients.<service>_create_app(name='default')
            inca.clients.<service>_create_credentials(app='default')
            docs = inca.clients.<service>_<functionname>(app='default',
                                                         *args, **kwargs)
    Processing
        These methods change documents by adding fields. Such manipulations
        can be things such as POS-tags, Sentiment or something else.

        usage:
            modified_docs = inca.processing.<processor>(
                docs=<original_docs or query>,
                field=<field to manipulate>, *args, **kwargs)
    """

    # Class-level celery app; backend/broker selected per configured
    # dependency set.
    _taskmaster = Celery(
        backend=config.get('celery',
                           '%s.backend' % config.get('inca', 'dependencies')),
        broker=config.get('celery',
                          '%s.broker' % config.get('inca', 'dependencies')),
    )

    # Expose the search utilities module as the `database` attribute.
    database = core.search_utils

    _prompt = "Placeholder"

    def __init__(self,
                 prompt="TLI",
                 distributed=False,
                 verbose=True,
                 debug=False):
        # NOTE(review): _LOCAL_ONLY is assigned `distributed` directly; the
        # name suggests the negation may be intended — confirm semantics.
        self._LOCAL_ONLY = distributed
        self._prompt = getattr(make_interface, prompt).prompt
        # Populate each functionality namespace from the celery task registry.
        self._construct_tasks('scrapers')
        self._construct_tasks('processing')
        #self._analysis_task_constructor()
        self._construct_tasks('analysis')
        self._construct_tasks('clients')
        self._construct_tasks('importers_exporters')
        self._construct_tasks('rssscrapers')
        if verbose:
            logger.setLevel('INFO')
            logger.info("Providing verbose output")
        if debug:
            logger.setLevel('DEBUG')
            logger.debug("Activating debugmode")

    # NOTE(review): `analysis` is defined twice in this class body; this
    # first definition is silently replaced by the later one.
    class analysis():
        '''Data analysis tools'''
        pass

    class scrapers():
        '''Scrapers for various (news) outlets'''
        pass

    class rssscrapers():
        '''RSS-based scrapers for various (news) outlets'''
        pass

    class processing():
        '''Processing options to operate on documents'''
        pass

    class analysis():
        '''Perform and summarize analysis done on documents'''
        pass

    def _analysis_task_constructor(self):
        """Construct endpoints specifically for analysis tasks

        This function is used when analysis tasks are encountered. The
        Analysis sub-classes include functionality for fitting, predicting,
        plotting, updateing and explaining results.
        """
        target_functions = [
            'fit', 'predict', 'plot', 'interpretation', 'quality'
        ]
        for k, v in self._taskmaster.tasks.items():
            # task names look like "<functiontype>....<taskname>"
            functiontype = k.split('.', 1)[0]
            taskname = k.rsplit('.', 1)[1]
            if functiontype == "analysis":
                analysis_class = self._taskmaster.tasks[k]

                # Wrap a method so generator-functions stay generators.
                def makefunc(method):
                    if inspect.isgeneratorfunction(method):

                        def endpoint(*args, **kwargs):
                            for i in method(*args, **kwargs):
                                yield i
                    else:

                        def endpoint(*args, **kwargs):
                            return method(*args, **kwargs)

                    return endpoint

                # Placeholder class carrying the analysis task's docstring
                # plus the selected target methods.
                class analysis_placeholder:
                    pass

                analysis_placeholder.__doc__ = analysis_class.__doc__
                for method in target_functions:
                    endpoint = getattr(analysis_class, method)
                    setattr(analysis_placeholder, method, endpoint)
                setattr(
                    getattr(self, "analysis"), taskname, analysis_placeholder)

    class clients():
        '''Clients to access (social media) APIs'''
        pass

    class importers_exporters():
        '''Importing functions to ingest data '''
        pass

    def _construct_tasks(self, function):
        """Construct the appropriate endoints from Celery tasks

        This function serves to create the appropriate functions in the
        Inca object by intro-specting available functions from the celery
        taskmaster. Subclasses of Task should then be added automatically.

        Parameters
        ----
        function : string
            The type of function to add, such as 'scrapers' or 'processors'

        Returns
            None
        """
        for k, v in self._taskmaster.tasks.items():
            # task names look like "<functiontype>....<taskname>"
            functiontype = k.split('.', 1)[0]
            taskname = k.rsplit('.', 1)[1]
            if functiontype == function:
                target_task = self._taskmaster.tasks[k]
                target_task.prompt = self._prompt
                # A client "main class" shares its name with its service and
                # exposes app/credential management instead of a run method.
                is_client_main_class = hasattr(
                    target_task, "service_name"
                ) and target_task.__name__ == target_task.service_name
                if is_client_main_class:
                    setattr(
                        getattr(self, function),
                        "{service_name}_create_app".format(
                            service_name=target_task.service_name),
                        target_task.add_application)
                    setattr(
                        getattr(self, function),
                        "{service_name}_remove_app".format(
                            service_name=target_task.service_name),
                        target_task.remove_application)
                    setattr(
                        getattr(self, function),
                        "{service_name}_create_credentials".format(
                            service_name=target_task.service_name),
                        target_task.add_credentials)
                else:
                    setattr(getattr(self, function), taskname,
                            target_task.runwrap)
                    function_class = getattr(self, function)
                    leaf_class = self._taskmaster.tasks[k]
                    method = leaf_class.runwrap

                    # Wrap runwrap so generator tasks stay generators; the
                    # wrapper below replaces the plain runwrap set above.
                    def makefunc(method):
                        if inspect.isgeneratorfunction(method):

                            def endpoint(*args, **kwargs):
                                for i in method(*args, **kwargs):
                                    yield i
                        else:

                            def endpoint(*args, **kwargs):
                                return method(*args, **kwargs)

                        return endpoint

                    endpoint = makefunc(method)
                    # Pick the most informative docstring per task family.
                    if function == 'scrapers' or function == 'rssscrapers':
                        docstring = self._taskmaster.tasks[k].get.__doc__
                    elif function == "processing":
                        docstring = self._taskmaster.tasks[k].process.__doc__
                    elif function == "importers_exporters":
                        t = self._taskmaster.tasks[k]
                        if hasattr(t, 'load'):
                            docstring = t.load.__doc__
                        else:
                            docstring = t.save.__doc__
                    else:
                        docstring = self._taskmaster.tasks[k].__doc__
                    endpoint.__doc__ = docstring
                    endpoint.__name__ = leaf_class.__name__
                    setattr(function_class, taskname, endpoint)

    def _summary(self):
        # Render the 10 most common doctypes in the database.
        # NOTE(review): list_doctypes() is queried twice (once for
        # `contents`, once inside sorted()) — could be hoisted.
        summary = ''
        summary += '\nTop 10 document types currently in database:\n'
        contents = self.database.list_doctypes().items()
        for k, v in sorted(
                self.database.list_doctypes().items(),
                key=lambda x: x[1],
                reverse=True)[:10]:
            summary += "{k:30} : {v:10}\n".format(**locals())
        if len(contents) > 10:
            summary += "...\n"
        return summary