def update_cache_profiles(self, unified_name_to_profiles: {}):
    """
    Given a unified_profile_name to profiles map, merges the profiles and creates the FK references
    :param unified_name_to_profiles:
    :return:
    """
    sql = self._update_cache_profiles[0]

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        log.debug("Updating cache profiles")
        for _, profile_list in unified_name_to_profiles.items():
            # Flatten the profile list down to one reference profile
            # (the one with the longest display name)
            reference_profile = None
            """:type : Profile"""
            for profile in profile_list:
                if reference_profile is None or len(profile.display_name) > len(reference_profile.display_name):
                    reference_profile = profile

            # If we found at least one reference_profile (which we should),
            # insert the merged cache profile
            if reference_profile is not None:
                u, r = unify_profile_name(reference_profile.first_name, reference_profile.last_name)
                b64u = generate_id(u)
                log.info("Inserting %s, %s" % (b64u, sanitize_text(r)))
                conn.execute(sql, (b64u, sanitize_text(r)))

    log.info("Cache profiles have been updated")

def get_profiles_by_profile_ids_or_field_ids(self, profile_ids: [int], field_ids: [int]):
    """
    Given a list of profile ids and field ids, queries all profiles that belong to the research field
    OR are associated with the profile_ids.
    :param profile_ids:
    :param field_ids:
    :return:
    """
    profile_ids_string = ""
    field_ids_string = ""

    if len(profile_ids) > 0:
        profile_ids_string = "(%s)" % (",".join(map(lambda x: "'%s'" % x, profile_ids)))
    else:
        profile_ids_string = "(NULL)"

    if len(field_ids) > 0:
        field_ids_string = "(%s)" % (",".join(map(lambda x: "'%s'" % x, field_ids)))
    else:
        field_ids_string = "(NULL)"

    query = self._query_profiles_by_profile_ids_or_field_ids[0]
    query = re.sub(':profile_ids', profile_ids_string, query)
    query = re.sub(':field_ids', field_ids_string, query)

    log.info("Querying profiles by profile_ids and field_ids\n"
             "\t| profile_ids: {profile_ids}\n"
             "\t| field_ids: {field_ids}\n".format(
                 profile_ids=profile_ids_string,
                 field_ids=field_ids_string))

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        return conn.execute(query).fetchall()

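# Illustration only (not part of the original module): how the IN-clause fragments
# above are built. With profile_ids=[1, 2] the substitution yields "('1','2')";
# with an empty list it yields "(NULL)" so the generated SQL stays valid.
_example_ids = [1, 2]
_example_fragment = "(%s)" % (",".join(map(lambda x: "'%s'" % x, _example_ids)))  # -> "('1','2')"
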
def crawl_profiles(self):
    """
    Given a populated members array, this function crawls the profiles linked to the ids
    as well as the publications
    :return:
    """
    log.debug("Adding members to worker queues")
    for member in self._members:
        self._profile_queue.put(member.profile_id)
        self._profile_documents_queue.put(member.profile_id)

    # Create profile crawlers
    log.debug("Spawning profile workers")
    for i in range(number_profile_workers):
        t = Thread(target=self.profile_worker)
        t.daemon = False
        t.start()

    # Create document crawlers
    log.debug("Spawning document crawlers")
    for i in range(number_document_workers):
        t = Thread(target=self.document_worker)
        t.daemon = False
        t.start()

    # Wait for both queues to complete
    self._profile_queue.join()
    self._profile_documents_queue.join()
    log.info("Profiles and associated documents have been fetched")

def execute(self,
            profiles,
            documents,
            unified_name_to_profiles,
            unified_document_title_to_documents,
            unified_field_title_to_field,
            unified_field_title_to_documents,
            unified_name_to_authored_documents,
            unified_name_to_participated_documents):
    """
    Given the required crawl data, updates the whole cache
    :return:
    """
    log.info("Crawl data update has been started")
    self.update_cache_profiles(unified_name_to_profiles)
    self.update_cache_documents(unified_document_title_to_documents)
    self.update_profiles(profiles)
    self.update_documents(documents)
    self.update_cache_fields(unified_field_title_to_field)
    self.link_profiles_to_documents(
        unified_name_to_profiles,
        unified_name_to_authored_documents,
        unified_name_to_participated_documents)
    self.link_fields_to_documents(unified_field_title_to_documents)
    self.post_update()
    log.info("Crawl data has been updated")

def update_cache_documents(self, unified_document_title_to_documents: {}):
    """
    Given a unified_document_title to documents map, merges the documents and creates the FK references
    :param unified_document_title_to_documents:
    :return:
    """
    sql = self._update_cache_documents[0]

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        log.debug("Updating cache documents")
        for _, doc_list in unified_document_title_to_documents.items():
            # Flatten the document list down to one reference document
            # (the most recently modified one)
            reference_doc = None
            """:type : Document"""
            for doc in doc_list:
                if reference_doc is None or doc.core_last_modified > reference_doc.core_last_modified:
                    reference_doc = doc

            # If we found at least one reference_doc (which we should),
            # insert the merged cache document
            if reference_doc is not None:
                u, r = unify_document_title(reference_doc.core_title)
                b64u = generate_id(u)
                conn.execute(sql, (b64u, sanitize_text(r)))

    log.info("Cache documents have been updated")

def get_profiles(self):
    log.info('The route GET /profiles/ has been triggered')

    # Default parameters
    profile_ids = ''
    field_ids = ''
    slim = False

    # Set passed query parameters if existing
    if 'profile-ids' in request.args:
        profile_ids = request.args['profile-ids'].split(',')
        log.debug('Query parameter "profile-ids" = %s' % profile_ids)
    if 'field-ids' in request.args:
        field_ids = request.args['field-ids'].split(',')
        log.debug('Query parameter "field-ids" = %s' % field_ids)
    if 'slim' in request.args:
        slim = bool(request.args['slim'])
        log.debug('Query parameter "slim" = %s' % slim)

    # Trigger the respective methods
    profiles = []
    if slim:
        profiles = self._data_controller.api_data.get_profiles_slim()
    else:
        profiles = self._data_controller.api_data.get_profiles_by_profile_ids_or_field_ids(
            profile_ids=profile_ids, field_ids=field_ids)

    # Pattern for cms pages
    page_pattern = self._cache_config.profile_page_pattern

    # Serialize profiles
    response = []
    for profile in profiles:
        profile_dict = dict(profile)

        # Get names
        first_name = None
        last_name = None
        if 'first_name' in profile_dict and 'last_name' in profile_dict:
            first_name = profile_dict['first_name']
            last_name = profile_dict['last_name']
        elif 'name' in profile_dict:
            name_parts = [s.lower() for s in profile_dict['name'].split()]
            first_name = name_parts[0]
            last_name = name_parts[1]

        # If the names are available, create the page link
        if first_name is not None and last_name is not None:
            page = page_pattern
            page = re.sub(':firstname', first_name, page)
            page = re.sub(':lastname', last_name, page)
            profile_dict["page"] = page

        response.append(profile_dict)

    return json.dumps(response, cls=DefaultEncoder)

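# Illustration only (not part of the original module): the page-link substitution
# above, using a hypothetical profile_page_pattern value.
_example_pattern = "https://www.example.org/people/:firstname-:lastname"
_example_page = re.sub(':firstname', 'jane', _example_pattern)
_example_page = re.sub(':lastname', 'doe', _example_page)  # -> ".../people/jane-doe"
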
def assert_schema(self):
    if self.is_initialized():
        log.info("Schema is already initialized")
    else:
        log.warning("The current schema is incomplete. Starting migration.")
        # TODO: Backup && Restore as soon as the database has state
        self.drop_all()
        self.run_schema()

def post_update(self):
    """
    Executes all linking steps that are required for the queries
    :return:
    """
    with self._engine.begin() as conn:
        for stmt in self._post_update:
            conn.execute(stmt)
    log.info("Cleanup statements have been executed")

def process_profile_documents(self):
    """
    Iterates over the profile documents, finds research fields, finds duplicates, finds author profiles
    :return:
    """
    for profile_unified in self._unified_name_to_profiles:
        found_docs = []
        profiles = self._unified_name_to_profiles[profile_unified]
        if len(profiles) == 0:
            log.warning("There were no profiles for the unified name %s" % profile_unified)
            continue

        # For each profile linked to that unified name, add the found documents to the list
        for profile in profiles:
            x = self._profile_docs[profile.identifier]
            log.debug("Used {len_x} documents from id {mendeley_id} for unified name {name}".format(
                len_x=len(x),
                mendeley_id=profile.identifier,
                name=unify_profile_name(profile.first_name, profile.last_name)))
            found_docs += x

        # Process these documents
        for doc in found_docs:
            # Add doc to all docs
            self._documents.append(doc)

            # Create unified document title
            doc_unified, doc_real = unify_document_title(doc.core_title)

            # Add document to docs
            if doc_unified in self._unified_document_title_to_documents:
                existing_docs = self._unified_document_title_to_documents[doc_unified]
                existing_docs.append(doc)
            else:
                self._unified_document_title_to_documents[doc_unified] = [doc]

            # Append the doc title to the authored_docs of that unified profile name
            authored_docs = self._unified_name_to_authored_documents[profile_unified]
            authored_docs.add(doc_unified)

            # Process core_authors field of the doc to find participants
            for author in doc.core_authors:
                self.analyze_author(doc_unified, author)

            # Analyze the tags field of the doc to find research fields
            for tag in doc.tags:
                self.analyze_field_tag(doc_unified, tag)

    log.info("Profile documents have been analyzed")

def crawl_group_members(self):
    """
    Fetches members of the pre-configured research group
    :return:
    """
    self._members = self._crawler.get_group_members(self._research_group)
    log.debug("{num} group members have been fetched for group_id {group_id}".format(
        num=len(self._members), group_id=self._research_group))
    log.info("Group members have been fetched")

def execute(self):
    """
    Process all input
    :return:
    """
    self.reset()
    self.process_profiles()
    self.process_profile_documents()
    self.process_group_documents()
    log.info("Analysis has been executed")

def get_last_update(self):
    """
    Returns the last entry of the update_log
    :return:
    """
    query = self._query_last_update[0]
    log.info("Querying last update")
    with self._engine.begin() as conn:
        return conn.execute(query).fetchall()

def update_profiles(self, profiles: [Profile]):
    """
    Given a profile list, this method replaces the profiles in the database with new ones
    :param profiles:
    :return:
    """
    def insert_profile(conn: Connection, insert: str, p: Profile):
        u, _ = unify_profile_name(p.first_name, p.last_name)
        b64u = generate_id(u)
        conn.execute(insert, (
            sanitize_text(p.identifier),
            b64u,
            sanitize_text(p.first_name),
            sanitize_text(p.last_name),
            sanitize_text(p.display_name),
            sanitize_text(p.link)))

    # If there's nothing to insert, abort
    if len(profiles) == 0:
        return None

    delete = self._replace_profiles[0]
    insert = self._replace_profiles[1]
    temp = self._replace_profiles[2]
    temp_insert = self._replace_profiles[3]
    update = self._replace_profiles[4]
    temp_drop = self._replace_profiles[5]

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        log.debug("Deleting existing profiles")
        conn.execute(delete)
        log.debug("Inserting new profiles")
        for profile in profiles:
            insert_profile(conn, insert, profile)
        log.debug("Creating temporary table")
        conn.execute(temp)
        log.debug("Spooling data into temporary table")
        conn.execute(temp_insert)
        log.debug("Creating profile links")
        conn.execute(update)
        log.debug("Dropping temporary table")
        conn.execute(temp_drop)

    log.info("Profiles have been updated")

def get_profiles_slim(self):
    """
    Queries slim profiles for fast UI auto completion
    :return:
    """
    query = self._query_profiles_slim[0]
    log.info("Querying slim profiles")
    return self._engine.execute(query).fetchall()

def get_fields(self):
    log.info('The route GET /fields/ has been triggered')
    fields = self._data_controller.api_data.get_fields()

    # Serialize fields
    response = []
    for field in fields:
        field_dict = dict(field.items())
        response.append(field_dict)

    return json.dumps(response, cls=DefaultEncoder)

def __init__(self, app_id: str, app_secret: str):
    self._app_id = app_id
    self._app_secret = app_secret
    self._initialized = False
    self._mendeley = Mendeley(app_id, app_secret)
    self._session = None
    """:type : MendeleySession"""
    log.info("Initialized SDKCrawler with app_id: {app_id} and app_secret: {app_secret}".format(
        app_id=app_id, app_secret=app_secret))

def get_entities(self):
    """
    Returns the number of elements in each table
    :return:
    """
    query = self._query_entities[0]
    log.info("Querying entity numbers")

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        return conn.execute(query).fetchall()

def crawl_group_documents(self):
    """
    Fetches the publications that are associated with the pre-configured group
    :return:
    """
    self._group_documents = self._crawler.get_documents_by_group_id(self._research_group)
    log.debug("{num} documents have been fetched for group_id {group_id}".format(
        num=len(self._group_documents), group_id=self._research_group))
    log.info("Group documents have been fetched")

def get_system_entities(self):
    log.info('The route GET /cache/entities has been triggered')
    entities = self._data_controller.api_data.get_entities()

    # Serialize entities
    response = []
    for entity in entities:
        columns = dict(entity.items())
        response.append(columns)

    return json.dumps(response, cls=DefaultEncoder)

def prepare(self, profiles, profile_docs, group_docs):
    """
    Prepare the AnalysisController with data
    :param profiles:
    :param profile_docs:
    :param group_docs:
    :return:
    """
    self._profiles = profiles
    self._profile_docs = profile_docs
    self._group_docs = group_docs
    log.info("Analysis has been prepared")

def get_fields(self):
    """
    Queries all research fields
    :return:
    """
    query = self._query_fields[0]
    log.info("Querying fields")

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        return conn.execute(query).fetchall()

def post_update(self):
    log.info('The route POST /cache/update has been triggered')

    # Get remote IP
    remote = get_remote_ip()

    # Trigger the pipeline
    report = self._pipeline_controller.execute(remote)

    # Dump report
    report_dict = dict(report.__dict__)
    json_report = json.dumps(report_dict, cls=DefaultEncoder)
    return json_report

def execute(self):
    """
    Triggers the crawlers for members, group publications and profiles in sequence
    :return:
    """
    log.info("Crawler has been started")
    self.reset()
    self._crawler.prepare()
    self.crawl_group_members()
    self.crawl_group_documents()
    self.crawl_profiles()
    self._crawler.destroy()
    self._succeeded = True
    log.info("Crawler has been executed")

def run_schema(self):
    """
    Runs the schema initialization for the configured engine
    """
    schema = []
    if self._config.engine == "sqlite":
        schema = read_sql_statements('sql', 'schema', 'sqlite.sql')
    elif self._config.engine == "mysql":
        schema = read_sql_statements('sql', 'schema', 'mysql.sql')

    with self._engine.begin() as conn:
        for cmd in schema:
            conn.execute(cmd)

    log.info("Schema has been initialized")

def get_system_status(self):
    log.info('The route GET /cache/status has been triggered')
    api_online = remote_is_online("api.mendeley.com", 443)

    json_result = dict()
    json_result["serverVersion"] = self._config.version
    json_result["mendeleyStatus"] = "Online" if api_online else "Offline"
    json_result["lastUpdate"] = "never"

    # Fetch last entry in update_log
    last_update_log = self._data_controller.api_data.get_last_update()
    if len(last_update_log) > 0:
        json_result["lastUpdate"] = last_update_log[0]["dt"]

    return json.dumps(json_result, cls=DefaultEncoder)

def update_cache_fields(self, unified_field_title_to_field: {}):
    """
    Given a unified_field_title to field map, updates the fields
    :param unified_field_title_to_field:
    :return:
    """
    sql = self._update_cache_fields[0]

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        log.debug("Updating cache fields")
        for _, field in unified_field_title_to_field.items():
            b64u = generate_id(field.unified_title)
            conn.execute(sql, (b64u, sanitize_text(field.title)))

    log.info("Cache fields have been updated")

def log_update(self, report, remote_addr: str):
    if remote_addr is None:
        remote_addr = "localhost"

    insert = self._log_update[0]
    with self._engine.begin() as conn:
        conn.execute(insert, (
            remote_addr,
            report.profiles,
            report.documents,
            report.unified_profiles,
            report.unified_documents,
            report.fields,
            report.field_links))

    log.info("Update has been logged for address '%s'" % remote_addr)

def process_group_documents(self):
    """
    Iterates over the group documents, finds research fields, finds duplicates, finds author profiles
    :return:
    """
    for doc in self._group_docs:
        # Add doc to all docs
        self._documents.append(doc)

        # Create unified document title
        doc_unified, doc_real = unify_document_title(doc.core_title)

        # Add document to docs
        if doc_unified in self._unified_document_title_to_documents:
            existing_docs = self._unified_document_title_to_documents[doc_unified]
            existing_docs.append(doc)
        else:
            self._unified_document_title_to_documents[doc_unified] = [doc]

        # Try to find the main owner of the document through the document profile_id.
        # If it doesn't exist, do nothing
        # (we can't do much with only the profile_id;
        # we could post-fetch the unknown profiles, but that is more involved)
        profile_id = doc.core_profile_id
        if profile_id in self._profiles:
            profile = self._profiles[profile_id]
            unified_name, real_name = unify_profile_name(profile.first_name, profile.last_name)
            if unified_name in self._unified_name_to_authored_documents:
                authored_documents = self._unified_name_to_authored_documents[unified_name]
                authored_documents.add(doc_unified)

        # Process core_authors field of the doc to find participants
        for author in doc.core_authors:
            self.analyze_author(doc_unified, author)

        # Analyze the tags field of the doc to find research fields
        for tag in doc.tags:
            self.analyze_field_tag(doc_unified, tag)

    log.info("Group documents have been analyzed")

def link_fields_to_documents(self, unified_field_title_to_documents: {}):
    """
    Given a unified_field_title to documents map, creates the N:M relations in the database
    :param unified_field_title_to_documents:
    :return:
    """
    # Get the different statements in the sql file
    delete = self._link_fields_to_documents[0]
    insert = self._link_fields_to_documents[1]

    # Fire the sql scripts in a transaction
    with self._engine.begin() as conn:
        log.debug("Deleting previous field -> document links")
        conn.execute(delete)

        log.debug("Inserting new field -> document links")
        for unified_field_title, doc_list in unified_field_title_to_documents.items():
            for doc_unified in doc_list:
                conn.execute(insert, (generate_id(doc_unified), generate_id(unified_field_title)))

    log.info("Field -> document links have been updated")

def process_profiles(self):
    """
    Iterates over the profiles and finds duplicates
    :return:
    """
    for profile in self._profiles:
        unified, real = unify_profile_name(profile.first_name, profile.last_name)

        # Check if the name is already stored in the profiles,
        # then store the additional profile
        existing_profiles = []
        if unified in self._unified_name_to_profiles:
            existing_profiles = self._unified_name_to_profiles[unified]
        existing_profiles.append(profile)
        self._unified_name_to_profiles[unified] = existing_profiles

        # Store empty entries in the documents maps for that profile
        # (then we don't need to check the key every time)
        self._unified_name_to_authored_documents[unified] = set()
        self._unified_name_to_participated_documents[unified] = set()

    log.info("Profiles have been analyzed")

def link_profiles_to_documents(self,
                               unified_name_to_profiles: {},
                               unified_name_to_authored_documents: {},
                               unified_name_to_participated_documents: {}):
    """
    Given a unified_profile_name to authored_documents and participated_documents map(s),
    creates the N:M relations in the database
    :param unified_name_to_profiles:
    :param unified_name_to_authored_documents:
    :param unified_name_to_participated_documents:
    :return:
    """
    # Get the different statements in the sql file
    delete = self._link_profiles_to_documents[0]
    insert = self._link_profiles_to_documents[1]

    # Fire the sql scripts in a transaction
    with self._engine.begin() as conn:
        log.debug("Deleting previous profile -> document links")
        conn.execute(delete)

        log.debug("Inserting new profile -> document links")
        for unified_name, doc_list in unified_name_to_authored_documents.items():
            # TODO: if the author is unknown, skip the link for now
            # (foreign key constraints would be broken otherwise)
            if unified_name not in unified_name_to_profiles:
                continue
            for doc_unified in doc_list:
                conn.execute(insert, (generate_id(unified_name), generate_id(doc_unified)))

        for unified_name, doc_list in unified_name_to_participated_documents.items():
            # TODO: if the author is unknown, skip the link for now
            # (foreign key constraints would be broken otherwise)
            if unified_name not in unified_name_to_profiles:
                continue
            for doc_unified in doc_list:
                conn.execute(insert, (generate_id(unified_name), generate_id(doc_unified)))

    log.info("Profile -> document links have been updated")

def create_engine(config: DatabaseConfiguration) -> Engine:
    # An empty path means an in-memory sqlite database
    if not config.path:
        path = "sqlite://"
    else:
        path = "sqlite:///{path}".format(path=config.path)

    log.info("Creating engine '{engine}' with path {path}".format(engine="sqlite", path=path))

    # Create engine
    # Pool recycle:
    # http://stackoverflow.com/questions/26891971/mysql-connection-not-available-when-use-sqlalchemymysql-and-flask
    return sqlalchemy.create_engine(path, encoding="utf-8", pool_recycle=3600)

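# Illustration only (not part of the original module): a minimal usage sketch.
# SimpleNamespace stands in for the real DatabaseConfiguration, of which
# create_engine only reads the `path` attribute here.
from types import SimpleNamespace

_example_config = SimpleNamespace(path="")        # empty path -> in-memory sqlite ("sqlite://")
_example_engine = create_engine(_example_config)  # returns a SQLAlchemy Engine
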
def drop_all(self):
    drops = read_sql_statements('sql', 'schema', 'drop_all.sql')

    foreign_key_off = ""
    foreign_key_on = ""
    if self._config.engine == "mysql":
        foreign_key_off = "SET FOREIGN_KEY_CHECKS = 0"
        foreign_key_on = "SET FOREIGN_KEY_CHECKS = 1"
    elif self._config.engine == "sqlite":
        foreign_key_off = "PRAGMA foreign_keys = OFF"
        foreign_key_on = "PRAGMA foreign_keys = ON"

    with self._engine.begin() as conn:
        log.info(foreign_key_off)
        conn.execute(foreign_key_off)
        for drop in drops:
            log.info(drop)
            conn.execute(drop)
        log.info(foreign_key_on)
        conn.execute(foreign_key_on)

    log.info("Database has been dropped")

def get_documents_by_profile_ids_and_field_ids(self,
                                               profile_ids: [int],
                                               field_ids: [int],
                                               order_attr: str = "year",
                                               order_dir: str = "desc",
                                               limit: int = 0,
                                               offset: int = 0,
                                               only_count: bool = False):
    """
    Given profile ids and field ids, queries all documents that belong to the research field
    AND are associated with these profiles
    :param profile_ids:
    :param field_ids:
    :param order_attr:
    :param order_dir:
    :param limit:
    :param offset:
    :param only_count:
    :return:
    """
    profile_ids_string = ""
    field_ids_string = ""
    query_limit = 20
    query_offset = 0
    query_order_attr = "pub_year"
    query_order_dir = "ASC"

    if len(profile_ids) > 0:
        profile_ids_string = "(%s)" % (",".join(map(lambda x: "'%s'" % x, profile_ids)))
    else:
        profile_ids_string = "(NULL)"

    if len(field_ids) > 0:
        field_ids_string = "(%s)" % (",".join(map(lambda x: "'%s'" % x, field_ids)))
    else:
        field_ids_string = "(NULL)"

    # Check order attribute parameter
    if order_attr == "year":
        query_order_attr = "d.pub_year"
    elif order_attr == "title":
        query_order_attr = "d.title"
    elif order_attr == "source":
        query_order_attr = "d.source"

    # Check order direction
    if order_dir == "desc":
        query_order_dir = "DESC"
    elif order_dir == "asc":
        query_order_dir = "ASC"

    # Check limit parameter
    if limit > 0:
        query_limit = limit

    # Check offset parameter
    if offset > 0:
        query_offset = offset

    # If no profile_ids and field_ids have been passed, everything needs to be returned,
    # so use the query variant without the "AND xx IN ()" clauses
    query = ""
    if len(profile_ids) > 0 and len(field_ids) > 0:
        query = self._query_documents_by_profile_ids_and_field_ids[0]
        query = re.sub(':profile_ids', profile_ids_string, query)
        query = re.sub(':field_ids', field_ids_string, query)
    elif len(profile_ids) > 0 and len(field_ids) == 0:
        query = self._query_documents_by_profile_ids[0]
        query = re.sub(':profile_ids', profile_ids_string, query)
    elif len(profile_ids) == 0 and len(field_ids) > 0:
        query = self._query_documents_by_field_ids[0]
        query = re.sub(':field_ids', field_ids_string, query)
    else:
        query = self._query_all_documents[0]

    if only_count:
        # query_head is expected to be defined elsewhere in this module
        # (the "SELECT ... FROM" prefix of the base queries)
        select = "SELECT COUNT(DISTINCT cd.id) AS cnt FROM"
        query = re.sub(query_head, select, query)
        query = re.sub('ORDER BY :order_by', '', query)
        query = re.sub('LIMIT :query_limit', '', query)
    else:
        select = str(
            "DISTINCT "
            "cd.id AS id,"
            "d.mendeley_id AS mendeley_id,"
            "d.title AS title,"
            "d.doc_type AS doc_type,"
            "d.last_modified AS last_modified,"
            "d.abstract AS abstract,"
            "d.source AS source,"
            "d.pub_year AS pub_year,"
            "d.authors AS authors,"
            "d.keywords AS keywords,"
            "d.tags AS tags,"
            "d.derived_bibtex AS derived_bibtex")
        query = re.sub(':select_attributes', select, query)

        # Substitute order_by and query_limit as well
        query = re.sub(':order_by', '{order_attr} {order_dir}'.format(
            order_attr=query_order_attr, order_dir=query_order_dir), query)
        query = re.sub(':query_limit', '{offset},{limit}'.format(
            offset=query_offset, limit=query_limit), query)

    log.info("Querying documents by profile_ids and field_ids\n"
             "\t| profile_ids: {profile_ids}\n"
             "\t| field_ids: {field_ids}\n"
             "\t| order_attr: {order_attr}\n"
             "\t| order_dir: {order_dir}\n"
             "\t| offset: {offset}\n"
             "\t| limit: {limit}\n"
             "\t| only_count: {only_count}".format(
                 profile_ids=profile_ids_string,
                 field_ids=field_ids_string,
                 order_attr=query_order_attr,
                 order_dir=query_order_dir,
                 offset=query_offset,
                 limit=query_limit,
                 only_count=only_count))
    log.debug("Query: {query}".format(query=query))

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        return conn.execute(query).fetchall()

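# Illustration only (not part of the original module): a hedged usage sketch based on
# the signature above; "api_data" stands for an instance of the owning data class.
# Empty id lists select the "all documents" query variant.
_example_rows = api_data.get_documents_by_profile_ids_and_field_ids(
    profile_ids=[], field_ids=[],
    order_attr="year", order_dir="desc",
    limit=10, offset=0,
    only_count=False)
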
def get_documents(self):
    log.info('The route GET /documents/ has been triggered')

    # Default parameters
    profile_ids = ''
    field_ids = ''
    limit = 0
    offset = 0
    order_dir = ""
    order_attr = ""
    only_count = False

    # Set passed query parameters if existing
    if 'profile-ids' in request.args:
        profile_ids = request.args['profile-ids'].split(',')
        log.debug('Query parameter "profile-ids" = %s' % profile_ids)
    if 'field-ids' in request.args:
        field_ids = request.args['field-ids'].split(',')
        log.debug('Query parameter "field-ids" = %s' % field_ids)
    if 'limit' in request.args:
        limit = int(request.args['limit'])
        log.debug('Query parameter "limit" = %s' % limit)
    if 'offset' in request.args:
        offset = int(request.args['offset'])
        log.debug('Query parameter "offset" = %s' % offset)
    if 'order-dir' in request.args:
        order_dir = request.args['order-dir']
        log.debug('Query parameter "order-dir" = %s' % order_dir)
    if 'order-attr' in request.args:
        order_attr = request.args['order-attr']
        log.debug('Query parameter "order-attr" = %s' % order_attr)
    if 'only-count' in request.args:
        only_count = bool(request.args['only-count'])
        log.debug('Query parameter "only-count" = %s' % only_count)

    # Trigger the respective methods
    data = self._data_controller.api_data.get_documents_by_profile_ids_and_field_ids(
        profile_ids=profile_ids,
        field_ids=field_ids,
        order_attr=order_attr,
        order_dir=order_dir,
        offset=offset,
        limit=limit,
        only_count=only_count)

    if only_count:
        # Extract count
        response = []
        for document in data:
            document_dict = dict(document.items())
            response.append(document_dict)
        if len(response) > 0:
            return json.dumps(response[0], cls=DefaultEncoder)
        else:
            return json.dumps({"cnt": 0}, cls=DefaultEncoder)
    else:
        # Serialize documents
        response = []
        for document in data:
            document_dict = dict(document.items())
            response.append(document_dict)
        return json.dumps(response, cls=DefaultEncoder)

def __init__(self, *args, **kwargs):
    super(MendeleyCache, self).__init__(*args, **kwargs)

    # Read configuration
    self.configuration = ServiceConfiguration()
    self.configuration.load()
    log.info("Configuration has been loaded")

    # Create service controllers
    self.data_controller = DataController(self.configuration.database)
    self.data_controller.assert_schema()
    log.info("Schema has been checked")

    # Create crawler based on configuration
    self.crawler = None
    """:type : AbstractCrawler"""
    if not self.configuration.uses_mendeley:
        log.info("Pipeline uses FileCrawler")
        self.crawler = FileCrawler()
    else:
        from mendeleycache.crawler.sdk_crawler import SDKCrawler
        log.info("Pipeline uses SDKCrawler")
        self.crawler = SDKCrawler(
            app_id=self.configuration.crawler.app_id,
            app_secret=self.configuration.crawler.app_secret)

    # Create the pipeline
    self.crawl_controller = CrawlController(self.crawler, self.configuration.crawler.research_group)
    self.analysis_controller = AnalysisController()
    self.pipeline_controller = PipelineController(
        data_controller=self.data_controller,
        crawl_controller=self.crawl_controller,
        analysis_controller=self.analysis_controller)
    log.info("Pipeline has been initialized")

    # Create the routing controllers
    self.fields_controller = FieldsController(self, self.data_controller)
    self.profiles_controller = ProfilesController(self, self.data_controller, self.configuration.cache)
    self.publications_controller = DocumentsController(self, self.data_controller)
    self.cache_controller = CacheController(
        self, self.data_controller, self.pipeline_controller, self.configuration)
    self.root_controller = RootController(self, self.data_controller, self.configuration)

    # Register the routes
    self.register_routes()
    log.info("Routes have been registered")
    log.info("MendeleyCache has been initialized")

def get_root(self):
    log.info('The route GET / has been triggered')
    return "Welcome to the Mendeley Cache"

from mendeleycache.logging import log
from mendeleycache.utils.files import get_relative_path
from mendeleycache.test.test_pipeline import sample_pipeline
from mendeleycache.test.routes.test_api import sample_api

import unittest
from unittest import TestLoader
import logging
import sys
import os
import json
from pprint import PrettyPrinter

if len(sys.argv) >= 2:
    log.info("Welcome to the MendeleyCache runner")
    command = sys.argv[1]

    # Test runner
    if command == "tests":
        log.info("Disabling non-critical logs for better unittest output")

        # Disable logging for tests
        logging.disable(logging.CRITICAL)

        # Get project root path
        project_root = get_relative_path(".")

        # Prepare
        loader = TestLoader()

def execute(self, addr: str = "localhost"): """ Execute a single run of the pipeline This is later scheduled like once per day :return: """ # Run the crawler self._crawl_controller.execute() # Crawl results profiles = self._crawl_controller.profiles profile_docs = self._crawl_controller.profile_documents group_docs = self._crawl_controller.group_documents # Then pipe the data to the analysis controller self._analysis_controller.prepare(profiles, profile_docs, group_docs) self._analysis_controller.execute() # Analysis results documents = self._analysis_controller.documents unified_name_to_profiles = self._analysis_controller.unified_name_to_profiles unified_document_title_to_documents = self._analysis_controller.unified_document_title_to_documents unified_field_title_to_field = self._analysis_controller.unified_field_title_to_field unified_field_title_to_documents = self._analysis_controller.unified_field_title_to_documents unified_name_to_authored_documents = self._analysis_controller.unified_name_to_authored_documents unified_name_to_participated_documents = self._analysis_controller.unified_name_to_participated_documents # Then store the all data with the data controller self._data_controller.crawl_data.execute( profiles=profiles, documents=documents, unified_name_to_profiles=unified_name_to_profiles, unified_document_title_to_documents= unified_document_title_to_documents, unified_field_title_to_field=unified_field_title_to_field, unified_field_title_to_documents=unified_field_title_to_documents, unified_name_to_authored_documents= unified_name_to_authored_documents, unified_name_to_participated_documents= unified_name_to_participated_documents) # Count field associations field_links = 0 for title, docs in unified_field_title_to_documents.items(): field_links += len(docs) # Generate report report = PipelineReport( profiles=len(profiles), documents=len(documents), unified_profiles=len(unified_name_to_profiles), unified_documents=len(unified_document_title_to_documents), fields=len(unified_field_title_to_field), field_links=field_links) # Log update in update_log self._data_controller.crawl_data.log_update(report=report, remote_addr=addr) # Log report log.info("Pipeline has been executed\n" "\t| found {profiles} profiles\n" "\t| found {documents} documents\n" "\t| found {unified_profiles} unified profile names\n" "\t| found {unified_documents} unified document titles\n" "\t| found {fields} research fields\n" "\t| found {field_links} field links\n".format( profiles=report.profiles, documents=report.documents, unified_profiles=report.unified_profiles, unified_documents=report.unified_documents, fields=report.fields, field_links=report.field_links)) # Return report return report