Example #1
    def update_cache_profiles(self, unified_name_to_profiles: dict):
        """
        Given a unified_profile_name to profiles map, merges the profiles and creates the FK references
        :param unified_name_to_profiles:
        :return:
        """

        sql = self._update_cache_profiles[0]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Updating cache profiles")
            for _, profile_list in unified_name_to_profiles.items():
                # flatten the profile list down to one profile
                reference_profile = None
                """:type: Profile"""

                for profile in profile_list:
                    if reference_profile is None or len(profile.display_name) > len(reference_profile.display_name):
                        reference_profile = profile

                # if we found at least one reference_profile (which we should)
                # add the corresponding sql insert string to the cache_profile_strings array
                if reference_profile is not None:
                    u, r = unify_profile_name(reference_profile.first_name, reference_profile.last_name)
                    b64u = generate_id(u)
                    log.info("inserting %s, %s" % (b64u, sanitize_text(r)))
                    conn.execute(sql, (b64u, sanitize_text(r)))

        log.info("Cache profiles have been updated")
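The statement loaded from the sql file is not shown here. A minimal sketch of a plausible shape for self._update_cache_profiles[0], assuming a two-column cache table with MySQL-style upsert semantics; the table name, columns, and paramstyle are assumptions:

# Hypothetical statement; binds match conn.execute(sql, (b64u, sanitize_text(r)))
sql = "REPLACE INTO cache_profile (id, name) VALUES (%s, %s)"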
Example #2
    def update_cache_documents(self, unified_document_title_to_documents: dict):
        """
        Given a unified_document_title to documents map, merges the documents and creates the FK references
        :param unified_document_title_to_documents:
        :return:
        """

        sql = self._update_cache_documents[0]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Updating cache documents")
            for _, doc_list in unified_document_title_to_documents.items():
                # flatten the document list down to one document
                reference_doc = None
                """:type: Document"""

                for doc in doc_list:
                    if reference_doc is None or doc.core_last_modified > reference_doc.core_last_modified:
                        reference_doc = doc

                # if we found at least one reference_doc (which we should),
                # add the corresponding sql insert string to the cache_document_strings array
                if reference_doc is not None:
                    u, r = unify_document_title(reference_doc.core_title)
                    b64u = generate_id(u)
                    conn.execute(sql, (b64u, sanitize_text(r)))

        log.info("Cache documents have been updated")
Example #3
def is_trusted_proxy(addr: str) -> bool:
    if addr is None:
        return False
    log.debug("Checking if address '%s' is a trusted proxy" % addr)
    for trusted_proxy in trusted_proxies:
        if trusted_proxy.match(addr):
            return True
    return False
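is_trusted_proxy calls .match() on each entry of trusted_proxies, so the list is presumably built from compiled regular expressions. A minimal sketch of how it could be populated; the patterns below are illustrative assumptions:

import re

# Hypothetical trusted proxy patterns; is_trusted_proxy() calls .match() on each
trusted_proxies = [
    re.compile(r"^127\.0\.0\.1$"),                    # local NGINX reverse proxy
    re.compile(r"^10\.\d{1,3}\.\d{1,3}\.\d{1,3}$"),   # internal network range
]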
Example #4
    def process_profile_documents(self):
        """
        Iterates over the profile documents, finds research fields, finds duplicates, finds author profiles
        :return:
        """
        for profile_unified in self._unified_name_to_profiles:
            found_docs = []

            profiles = self._unified_name_to_profiles[profile_unified]
            if len(profiles) == 0:
                log.warning("There were no profiles for the unified name %s" %
                            profile_unified)
                continue

            # For each profile linked to that unified name, add the found documents to the list
            for profile in profiles:
                profile_docs = self._profile_docs[profile.identifier]
                log.debug(
                    "Used {num} documents from id {mendeley_id} for unified name {name}"
                    .format(num=len(profile_docs),
                            mendeley_id=profile.identifier,
                            name=unify_profile_name(profile.first_name,
                                                    profile.last_name)))
                found_docs += profile_docs

            # Process these documents
            for doc in found_docs:
                # Add doc to all docs
                self._documents.append(doc)

                # Create unified document title
                doc_unified, doc_real = unify_document_title(doc.core_title)

                # Add document to docs
                if doc_unified in self._unified_document_title_to_documents:
                    existing_docs = self._unified_document_title_to_documents[
                        doc_unified]
                    existing_docs.append(doc)
                else:
                    self._unified_document_title_to_documents[doc_unified] = [
                        doc
                    ]

                # Append the doc title to the authored_docs of that unified profile name
                authored_docs = self._unified_name_to_authored_documents[
                    profile_unified]
                authored_docs.add(doc_unified)

                # Process core_authors field of the doc to find participants
                for author in doc.core_authors:
                    self.analyze_author(doc_unified, author)

                # Analyze the tags fields of the doc to find research fields
                for tag in doc.tags:
                    self.analyze_field_tag(doc_unified, tag)
        log.info("Profile documents have been analyzed")
Example #5
 def crawl_group_members(self):
     """
     Fetches members of the pre-configured research group
     :return:
     """
     self._members = self._crawler.get_group_members(self._research_group)
     log.debug(
         "{num} group members have been fetched for group_id {group_id}".
         format(num=len(self._members), group_id=self._research_group))
     log.info("Group members have been fetched")
Example #6
 def crawl_group_members(self):
     """
     Fetches members of the pre-configured research group
     :return:
     """
     self._members = self._crawler.get_group_members(self._research_group)
     log.debug("{num} group members have been fetched for group_id {group_id}".format(
         num=len(self._members),
         group_id=self._research_group
     ))
     log.info("Group members have been fetched")
Example #7
 def crawl_group_documents(self):
     """
     Fetches the publications that are associated with the pre-configured group
     :return:
     """
     self._group_documents = self._crawler.get_documents_by_group_id(self._research_group)
     log.debug("{num} documents have been fetched for group_id {group_id}".format(
         num=len(self._group_documents),
         group_id=self._research_group
     ))
     log.info("Group documents have been fetched")
Example #8
 def crawl_group_documents(self):
     """
     Fetches the publications that are associated with the pre-configured group
     :return:
     """
     self._group_documents = self._crawler.get_documents_by_group_id(
         self._research_group)
     log.debug(
         "{num} documents have been fetched for group_id {group_id}".format(
             num=len(self._group_documents), group_id=self._research_group))
     log.info("Group documents have been fetched")
Example #9
    def process_profile_documents(self):
        """
        Iterates over the profile documents, finds research fields, finds duplicates, finds author profiles
        :return:
        """
        for profile_unified in self._unified_name_to_profiles:
            found_docs = []

            profiles = self._unified_name_to_profiles[profile_unified]
            if len(profiles) == 0:
                log.warning("There were no profiles for the unified name %s" % profile_unified)
                continue

            # For each profile linked to that unified name, add the found documents to the list
            for profile in profiles:
                profile_docs = self._profile_docs[profile.identifier]
                log.debug("Used {num} documents from id {mendeley_id} for unified name {name}".format(
                    num=len(profile_docs),
                    mendeley_id=profile.identifier,
                    name=unify_profile_name(profile.first_name, profile.last_name)
                ))
                found_docs += profile_docs

            # Process these documents
            for doc in found_docs:
                # Add doc to all docs
                self._documents.append(doc)

                # Create unified document title
                doc_unified, doc_real = unify_document_title(doc.core_title)

                # Add document to docs
                if doc_unified in self._unified_document_title_to_documents:
                    existing_docs = self._unified_document_title_to_documents[doc_unified]
                    existing_docs.append(doc)
                else:
                    self._unified_document_title_to_documents[doc_unified] = [doc]

                # Append the doc title to the authored_docs of that unified profile name
                authored_docs = self._unified_name_to_authored_documents[profile_unified]
                authored_docs.add(doc_unified)

                # Process core_authors field of the doc to find participants
                for author in doc.core_authors:
                    self.analyze_author(doc_unified, author)

                # Analyze the tags fields of the doc to find research fields
                for tag in doc.tags:
                    self.analyze_field_tag(doc_unified, tag)
        log.info("Profile documents have been analyzed")
Example #10
    def update_cache_fields(self, unified_field_title_to_field: dict):
        """
        Given a unified_field_title to field map, updates the fields
        :param unified_field_title_to_field:
        :return:
        """

        sql = self._update_cache_fields[0]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Updating cache fields")
            for _, field in unified_field_title_to_field.items():
                b64u = generate_id(field.unified_title)
                conn.execute(sql, (b64u, sanitize_text(field.title)))

        log.info("Cache fields have been updated")
Example #11
def get_remote_ip():
    """
    Extracts the remote address from the global flask request
    :return:
    """
    # Enter your trusted proxy here.
    # With a local NGINX reverse proxy that's localhost
    # Be aware of this issue:
    # http://stackoverflow.com/questions/22868900/how-do-i-safely-get-the-users-real-ip-address-in-flask-using-mod-wsgi
    # Otherwise spoofing becomes dangerous

    route = request.access_route + [request.remote_addr]
    log.debug("Route: %s" % route)
    remote_addr = next(
        (addr for addr in reversed(route) if not is_trusted_proxy(addr)),
        request.remote_addr)
    log.debug("Choosing: '%s'" % remote_addr)
    return remote_addr
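A short walk-through with hypothetical values: access_route is flask's parsed X-Forwarded-For chain, so the generator scans it from the nearest hop backwards and picks the first address that is not a trusted proxy.

# Hypothetical route: the client address first, request.remote_addr appended last
route = ["203.0.113.7", "127.0.0.1"]
# reversed(route) visits "127.0.0.1" (trusted proxy, skipped),
# then "203.0.113.7" (untrusted) -> chosen as the remote address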
Example #12
    def link_fields_to_documents(self, unified_field_title_to_documents: dict):
        """
        Given a unified_field_title to documents map, creates the N:M relations in the database
        :param unified_field_title_to_documents:
        :return:
        """

        # Get the different statements in the sql file
        delete = self._link_fields_to_documents[0]
        insert = self._link_fields_to_documents[1]

        # Fire the sql scripts in a transaction
        with self._engine.begin() as conn:
            log.debug("Deleting previous field -> document links")
            conn.execute(delete)
            log.debug("Inserting new field -> document links")
            for unified_field_title, doc_list in unified_field_title_to_documents.items():
                for doc_unified in doc_list:
                    conn.execute(insert, (generate_id(doc_unified), generate_id(unified_field_title)))

        log.info("Field -> document links have been updated")
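Plausible shapes for the two statements read from the sql file, assuming a plain N:M link table keyed by the generated ids; every name below is an assumption:

# Hypothetical statements; the bind order matches the conn.execute() call above
delete = "DELETE FROM cache_fields_documents"
insert = "INSERT INTO cache_fields_documents (document_id, field_id) VALUES (%s, %s)"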
Example #13
    def crawl_profiles(self):
        """
        Given a populated members array, this function crawls the profiles linked to the ids as well as the publications
        :return:
        """
        log.debug("Adding members to worker queues")
        for member in self._members:
            self._profile_queue.put(member.profile_id)
            self._profile_documents_queue.put(member.profile_id)

        # Create profile crawlers
        log.debug("Spawning profile workers")
        for i in range(number_profile_workers):
            t = Thread(target=self.profile_worker)
            t.daemon = False
            t.start()

        # Create document crawlers
        log.debug("Spawning document crawlers")
        for i in range(number_document_workers):
            t = Thread(target=self.document_worker)
            t.daemon = False
            t.start()

        # Wait for both queues to complete
        self._profile_queue.join()
        self._profile_documents_queue.join()
        log.info("Profiles and associated documents have been fetched")
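The two join() calls only return once every queued id has been balanced by a task_done(). A minimal, self-contained sketch of that contract, using get_nowait() instead of the workers' empty()/get() pair (which can race when several workers drain the queue at once):

from queue import Empty, Queue
from threading import Thread

q = Queue()
for item in range(4):
    q.put(item)

def worker():
    while True:
        try:
            item = q.get_nowait()  # never blocks on an already-drained queue
        except Empty:
            return
        try:
            pass  # fetch the profile / documents for `item` here
        finally:
            q.task_done()  # balance every successful get(), even on failure

for _ in range(2):
    Thread(target=worker).start()

q.join()  # returns once task_done() has been called for every put()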
Example #15
    def link_profiles_to_documents(
        self,
        unified_name_to_profiles: dict,
        unified_name_to_authored_documents: dict,
        unified_name_to_participated_documents: dict,
    ):
        """
        Given a unified_profile_name to authored_documents and participated_documents map(s), creates the N:M relations
        in the database
        :param unified_name_to_profiles:
        :param unified_name_to_authored_documents:
        :param unified_name_to_participated_documents:
        :return:
        """

        # Get the different statements in the sql file
        delete = self._link_profiles_to_documents[0]
        insert = self._link_profiles_to_documents[1]

        # Fire the sql scripts in a transaction
        with self._engine.begin() as conn:
            log.debug("Deleting previous profile -> document links")
            conn.execute(delete)

            log.debug("Inserting new profile -> document links")

            for unified_name, doc_list in unified_name_to_authored_documents.items():
                # TODO: if author unknown, ignore for now (Foreign key constraints broken otherwise)
                if unified_name not in unified_name_to_profiles:
                    continue
                for doc_unified in doc_list:
                    conn.execute(insert, (generate_id(unified_name), generate_id(doc_unified)))

            for unified_name, doc_list in unified_name_to_participated_documents.items():
                # TODO: if author unknown, ignore for now (Foreign key constraints broken otherwise)
                if unified_name not in unified_name_to_profiles:
                    continue
                for doc_unified in doc_list:
                    conn.execute(insert, (generate_id(unified_name), generate_id(doc_unified)))

        log.info("Profile -> document links have been updated")
Example #16
 def profile_worker(self):
     """
     Given a prefilled profile queue, this worker will pop an id
     and fetch the associated profile
     :return:
     """
     while not self._profile_queue.empty():
         profile_id = self._profile_queue.get()
         try:
             # Fetch the profile
             profile = self._crawler.get_profile_by_id(profile_id)
             self._profiles.append(profile)
             log.debug(
                 "The profile for profile_id {profile_id} has been fetched".
                 format(profile_id=profile_id))
             # Mark task as done
             self._profile_queue.task_done()
         except Exception as e:
             log.warning(
                 "Failed to fetch the profile for profile_id {profile_id}: {error}".
                 format(profile_id=profile_id, error=e))
             self._profile_queue.task_done()
Example #17
 def profile_worker(self):
     """
     Given a prefilled profile queue, this worker will pop an id
     and fetch the associated profile
     :return:
     """
     while not self._profile_queue.empty():
         profile_id = self._profile_queue.get()
         try:
             # Fetch the profile
             profile = self._crawler.get_profile_by_id(profile_id)
             self._profiles.append(profile)
             log.debug("The profile for profile_id {profile_id} has been fetched".format(
                 profile_id=profile_id
             ))
             # Mark task as done
             self._profile_queue.task_done()
         except Exception as e:
             log.warning("Failed to fetch the profile for profile_id {profile_id}: {error}".format(
                 profile_id=profile_id, error=e
             ))
             self._profile_queue.task_done()
Example #18
    def get_profiles(self):
        log.info('The route GET /profiles/ has been triggered')

        # Default parameters
        profile_ids = ''
        field_ids = ''
        slim = False

        # Set passed query parameters if existing
        if 'profile-ids' in request.args:
            profile_ids = request.args['profile-ids'].split(',')
            log.debug('Query parameter "profile-ids" = %s' % profile_ids)
        if 'field-ids' in request.args:
            field_ids = request.args['field-ids'].split(',')
            log.debug('Query parameter "field-ids" = %s' % field_ids)
        if 'slim' in request.args:
            # bool('false') is True, so compare against the literal string instead
            slim = request.args['slim'].lower() == 'true'
            log.debug('Query parameter "slim" = %s' % slim)

        # Trigger the respective methods
        profiles = []
        if slim:
            profiles = self._data_controller.api_data.get_profiles_slim()
        else:
            profiles = self._data_controller.api_data.get_profiles_by_profile_ids_or_field_ids(
                profile_ids=profile_ids, field_ids=field_ids)

        # Pattern for cms pages
        page_pattern = self._cache_config.profile_page_pattern

        # Serialize documents
        response = []
        for profile in profiles:
            profile_dict = dict(profile)

            # names
            name = None
            first_name = None
            last_name = None

            # Get names
            if 'first_name' in profile_dict and 'last_name' in profile_dict:
                first_name = profile_dict['first_name']
                last_name = profile_dict['last_name']
            elif 'name' in profile_dict:
                name = profile_dict['name']
                name_parts = [s.lower() for s in name.split()]
                if len(name_parts) > 1:
                    first_name = name_parts[0]
                    last_name = name_parts[1]

            # If the names are available create the page link
            if first_name is not None and last_name is not None:
                page = page_pattern
                page = re.sub(':firstname', first_name, page)
                page = re.sub(':lastname', last_name, page)
                profile_dict["page"] = page

            response.append(profile_dict)
        return json.dumps(response, cls=DefaultEncoder)
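An illustrative run of the page pattern substitution, with a hypothetical profile_page_pattern value:

import re

page_pattern = "/staff/:firstname-:lastname"  # hypothetical cms pattern
page = re.sub(':firstname', 'ada', page_pattern)
page = re.sub(':lastname', 'lovelace', page)
# page == "/staff/ada-lovelace"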
Example #19
 def document_worker(self):
     """
     Given a prefilled profile_documents queue, this worker will pop an id
     and fetch the associated documents
     :return:
     """
     while not self._profile_documents_queue.empty():
         profile_id = self._profile_documents_queue.get()
         try:
             # Fetch the document
             documents = self._crawler.get_documents_by_profile_id(
                 profile_id)
             self._profile_documents[profile_id] = documents
             log.debug(
                 "{num} documents have been fetched for profile_id {profile_id}"
                 .format(num=len(documents), profile_id=profile_id))
             # Mark task as done
             self._profile_documents_queue.task_done()
         except Exception as e:
             log.warning(
                 "Failed to fetch documents for profile_id {profile_id}: {error}".
                 format(profile_id=profile_id, error=e))
             self._profile_documents_queue.task_done()
Example #20
 def document_worker(self):
     """
     Given a prefilled profile_documents queue, this worker will pop an id
     and fetch the associated documents
     :return:
     """
     while not self._profile_documents_queue.empty():
         profile_id = self._profile_documents_queue.get()
         try:
             # Fetch the document
             documents = self._crawler.get_documents_by_profile_id(profile_id)
             self._profile_documents[profile_id] = documents
             log.debug("{num} documents have been fetched for profile_id {profile_id}".format(
                 num=len(documents),
                 profile_id=profile_id
             ))
             # Mark task as done
             self._profile_documents_queue.task_done()
         except Exception as e:
             log.warning("Failed to fetch documents for profile_id {profile_id}: {error}".format(
                 profile_id=profile_id, error=e
             ))
             self._profile_documents_queue.task_done()
Example #21
    def get_profiles(self):
        log.info('The route GET /profiles/ has been triggered')

        # Default parameters
        profile_ids = ''
        field_ids = ''
        slim = False

        # Set passed query parameters if existing
        if 'profile-ids' in request.args:
            profile_ids = request.args['profile-ids'].split(',')
            log.debug('Query parameter "profile-ids" = %s' % profile_ids)
        if 'field-ids' in request.args:
            field_ids = request.args['field-ids'].split(',')
            log.debug('Query parameter "field-ids" = %s' % field_ids)
        if 'slim' in request.args:
            # bool('false') is True, so compare against the literal string instead
            slim = request.args['slim'].lower() == 'true'
            log.debug('Query parameter "slim" = %s' % slim)

        # Trigger the respective methods
        profiles = []
        if slim:
            profiles = self._data_controller.api_data.get_profiles_slim()
        else:
            profiles = self._data_controller.api_data.get_profiles_by_profile_ids_or_field_ids(
                profile_ids=profile_ids,
                field_ids=field_ids
            )

        # Pattern for cms pages
        page_pattern = self._cache_config.profile_page_pattern

        # Serialize documents
        response = []
        for profile in profiles:
            profile_dict = dict(profile)

            # names
            name = None
            first_name = None
            last_name = None

            # Get names
            if 'first_name' in profile_dict and 'last_name' in profile_dict:
                first_name = profile_dict['first_name']
                last_name = profile_dict['last_name']
            elif 'name' in profile_dict:
                name = profile_dict['name']
                name_parts = [s.lower() for s in name.split()]
                if len(name_parts) > 1:
                    first_name = name_parts[0]
                    last_name = name_parts[1]

            # If the names are available create the page link
            if first_name is not None and last_name is not None:
                page = page_pattern
                page = re.sub(':firstname', first_name, page)
                page = re.sub(':lastname', last_name, page)
                profile_dict["page"] = page

            response.append(profile_dict)
        return json.dumps(response, cls=DefaultEncoder)
Example #22
 def __init__(self, profile_page_pattern: str):
     self._profile_page_pattern = profile_page_pattern
     log.debug("Using profile_page_pattern: %s" % profile_page_pattern)
Example #23
    def get_documents_by_profile_ids_and_field_ids(self,
                                                   profile_ids: [int], field_ids: [int],
                                                   order_attr: str = "year", order_dir: str = "desc",
                                                   limit: int = 0, offset: int = 0, only_count: bool = False):
        """
        Given profile ids and field ids, queries all documents that belong to the research field
        AND are associated with these profiles
        :return:
        """

        profile_ids_string = ""
        field_ids_string = ""
        query_limit = 20
        query_offset = 0
        query_order_attr = "pub_year"
        query_order_dir = "ASC"
        if len(profile_ids) > 0:
            profile_ids_string = "(%s)" % (",".join(map(lambda x: "'%s'" % x, profile_ids)))
        else:
            profile_ids_string = "(NULL)"

        if len(field_ids) > 0:
            field_ids_string = "(%s)" % (",".join(map(lambda x: "'%s'" % x, field_ids)))
        else:
            field_ids_string = "(NULL)"

        # Check order attribute parameter
        if order_attr == "year":
            query_order_attr = "d.pub_year"
        elif order_attr == "title":
            query_order_attr = "d.title"
        elif order_attr == "source":
            query_order_attr = "d.source"

        # Check order direction
        if order_dir == "desc":
            query_order_dir = "DESC"
        elif order_dir == "asc":
            query_order_dir = "ASC"

        # Check limit parameter
        if limit > 0:
            query_limit = limit

        # Check offset parameter
        if offset > 0:
            query_offset = offset

        # If neither profile_ids nor field_ids have been passed, return everything
        # and use the query without the AND ... IN () clauses
        query = ""
        if len(profile_ids) > 0 and len(field_ids) > 0:
            query = self._query_documents_by_profile_ids_and_field_ids[0]
            query = re.sub(':profile_ids', profile_ids_string, query)
            query = re.sub(':field_ids', field_ids_string, query)
        elif len(profile_ids) > 0 and len(field_ids) == 0:
            query = self._query_documents_by_profile_ids[0]
            query = re.sub(':profile_ids', profile_ids_string, query)
        elif len(profile_ids) == 0 and len(field_ids) > 0:
            query = self._query_documents_by_field_ids[0]
            query = re.sub(':field_ids', field_ids_string, query)
        else:
            query = self._query_all_documents[0]

        if only_count:
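            # NOTE: query_head is assumed to be a module-level constant matching
            # the shared SELECT head of the query templates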
            select = "SELECT COUNT(DISTINCT cd.id) AS cnt FROM"
            query = re.sub(query_head, select, query)
            query = re.sub('ORDER BY :order_by', '', query)
            query = re.sub('LIMIT :query_limit', '', query)
        else:
            select = (
                "DISTINCT "
                "cd.id             AS id,"
                "d.mendeley_id     AS mendeley_id,"
                "d.title           AS title,"
                "d.doc_type        AS doc_type,"
                "d.last_modified   AS last_modified,"
                "d.abstract        AS abstract,"
                "d.source          AS source,"
                "d.pub_year        AS pub_year,"
                "d.authors         AS authors,"
                "d.keywords        AS keywords,"
                "d.tags            AS tags,"
                "d.derived_bibtex  AS derived_bibtex")
            query = re.sub(':select_attributes', select, query)

            # Substitute order_by and query_limit as well
            query = re.sub(':order_by', '{order_attr} {order_dir}'.format(
                order_attr=query_order_attr,
                order_dir=query_order_dir
            ), query)
            query = re.sub(':query_limit', '{offset},{limit}'.format(
                offset=query_offset,
                limit=query_limit
            ), query)

        log.info("Querying documents by profile_ids and field_ids\n"
                 "\t| profile_ids: {profile_ids}\n"
                 "\t| field_ids: {field_ids}\n"
                 "\t| order_attr: {order_attr}\n"
                 "\t| order_dir: {order_dir}\n"
                 "\t| offset: {offset}\n"
                 "\t| limit: {limit}\n"
                 "\t| only_count: {only_count}".format(
            profile_ids=profile_ids_string,
            field_ids=field_ids_string,
            order_attr=query_order_attr,
            order_dir=query_order_dir,
            offset=query_offset,
            limit=query_limit,
            only_count=only_count
        ))
        log.debug("Query: {query}".format(query=query))

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            return conn.execute(query).fetchall()
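An illustrative substitution pass over a hypothetical query template; the real templates live in the referenced sql files, so every name here is an assumption:

import re

template = ("SELECT :select_attributes FROM cache_document cd "
            "JOIN document d ON d.id = cd.document_id "
            "WHERE cd.profile_id IN :profile_ids "
            "ORDER BY :order_by LIMIT :query_limit")
query = re.sub(':select_attributes', 'DISTINCT cd.id AS id', template)
query = re.sub(':profile_ids', "('42')", query)
query = re.sub(':order_by', 'd.pub_year DESC', query)
query = re.sub(':query_limit', '0,20', query)
# -> SELECT DISTINCT cd.id AS id FROM cache_document cd JOIN document d
#    ON d.id = cd.document_id WHERE cd.profile_id IN ('42')
#    ORDER BY d.pub_year DESC LIMIT 0,20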
Example #24
    def update_profiles(self, profiles: [Profile]):
        """
        Given a profile list, this method replaces the profiles in the database with new ones
        :param profiles:
        :return:
        """

        def insert_profile(conn: Connection, insert: str, p: Profile):
            u, _ = unify_profile_name(p.first_name, p.last_name)
            b64u = generate_id(u)
            conn.execute(
                insert,
                (
                    sanitize_text(p.identifier),
                    b64u,
                    sanitize_text(p.first_name),
                    sanitize_text(p.last_name),
                    sanitize_text(p.display_name),
                    sanitize_text(p.link),
                ),
            )

        # If there's nothing to insert, abort
        if len(profiles) == 0:
            return None

        delete = self._replace_profiles[0]
        insert = self._replace_profiles[1]
        temp = self._replace_profiles[2]
        temp_insert = self._replace_profiles[3]
        update = self._replace_profiles[4]
        temp_drop = self._replace_profiles[5]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Deleting existing profiles")
            conn.execute(delete)

            log.debug("Inserting new profiles")
            for profile in profiles:
                insert_profile(conn, insert, profile)

            log.debug("Creating temporary table")
            conn.execute(temp)

            log.debug("Spooling data into temporary table")
            conn.execute(temp_insert)

            log.debug("Creating profile links")
            conn.execute(update)

            log.debug("Dropping temporary table")
            conn.execute(temp_drop)

        log.info("Profiles have been updated")
Example #25
    def update_documents(self, docs: [Document]):
        """
        Given a document list, this method replaces the documents in the database with new ones
        :param docs:
        :return:
        """

        def insert_doc(conn: Connection, insert: str, doc: Document):
            u, _ = unify_document_title(doc.core_title)
            b64u = generate_id(u)
            author_names = map(lambda x: "{first} {last}".format(first=x[0], last=x[1]), doc.core_authors)

            # Create strings
            authors_string = ", ".join(author_names)
            keywords_string = ", ".join(doc.core_keywords)
            tags_string = ", ".join(doc.tags)

            # Create bibtex
            bibtex = generate_bibtex(doc)

            # Insert tuple
            conn.execute(
                insert,
                (
                    sanitize_text(doc.core_id),
                    b64u,
                    sanitize_text(doc.core_profile_id),
                    sanitize_text(doc.core_title),
                    sanitize_text(doc.core_type),
                    datetime_to_sqltime(doc.core_created),
                    datetime_to_sqltime(doc.core_last_modified),
                    sanitize_text(doc.core_abstract),
                    sanitize_text(doc.core_source),
                    doc.core_year,
                    sanitize_text(authors_string),
                    sanitize_text(keywords_string),
                    sanitize_text(tags_string),
                    sanitize_text(doc.doc_website),
                    sanitize_text(doc.conf_website),
                    doc.conf_month,
                    sanitize_text(doc.conf_pages),
                    sanitize_text(doc.conf_city),
                    sanitize_text(bibtex),
                ),
            )

        # If there's nothing to insert, abort
        if len(docs) == 0:
            return None

        delete = self._replace_documents[0]
        insert = self._replace_documents[1]
        temp = self._replace_documents[2]
        temp_insert = self._replace_documents[3]
        update = self._replace_documents[4]
        temp_drop = self._replace_documents[5]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Deleting existing documents")
            conn.execute(delete)

            log.debug("Inserting new documents")
            for doc in docs:
                insert_doc(conn, insert, doc)

            log.debug("Creating temporary table")
            conn.execute(temp)

            log.debug("Spooling data into temporary table")
            conn.execute(temp_insert)

            log.debug("Creating document links")
            conn.execute(update)

            log.debug("Dropping temporary table")
            conn.execute(temp_drop)

        log.info("Documents have been updated")
Example #26
    def get_documents(self):
        log.info('The route GET /documents/ has been triggered')

        # Default parameters
        profile_ids = ''
        field_ids = ''
        limit = 0
        offset = 0
        order_dir = ""
        order_attr = ""
        only_count = False

        # Set passed query parameters if existing
        if 'profile-ids' in request.args:
            profile_ids = request.args['profile-ids'].split(',')
            log.debug('Query parameter "profile-ids" = %s' % profile_ids)
        if 'field-ids' in request.args:
            field_ids = request.args['field-ids'].split(',')
            log.debug('Query parameter "field-ids" = %s' % field_ids)
        if 'limit' in request.args:
            limit = int(request.args['limit'])
            log.debug('Query parameter "limit" = %s' % limit)
        if 'offset' in request.args:
            offset = int(request.args['offset'])
            log.debug('Query parameter "offset" = %s' % offset)
        if 'order-dir' in request.args:
            order_dir = request.args['order-dir']
            log.debug('Query parameter "order-dir" = %s' % order_dir)
        if 'order-attr' in request.args:
            order_attr = request.args['order-attr']
            log.debug('Query parameter "order-attr" = %s' % order_attr)
        if 'only-count' in request.args:
            # bool('false') is True, so compare against the literal string instead
            only_count = request.args['only-count'].lower() == 'true'
            log.debug('Query parameter "only-count" = %s' % only_count)

        # Trigger the respective methods
        data = self._data_controller.api_data.get_documents_by_profile_ids_and_field_ids(
            profile_ids=profile_ids,
            field_ids=field_ids,
            order_attr=order_attr,
            order_dir=order_dir,
            offset=offset,
            limit=limit,
            only_count=only_count)

        if only_count:
            # Extract count
            response = []
            for document in data:
                document_dict = dict(document.items())
                response.append(document_dict)

            if len(response) > 0:
                return json.dumps(response[0], cls=DefaultEncoder)
            else:
                return json.dumps({"cnt": 0}, cls=DefaultEncoder)
        else:
            # Serialize documents
            response = []
            for document in data:
                document_dict = dict(document.items())
                response.append(document_dict)
            return json.dumps(response, cls=DefaultEncoder)
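The string comparison for 'only-count' (and for 'slim' in get_profiles) is deliberate: bool() on any non-empty string is True, so a query such as ?only-count=false would otherwise still enable counting.

# Demonstration: bool() cannot parse boolean query strings
assert bool('false') is True
assert ('false'.lower() == 'true') is False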
Example #27
    def get_documents(self):
        log.info('The route GET /documents/ has been triggered')

        # Default parameters
        profile_ids = ''
        field_ids = ''
        limit = 0
        offset = 0
        order_dir = ""
        order_attr = ""
        only_count = False

        # Set passed query parameters if existing
        if 'profile-ids' in request.args:
            profile_ids = request.args['profile-ids'].split(',')
            log.debug('Query parameter "profile-ids" = %s' % profile_ids)
        if 'field-ids' in request.args:
            field_ids = request.args['field-ids'].split(',')
            log.debug('Query parameter "field-ids" = %s' % field_ids)
        if 'limit' in request.args:
            limit = int(request.args['limit'])
            log.debug('Query parameter "limit" = %s' % limit)
        if 'offset' in request.args:
            offset = int(request.args['offset'])
            log.debug('Query parameter "offset" = %s' % offset)
        if 'order-dir' in request.args:
            order_dir = request.args['order-dir']
            log.debug('Query parameter "order-dir" = %s' % order_dir)
        if 'order-attr' in request.args:
            order_attr = request.args['order-attr']
            log.debug('Query parameter "order-attr" = %s' % order_attr)
        if 'only-count' in request.args:
            # bool('false') is True, so compare against the literal string instead
            only_count = request.args['only-count'].lower() == 'true'
            log.debug('Query parameter "only-count" = %s' % only_count)

        # Trigger the respective methods
        data = self._data_controller.api_data.get_documents_by_profile_ids_and_field_ids(
            profile_ids=profile_ids,
            field_ids=field_ids,
            order_attr=order_attr,
            order_dir=order_dir,
            offset=offset,
            limit=limit,
            only_count=only_count
        )

        if only_count:
            # Extract count
            response = []
            for document in data:
                document_dict = dict(document.items())
                response.append(document_dict)

            if len(response) > 0:
                return json.dumps(response[0], cls=DefaultEncoder)
            else:
                return json.dumps({"cnt": 0}, cls=DefaultEncoder)
        else:
            # Serialize documents
            response = []
            for document in data:
                document_dict = dict(document.items())
                response.append(document_dict)
            return json.dumps(response, cls=DefaultEncoder)