Example #1
    def get_all_records_columns_server(self,
                                       sort_by='_id',
                                       sort_direction=-1,
                                       projection=None,
                                       filter_by=None,
                                       search_term=str(),
                                       limit=0,
                                       skip=0):
        # use None defaults to avoid shared mutable default arguments;
        # this method adds keys to filter_by below
        projection = projection if projection is not None else dict()
        filter_by = filter_by if filter_by is not None else dict()

        filter_by["deleted"] = data_utils.get_not_deleted_flag()

        # 'name' seems to be the only reasonable field to restrict searching; other fields are resolved
        filter_by["name"] = {'$regex': search_term, "$options": 'i'}

        if self.profile_id:
            filter_by["profile_id"] = self.profile_id

        if skip > 0:
            records = self.get_collection_handle().find(
                filter_by,
                projection).sort([[sort_by,
                                   sort_direction]]).skip(skip).limit(limit)
        else:
            records = self.get_collection_handle().find(
                filter_by, projection).sort([[sort_by,
                                              sort_direction]]).limit(limit)

        return cursor_to_list(records)
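A minimal, standalone sketch of the query construction above; it runs without a MongoDB connection. NOT_DELETED is an assumed stand-in for data_utils.get_not_deleted_flag():

NOT_DELETED = "0"  # assumption: sentinel value for non-deleted records

def build_server_side_query(search_term="", profile_id=None,
                            sort_by="_id", sort_direction=-1):
    # mirror the filter built by get_all_records_columns_server
    filter_by = {
        "deleted": NOT_DELETED,
        # case-insensitive substring match on 'name'
        "name": {"$regex": search_term, "$options": "i"},
    }
    if profile_id:
        filter_by["profile_id"] = profile_id

    # pymongo also accepts the sort spec as a list of (key, direction) pairs
    sort_spec = [(sort_by, sort_direction)]
    return filter_by, sort_spec

print(build_server_side_query(search_term="yeast", profile_id="p1"))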
Example #2
    def get_counts(self):
        """
        Method to return current numbers of Publication, Person, Data,
        Sample, Submission and Annotation objects in the given profile
        :return: Dictionary containing the data
        """
        num_dict = dict(num_pub="publication",
                        num_person="person",
                        num_data="datafile",
                        num_sample="sample",
                        num_submission="submission",
                        num_annotation="annotation")

        status = dict()

        for k, v in num_dict.items():
            if handle_dict.get(v, None):
                status[k] = handle_dict.get(v).count({
                    'profile_id': self.profile_id,
                    'deleted': data_utils.get_not_deleted_flag()
                })

        return status
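The counting loop above can be exercised without MongoDB by stubbing the collection handles. FakeCollection is an illustrative stand-in mimicking the legacy pymongo Collection.count(filter) signature used here (removed in pymongo 4 in favour of count_documents):

class FakeCollection:
    def __init__(self, n):
        self._n = n

    def count(self, query):  # legacy pymongo signature, as used above
        return self._n

handle_dict = {"publication": FakeCollection(3), "sample": FakeCollection(12)}
num_dict = dict(num_pub="publication", num_sample="sample", num_person="person")

status = dict()
for k, v in num_dict.items():
    if handle_dict.get(v, None):  # skip components without a registered handle
        status[k] = handle_dict.get(v).count({})

print(status)  # {'num_pub': 3, 'num_sample': 12} -- 'num_person' has no handle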
Example #3
    def get_all_records(self, sort_by='_id', sort_direction=-1, **kwargs):
        doc = dict(deleted=data_utils.get_not_deleted_flag())
        if self.profile_id:
            doc["profile_id"] = self.profile_id

        return cursor_to_list(self.get_collection_handle().find(doc).sort(
            [[sort_by, sort_direction]]))
Example #4
    def get_by_file_id(self, file_id=None):
        docs = None
        if file_id:
            docs = self.get_collection_handle().find_one(
                {"file_id": file_id, "deleted": data_utils.get_not_deleted_flag()})

        return docs
Example #5
    def source_count(self):
        return handle_dict.get("source").count({
            'profile_id': self.profile_id,
            'deleted': data_utils.get_not_deleted_flag()
        })
Example #6
    def get_by_file_name_id(self, file_id=None):
        docs = None
        if file_id:
            docs = self.get_collection_handle().find_one(
                {
                    "_id": ObjectId(file_id),
                    "deleted": data_utils.get_not_deleted_flag()
                }, {"name": 1})

        return docs
Example #7
    def get_all_records_columns(self,
                                sort_by='_id',
                                sort_direction=-1,
                                projection=None,
                                filter_by=None):
        # use None defaults to avoid shared mutable default arguments;
        # this method adds keys to filter_by below
        projection = projection if projection is not None else dict()
        filter_by = filter_by if filter_by is not None else dict()
        filter_by["deleted"] = data_utils.get_not_deleted_flag()
        if self.profile_id:
            filter_by["profile_id"] = self.profile_id

        return cursor_to_list(self.get_collection_handle().find(
            filter_by, projection).sort([[sort_by, sort_direction]]))
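For clarity on what the 'projection' argument selects, here is a simplified, standalone imitation of MongoDB's inclusion projection (in practice the server applies it inside find(filter, projection)):

def apply_projection(doc, projection):
    # keep only fields marked with 1; MongoDB includes _id unless it is
    # explicitly excluded with {'_id': 0}
    out = {k: v for k, v in doc.items() if projection.get(k) == 1}
    if projection.get("_id", 1):
        out["_id"] = doc.get("_id")
    return out

doc = {"_id": 1, "name": "reads.fastq", "deleted": "0", "file_hash": "xyz"}
print(apply_projection(doc, {"name": 1}))  # {'name': 'reads.fastq', '_id': 1}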
Example #8
    def get_for_user(self, user=None):
        if not user:
            user = ThreadLocal.get_current_user().id

        docs = self.get_collection_handle().find({"user_id": user, "deleted": data_utils.get_not_deleted_flag()}).sort(
            [['_id', -1]])

        if docs:
            return docs
        else:
            return None
Example #9
    def get_component_count(self):
        count = 0
        if self.get_collection_handle():
            count = self.get_collection_handle().count({
                'profile_id': self.profile_id,
                'deleted': data_utils.get_not_deleted_flag()
            })

        return count
Example #10
    def save_record(self, auto_fields=dict(), **kwargs):
        fields = dict()

        # set auto fields
        if auto_fields:
            fields = DecoupleFormSubmission(auto_fields, self.get_schema().get("schema")).get_schema_fields_updated()

        # target_id is set for updates and defaults to an empty string for inserts
        target_id = kwargs.pop("target_id", str())

        # set system fields
        system_fields = dict(
            date_modified=data_utils.get_datetime(),
            deleted=data_utils.get_not_deleted_flag()
        )

        if not target_id:
            system_fields["date_created"] = data_utils.get_datetime()
            system_fields["profile_id"] = self.profile_id

        # extend system fields
        for k, v in kwargs.items():
            system_fields[k] = v

        # add system fields to 'fields' and set default values - insert mode only
        for f in self.get_schema().get("schema"):
            f_id = f.id.split(".")[-1]

            if f_id in system_fields:
                fields[f_id] = system_fields.get(f_id)

            if not target_id and f_id not in fields:
                fields[f_id] = data_utils.default_jsontype(f.type)

        # if True, the database action (save/update) is never performed;
        # instead, the validated 'fields' are returned
        validate_only = kwargs.pop("validate_only", False)

        # prefer this explicit test to safeguard against the various values
        # 'validate_only' can assume
        if validate_only == True:
            return fields
        else:
            if target_id:
                self.get_collection_handle().update(
                    {"_id": ObjectId(target_id)},
                    {'$set': fields})
            else:
                doc = self.get_collection_handle().insert(fields)
                target_id = str(doc)

            # return saved record
            rec = self.get_record(target_id)

            return rec
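A standalone sketch of the field-assembly step above, with a toy schema in place of self.get_schema() and a hypothetical default_jsontype standing in for data_utils.default_jsontype:

def default_jsontype(ftype):
    # assumption: illustrative type-to-default mapping
    return {"string": str(), "array": list(), "boolean": False}.get(ftype, str())

schema = [dict(id="copo.sample.name", type="string"),
          dict(id="copo.sample.characteristics", type="array"),
          dict(id="copo.sample.deleted", type="string")]

def assemble_fields(fields, system_fields, target_id=str()):
    for f in schema:
        f_id = f["id"].split(".")[-1]
        if f_id in system_fields:  # system fields take precedence
            fields[f_id] = system_fields[f_id]
        if not target_id and f_id not in fields:  # defaults on insert only
            fields[f_id] = default_jsontype(f["type"])
    return fields

# insert mode: missing fields get type-appropriate defaults
print(assemble_fields(dict(), {"deleted": "0"}))
# {'name': '', 'characteristics': [], 'deleted': '0'}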
Example #11
    def save_record(self, auto_fields=dict(), **kwargs):
        fields = dict()
        schema = kwargs.get("schema", list()) or self.get_component_schema()

        # set auto fields
        if auto_fields:
            fields = DecoupleFormSubmission(
                auto_fields, schema).get_schema_fields_updated_dict()

        # target_id is set for updates and defaults to an empty string for inserts
        target_id = kwargs.pop("target_id", str())

        # set system fields
        system_fields = dict(date_modified=data_utils.get_datetime(),
                             deleted=data_utils.get_not_deleted_flag())

        if not target_id:
            system_fields["date_created"] = data_utils.get_datetime()
            system_fields["profile_id"] = self.profile_id

        # extend system fields
        for k, v in kwargs.items():
            system_fields[k] = v

        # add system fields to 'fields' and set default values - insert mode only
        for f in schema:
            f_id = f["id"].split(".")[-1]

            if f_id in system_fields:
                fields[f_id] = system_fields.get(f_id)

            if not target_id and f_id not in fields:
                fields[f_id] = data_utils.default_jsontype(f["type"])

        # if True, then the database action (to save/update) is never performed, but validated 'fields' are returned
        validate_only = kwargs.pop("validate_only", False)

        if validate_only is True:
            return fields
        else:
            if target_id:
                self.get_collection_handle().update(
                    {"_id": ObjectId(target_id)}, {'$set': fields})
            else:
                doc = self.get_collection_handle().insert(fields)
                target_id = str(doc)

            # return saved record
            rec = self.get_record(target_id)

            return rec
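This variant differs from the previous one mainly in sourcing its schema: a schema passed via kwargs takes precedence over the component default, and schema fields are plain dicts rather than objects. A tiny standalone illustration of the precedence rule:

def resolve_schema(kwargs, component_default):
    # an explicitly supplied schema wins; fall back to the component schema
    return kwargs.get("schema", list()) or component_default

print(resolve_schema(dict(), ["component schema"]))                   # ['component schema']
print(resolve_schema(dict(schema=["custom"]), ["component schema"]))  # ['custom']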
Example #12
    def get_for_user(self, user=None):
        if not user:
            user = data_utils.get_current_user().id
        docs = self.get_collection_handle().find({
            "user_id": user,
            "deleted": data_utils.get_not_deleted_flag()
        }).sort([['_id', -1]])

        # as in the ThreadLocal variant above, the cursor is always truthy,
        # so this check never actually returns None
        if docs:
            return docs
        else:
            return None
Example #13
    def get_counts(self):
        """
        Method to return current numbers of Publication, Person, Data,
        Sample, Submission and Annotation objects in the given profile
        :return: Dictionary containing the data
        """

        num_dict = dict(num_pub="publication",
                        num_person="person",
                        num_data="datafile",
                        num_sample="sample",
                        num_submission="submission",
                        num_annotation="annotation"
                        )

        status = dict()

        for k, v in num_dict.items():
            if handle_dict.get(v, None):
                status[k] = handle_dict.get(v).count(
                    {'profile_id': self.profile_id, 'deleted': data_utils.get_not_deleted_flag()})

        return status
Example #14
    def get_shared_for_user(self, user=None):
        # get profiles shared with user
        if not user:
            user = data_utils.get_current_user().id
        groups = CopoGroup().Group.find({'member_ids': str(user)})

        p_list = list()
        for g in groups:
            gp = dict(g)
            p_list.extend(gp['shared_profile_ids'])
        # remove duplicates
        # p_list = list(set(p_list))
        docs = self.get_collection_handle().find({
            "_id": {"$in": p_list},
            "deleted": data_utils.get_not_deleted_flag()
        })
        out = list(docs)
        for d in out:
            d['shared'] = True

        return out
Example #15
    def get_all_records(self):
        doc = {'profile_id': self.profile_id, 'deleted': data_utils.get_not_deleted_flag()}
        return cursor_to_list(self.RemoteFileCollection.find(doc))
Example #16
    def get_all_records(self):
        doc = {
            'profile_id': self.profile_id,
            'deleted': data_utils.get_not_deleted_flag()
        }
        return cursor_to_list(self.RemoteFileCollection.find(doc))
Example #17
    def get_by_datafile(self, datafile_id):
        doc = {
            'datafile_id': ObjectId(datafile_id),
            'deleted': data_utils.get_not_deleted_flag()
        }
        return cursor_to_list(self.RemoteFileCollection.find(doc))
Example #18
    def get_all_records(self, sort_by='_id', sort_direction=-1):
        doc = dict(deleted=data_utils.get_not_deleted_flag())
        if self.profile_id:
            doc["profile_id"] = self.profile_id

        return cursor_to_list(self.get_collection_handle().find(doc).sort([[sort_by, sort_direction]]))
Example #19
    def get_by_datafile(self, datafile_id):
        doc = {'datafile_id': ObjectId(datafile_id), 'deleted': data_utils.get_not_deleted_flag()}
        return cursor_to_list(self.RemoteFileCollection.find(doc))
Example #20
    def get_sra_samples(self, submission_location=str()):
        """
        function retrieves study samples and presents them in a format suitable for building an SRA sample set
        :param submission_location:
        :return:
        """

        sra_samples = list()

        # get datafiles
        datafiles = cursor_to_list(ghlper.get_datafiles_handle().find(
            {
                "description_token": self.description_token,
                'deleted': data_utils.get_not_deleted_flag()
            }, {
                '_id': 1,
                'file_location': 1,
                "description.attributes": 1,
                "name": 1,
                "file_hash": 1
            }))

        if not len(datafiles):
            self.__converter_errors.append("No datafiles found in submission!")
            return sra_samples

        df = pd.DataFrame(datafiles)
        df['file_id'] = df._id.astype(str)
        df['file_path'] = df['file_location'].fillna('')
        df['upload_status'] = False

        df = df[['file_id', 'file_path', 'upload_status']]
        bundle = list(df.file_id)
        bundle_meta = df.to_dict('records')

        submission_record = dict(bundle=bundle, bundle_meta=bundle_meta)

        ghlper.get_submission_handle().update(
            {"_id": ObjectId(self.submission_id)}, {'$set': submission_record})

        samples_id = list()
        df_attributes = []  # datafiles attributes

        for datafile in datafiles:
            datafile_attributes = list(datafile.get("description", dict()).get(
                "attributes", dict()).values())

            new_dict = dict()
            for d in datafile_attributes:
                new_dict.update(d)

            new_dict['datafile_id'] = str(datafile['_id'])
            new_dict['datafile_name'] = datafile.get('name', str())
            new_dict['datafile_hash'] = datafile.get('file_hash', str())
            new_dict['datafile_location'] = datafile.get('file_location', str())

            df_attributes.append(new_dict)

        # process datafiles attributes
        df_attributes_df = pd.DataFrame(df_attributes)
        df_columns = df_attributes_df.columns

        # replace null values
        for k in df_columns:
            df_attributes_df[k].fillna('', inplace=True)

        if 'study_samples' in df_columns:
            df_attributes_df['study_samples'] = df_attributes_df['study_samples'].apply(
                lambda x: x[0] if isinstance(x, list) else x.split(",")[-1])
            samples_id = list(df_attributes_df['study_samples'].unique())
            samples_id = [x for x in samples_id if x]

        if not samples_id:
            self.__converter_errors.append(
                "No samples associated with datafiles!")
            return sra_samples

        file_path = os.path.join(submission_location, "datafiles.csv")
        df_attributes_df.to_csv(path_or_buf=file_path, index=False)

        samples_id_object_list = [ObjectId(sample_id) for sample_id in samples_id]

        sample_records = ghlper.get_samples_handle().find(
            {"_id": {"$in": samples_id_object_list}})

        # get sources
        sources = ghlper.get_sources_handle().find({
            "profile_id": self.profile_id,
            'deleted': data_utils.get_not_deleted_flag()
        })

        sra_sources = dict()

        for source in sources:
            sra_source = dict()
            sra_sources[str(source["_id"])] = sra_source

            sra_source["name"] = source["name"]
            sra_source["taxon_id"] = source.get("organism", dict()).get(
                'termAccession', str())
            if 'NCBITaxon_' in sra_source["taxon_id"]:
                sra_source["taxon_id"] = sra_source["taxon_id"].split(
                    'NCBITaxon_')[-1]

            sra_source["scientific_name"] = source.get("organism", dict()).get(
                'annotationValue', str())
            sra_source['attributes'] = self.get_attributes(
                source.get("characteristics", list()))
            sra_source['attributes'] = sra_source['attributes'] + self.get_attributes(
                source.get("factorValues", list()))

        for sample in sample_records:
            sra_sample = dict()
            sra_sample['sample_id'] = str(sample['_id'])
            sra_sample['name'] = sample['name']
            sra_sample['attributes'] = self.get_attributes(
                sample.get("characteristics", list()))
            sra_sample['attributes'] = sra_sample['attributes'] + self.get_attributes(
                sample.get("factorValues", list()))

            # retrieve sample source
            source_id = sample.get("derivesFrom", list())
            source_id = source_id[0] if source_id else ''
            sample_source = sra_sources.get(source_id, dict())

            if sample_source:
                sra_sample['attributes'].append(
                    dict(tag="Source Name",
                         value=sample_source.get("name", str())))
            else:
                self.__converter_errors.append("Sample: " + sample['name'] +
                                               " has no source information")

            if sample_source.get("taxon_id", str()):
                sra_sample['taxon_id'] = sample_source.get("taxon_id", str())
            else:
                self.__converter_errors.append(
                    "Sample: " + sample['name'] +
                    " has no TAXON_ID. Please make sure an organism has "
                    "been set for the source of this sample from the NCBITAXON ontology."
                )

            if sample_source.get("scientific_name", str()):
                sra_sample['scientific_name'] = sample_source.get(
                    "scientific_name", str())
            else:
                self.__converter_errors.append(
                    "Sample: " + sample['name'] +
                    " has no SCIENTIFIC_NAME. Please make sure an organism has "
                    "been set for the source of this sample from an ontology.")

            if sample_source.get("attributes", list()):
                sra_sample['attributes'] = sra_sample[
                    'attributes'] + sample_source.get("attributes", list())

            sra_samples.append(sra_sample)

        return sra_samples
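A standalone check of the NCBITaxon accession handling above; an ontology term accession such as 'NCBITaxon_9606' is reduced to the bare taxon id (the example accession is illustrative):

def extract_taxon_id(term_accession):
    # mirrors the split performed on sra_source['taxon_id'] above
    if 'NCBITaxon_' in term_accession:
        return term_accession.split('NCBITaxon_')[-1]
    return term_accession

assert extract_taxon_id('NCBITaxon_9606') == '9606'
assert extract_taxon_id('9606') == '9606'  # already-bare ids pass through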
Example #21
def generate_server_side_table_records(profile_id=str(), component=str(), request=dict()):
    # function generates component records for building a UI table using server-side processing
    # - please note that for effective data display,
    # all array and object-type fields (e.g., characteristics) are deferred to sub-table display.
    # please define such in the schema as "show_in_table": false and "show_as_attribute": true

    data_set = list()

    n_size = int(request.get("length", 10))  # assumes 10 records per page if length not set
    draw = int(request.get("draw", 1))
    start = int(request.get("start", 0))

    # instantiate data access object
    da_object = DAComponent(profile_id, component)

    return_dict = dict()

    records_total = da_object.get_collection_handle().count(
        {'profile_id': profile_id, 'deleted': data_utils.get_not_deleted_flag()})

    # retrieve and process records
    filter_by = dict()

    if component == "datafile":
        # get all active bundles in the profile
        existing_bundles = Description().get_all_records_columns(projection=dict(_id=1),
                                                                 filter_by=dict(profile_id=profile_id,
                                                                                component=component))
        existing_bundles = [str(x["_id"]) for x in existing_bundles]
        records_total = da_object.get_collection_handle().count({"$and": [
            {"profile_id": profile_id, 'deleted': data_utils.get_not_deleted_flag()},
            {"$or": [
                {"description_token": {"$in": [None, False, ""]}},
                {"description_token": {"$nin": existing_bundles}}]}
        ]})

        filter_by = {"$or": [
            {"description_token": {"$in": [None, False, ""]}},
            {"description_token": {"$nin": existing_bundles}}]}

    # get and filter schema elements based on displayable columns
    schema = [x for x in da_object.get_schema().get("schema_dict") if x.get("show_in_table", True)]

    # build db column projection
    projection = [(x["id"].split(".")[-1], 1) for x in schema]

    # order by
    sort_by = request.get('order[0][column]', '0')
    sort_by = request.get('columns[' + sort_by + '][data]', '')
    sort_direction = request.get('order[0][dir]', 'asc')

    sort_by = '_id' if not sort_by else sort_by
    sort_direction = 1 if sort_direction == 'asc' else -1

    # search
    search_term = request.get('search[value]', '').strip()

    records = da_object.get_all_records_columns_server(sort_by=sort_by, sort_direction=sort_direction,
                                                       search_term=search_term, projection=dict(projection),
                                                       limit=n_size, skip=start, filter_by=filter_by)

    records_filtered = records_total

    if search_term:
        records_filtered = da_object.get_collection_handle().count(
            {'profile_id': profile_id, 'deleted': data_utils.get_not_deleted_flag(),
             'name': {'$regex': search_term, "$options": 'i'}})

    if records:
        df = pd.DataFrame(records)

        df['record_id'] = df._id.astype(str)
        df["DT_RowId"] = df.record_id
        df.DT_RowId = 'row_' + df.DT_RowId
        df = df.drop('_id', axis='columns')

        for x in schema:
            x["id"] = x["id"].split(".")[-1]
            df[x["id"]] = df[x["id"]].apply(resolve_control_output_apply, args=(x,)).astype(str)

        data_set = df.to_dict('records')

    return_dict["records_total"] = records_total
    return_dict["records_filtered"] = records_filtered
    return_dict["data_set"] = data_set
    return_dict["draw"] = draw

    return return_dict
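A standalone sketch of how the flat DataTables request keys above resolve to a sort specification; the request dict mimics the parameter names DataTables sends for server-side processing:

request = {
    "draw": "2", "start": "10", "length": "10",
    "order[0][column]": "1", "order[0][dir]": "desc",
    "columns[1][data]": "name",
    "search[value]": "fastq",
}

# resolve the ordered column index to its field name, as in the function above
sort_by = request.get('order[0][column]', '0')
sort_by = request.get('columns[' + sort_by + '][data]', '')
sort_direction = request.get('order[0][dir]', 'asc')

sort_by = '_id' if not sort_by else sort_by
sort_direction = 1 if sort_direction == 'asc' else -1

print(sort_by, sort_direction)  # name -1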
Example #22
    def source_count(self):
        return handle_dict.get("source").count(
            {'profile_id': self.profile_id, 'deleted': data_utils.get_not_deleted_flag()})
Example #23
    def perform_datafile_pairing(self, next_stage_index):
        """
        stage callback function: determines whether datafile pairing should be performed, given the 'library_layout'
        :param next_stage_index:
        :return:
        """

        description = Description().GET(self.__wzh.description_token)
        stages = description["stages"]
        attributes = description["attributes"]
        meta = description.get("meta", dict())

        # validate stage
        stage = dict()

        if next_stage_index < len(stages):
            stage = stages[next_stage_index]

        # first, target repository
        # add a repo to this list if it requires datafile pairing
        relevant_repos = ["ena"]

        target_repository = attributes.get(
            "target_repository", dict()).get("deposition_context", str())

        if target_repository not in relevant_repos:
            # no items to pair, clear any previous pairing information
            self.remove_pairing_info(stage["ref"], attributes, meta)

            return False

        # get records in bundle
        records = cursor_to_list(DataFile().get_collection_handle().find(
            {
                "$and": [{
                    "description_token": self.__wzh.description_token,
                    'deleted': d_utils.get_not_deleted_flag()
                }, {
                    'description.attributes': {
                        "$exists": True
                    }
                }]
            }, {
                'description.attributes': 1,
                'name': 1
            }))

        if not records:
            # no items to pair, clear any previous pairing information
            self.remove_pairing_info(stage["ref"], attributes, meta)

            return False

        for rec in records:
            datafile_attributes = list(
                rec['description'].get('attributes', dict()).values())

            new_dict = dict()
            for d in datafile_attributes:
                new_dict.update(d)

            rec['attributes'] = new_dict
            rec['pairing'] = rec['attributes'].get('library_layout', '').upper()

        df = pd.DataFrame(records)
        df._id = df['_id'].astype(str)
        df.index = df._id

        df = df[df['pairing'] == 'PAIRED']

        if not len(df):
            # no items to pair, clear any previous pairing information
            self.remove_pairing_info(stage["ref"], attributes, meta)

            return False

        # remove extraneous columns
        df = df.drop(columns=['description'])

        if len(df) % 2 != 0:
            stage["error"] = "Pairing requires even number of datafiles!"
            stage["refresh_wizard"] = True
        else:
            # get previous pairing candidates
            paired_candidates_old = meta.get(
                stage["ref"] + "_paired_candidates", list())
            paired_candidates = list(df.index)

            paired_candidates_old.sort()
            paired_candidates.sort()

            if not paired_candidates_old == paired_candidates:
                stage["refresh_wizard"] = True

            # if there's a valid stored map, use it
            stage_data = list()
            saved_copy = attributes.get(stage["ref"], list())

            if saved_copy:
                stored_pairs_df = pd.DataFrame(saved_copy)
                stored_pairs_list = list(stored_pairs_df._id) + list(
                    stored_pairs_df._id2)
                stored_pairs_list.sort()

                if stored_pairs_list == paired_candidates:
                    df_dict = df.to_dict()
                    df_dict = df_dict["name"]

                    stored_pairs_df["name"] = stored_pairs_df['_id'].apply(
                        lambda x: str(df_dict[x]))
                    stored_pairs_df["name2"] = stored_pairs_df['_id2'].apply(
                        lambda x: str(df_dict[x]))

                    df_result = stored_pairs_df[['name', 'name2']]
                    df_result.columns = ['file1', 'file2']

                    stage_data = df_result.to_dict('records')

            if not stage_data:
                # define fresh pairing map

                # sort by file name to reflect pairing
                df = df.sort_values(by=['name'])

                s_even = df._id.iloc[1::2]
                s_odd = df._id.iloc[::2]
                df_odd = df[df.index.isin(s_odd)].copy()
                df_even = df[df.index.isin(s_even)].copy()
                df_even['_id2'] = df_even['_id']
                df_even['name2'] = df_even['name']
                df_even = df_even[['_id2', 'name2']]
                df_odd = df_odd[['_id', 'name']]
                df_odd.index = range(0, len(df_odd))
                df_even.index = range(0, len(df_even))
                df_result = pd.concat([df_odd, df_even],
                                      axis=1).reindex(df_odd.index)
                saved_copy = df_result[['_id', '_id2']].to_dict('records')
                df_result = df_result[['name', 'name2']]
                df_result.columns = ['file1', 'file2']

                stage_data = df_result.to_dict('records')

            stage["data"] = stage_data

            # save state
            attributes[stage["ref"]] = saved_copy
            meta[stage["ref"] + "_paired_candidates"] = paired_candidates

            save_dict = dict(attributes=attributes, meta=meta)
            Description().edit_description(self.__wzh.description_token,
                                           save_dict)

            stage["message"] = self.__wzh.wiz_message[
                "datafiles_pairing_message"]["text"]

        return stage
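A simplified, standalone demo of the fresh pairing map built above: after sorting by file name, rows at even positions pair with the rows immediately after them (file names here are made up):

import pandas as pd

df = pd.DataFrame({"_id": ["a", "b", "c", "d"],
                   "name": ["s1_R1.fastq", "s1_R2.fastq",
                            "s2_R1.fastq", "s2_R2.fastq"]})

# sort by file name so that mates end up adjacent
df = df.sort_values(by=['name'])

df_odd = df.iloc[::2].reset_index(drop=True)    # first file of each pair
df_even = df.iloc[1::2].reset_index(drop=True)  # second file of each pair

df_result = pd.concat([df_odd['name'], df_even['name'].rename('name2')], axis=1)
df_result.columns = ['file1', 'file2']

print(df_result.to_dict('records'))
# [{'file1': 's1_R1.fastq', 'file2': 's1_R2.fastq'},
#  {'file1': 's2_R1.fastq', 'file2': 's2_R2.fastq'}]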