Ejemplo n.º 1
0
    def get_lookup_type(self):
        """Retrieve lookup records for this data source, matched by accession or search term."""
        projection = dict(label=1, accession=1, description=1)
        filter_by = dict(type=self.data_source)
        records = []

        # nothing to look up without an accession or a search term
        if not (self.accession or self.search_term):
            return records

        if self.accession:
            if isinstance(self.accession, str):
                accessions = [self.accession]
            else:
                accessions = list(self.accession)
            filter_by["accession"] = {'$in': accessions}
        else:
            # case-insensitive substring match on the label
            filter_by["label"] = {'$regex': self.search_term, "$options": 'i'}

        records = cursor_to_list(Lookups.find(filter_by, projection))

        # fall back to an unrestricted query when the term matched nothing
        if not records and self.search_term:
            del filter_by["label"]
            records = cursor_to_list(Lookups.find(filter_by, projection))

        return records
Ejemplo n.º 2
0
    def get_all_records(self, sort_by='_id', sort_direction=-1, **kwargs):
        """Return sorted, non-deleted records, scoped to the profile when one is set.

        Extra keyword arguments are accepted for interface compatibility but unused.
        """
        criteria = dict(deleted=data_utils.get_not_deleted_flag())
        if self.profile_id:
            criteria["profile_id"] = self.profile_id

        cursor = self.get_collection_handle().find(criteria)
        return cursor_to_list(cursor.sort([[sort_by, sort_direction]]))
Ejemplo n.º 3
0
    def get_all_records_columns_server(self,
                                       sort_by='_id',
                                       sort_direction=-1,
                                       projection=None,
                                       filter_by=None,
                                       search_term=str(),
                                       limit=0,
                                       skip=0):
        """Retrieve records page-wise for server-side table rendering.

        :param sort_by: field name to sort on
        :param sort_direction: pymongo sort direction (1 ascending, -1 descending)
        :param projection: fields to return; None means all fields
        :param filter_by: extra query criteria; None means no extra criteria
        :param search_term: case-insensitive substring matched against 'name'
        :param limit: maximum number of records to return (0 = no limit)
        :param skip: number of records to skip (pagination offset)
        :return: list of matching records
        """
        # Fix for the mutable-default-argument pitfall: the previous
        # dict() defaults were shared across calls, so keys written below
        # ('deleted', 'name', 'profile_id') leaked between invocations.
        projection = dict() if projection is None else projection
        filter_by = dict() if filter_by is None else filter_by

        filter_by["deleted"] = data_utils.get_not_deleted_flag()

        # 'name' seems to be the only reasonable field to restrict searching; others fields are resolved
        filter_by["name"] = {'$regex': search_term, "$options": 'i'}

        if self.profile_id:
            filter_by["profile_id"] = self.profile_id

        cursor = self.get_collection_handle().find(filter_by, projection).sort(
            [[sort_by, sort_direction]])

        # apply skip only when requested, mirroring the original behaviour
        if skip > 0:
            cursor = cursor.skip(skip)

        return cursor_to_list(cursor.limit(limit))
Ejemplo n.º 4
0
 def get_all_records_columns(self,
                             sort_by='_id',
                             sort_direction=-1,
                             projection=None,
                             filter_by=None):
     """Return description records matching *filter_by*, sorted, with an
     optional projection.

     :param sort_by: field name to sort on
     :param sort_direction: pymongo sort direction (1 ascending, -1 descending)
     :param projection: fields to include; None means all fields
     :param filter_by: query criteria; None means match everything
     :return: list of matching records
     """
     # None defaults replace shared mutable dict() defaults, which a caller
     # could mutate and thereby affect later calls.
     projection = dict() if projection is None else projection
     filter_by = dict() if filter_by is None else filter_by

     return cursor_to_list(
         self.DescriptionCollection.find(filter_by, projection).sort(
             [[sort_by, sort_direction]]))
Ejemplo n.º 5
0
 def get_repos_for_group_info(self, uid, group_id):
     """Return the user's repositories, flagging those already in the group."""
     group = CopoGroup().get_record(ObjectId(group_id))
     repo_docs = cursor_to_list(Repository().Repository.find({'users.uid': uid}))
     # mark membership directly instead of branching per document
     for doc in repo_docs:
         doc['selected'] = doc['_id'] in group['repo_ids']
     return list(repo_docs)
Ejemplo n.º 6
0
 def get_profiles_for_group_info(self, group_id):
     """Return the current user's profiles, marking those shared with the group."""
     profiles = cursor_to_list(
         Profile().get_for_user(data_utils.get_user_id()))
     shared_ids = CopoGroup().get_record(ObjectId(group_id))['shared_profile_ids']
     for profile in profiles:
         profile['selected'] = profile['_id'] in shared_ids
     return profiles
Ejemplo n.º 7
0
    def get_all_records_columns(self,
                                sort_by='_id',
                                sort_direction=-1,
                                projection=None,
                                filter_by=None):
        """Return non-deleted records (scoped to the profile when set),
        sorted, with an optional projection.

        :param sort_by: field name to sort on
        :param sort_direction: pymongo sort direction (1 ascending, -1 descending)
        :param projection: fields to include; None means all fields
        :param filter_by: extra query criteria; None means none
        :return: list of matching records
        """
        # The previous dict() defaults were shared between calls, so the
        # 'deleted'/'profile_id' keys written below leaked across
        # invocations; None defaults give each call a fresh dict.
        projection = dict() if projection is None else projection
        filter_by = dict() if filter_by is None else filter_by

        filter_by["deleted"] = data_utils.get_not_deleted_flag()
        if self.profile_id:
            filter_by["profile_id"] = self.profile_id

        return cursor_to_list(self.get_collection_handle().find(
            filter_by, projection).sort([[sort_by, sort_direction]]))
Ejemplo n.º 8
0
    def set_targets_datafiles(self):
        """Map each description target's record id (as str) to its datafile document."""
        target_ids = [ObjectId(target["recordID"])
                      for target in self.description_targets]

        found = cursor_to_list(DataFile().get_collection_handle().find(
            {"_id": {"$in": target_ids}}))

        return {str(doc["_id"]): doc for doc in found}
Ejemplo n.º 9
0
 def get_terms_for_user_by_dataset(self, uid):
     """Group a user's annotation documents by the file they belong to."""
     pipeline = [
         {"$match": {"uid": uid}},
         # one output document per file_id, carrying the full annotations
         {"$group": {"_id": "$file_id", "annotations": {"$push": "$$ROOT"}}},
     ]
     cursor = self.get_collection_handle().aggregate(pipeline)
     return cursor_to_list(cursor)
Ejemplo n.º 10
0
    def get_elapsed_time_dataframe(self):
        """Return a DataFrame of description ids with days elapsed since creation."""
        ms_per_day = 1000 * 60 * 60 * 24
        pipeline = [{
            "$project": {
                "_id": 1,
                # (now - created_on) in milliseconds, scaled to days
                "diff_days": {
                    "$divide": [
                        {"$subtract": [data_utils.get_datetime(), "$created_on"]},
                        ms_per_day,
                    ]
                },
            }
        }]
        cursor = self.DescriptionCollection.aggregate(pipeline)
        return pd.DataFrame(cursor_to_list(cursor))
Ejemplo n.º 11
0
    def get_file_level_metadata_for_sheet(self, file_id, sheetname):
        """Return file-level annotations for one sheet of a file, ordered by column index."""
        pipeline = [
            {"$match": {"_id": ObjectId(file_id)}},
            # fan out the per-column annotation entries
            {"$unwind": "$file_level_annotation"},
            {"$match": {"file_level_annotation.sheet_name": sheetname}},
            {"$project": {"file_level_annotation": 1, "_id": 0}},
            {"$sort": {"file_level_annotation.column_idx": 1}},
        ]
        cursor = self.get_collection_handle().aggregate(pipeline)
        return cursor_to_list(cursor)
Ejemplo n.º 12
0
 def get_all_samples(self):
     """Fetch every sample belonging to the current profile."""
     return cursor_to_list(
         self.SampleCollection.find({'profile_id': self.profile_id}))
Ejemplo n.º 13
0
 def get_all_people(self):
     """Fetch all non-deleted person records for the current profile."""
     criteria = {'profile_id': self.profile_id, 'deleted': "0"}
     return cursor_to_list(self.PersonCollection.find(criteria))
Ejemplo n.º 14
0
    def get_sra_samples(self, submission_location=str()):
        """
        Retrieve study samples and present them in a format for building an
        SRA sample set.

        Side effects: updates the submission record with the datafile bundle,
        writes a datafiles.csv summary under *submission_location*, and
        appends to self.__converter_errors on problems.

        :param submission_location: directory where datafiles.csv is written
        :return: list of sample dicts (sample_id, name, taxon_id,
                 scientific_name, attributes); empty on error
        """

        sra_samples = list()

        # get datafiles
        datafiles = cursor_to_list(ghlper.get_datafiles_handle().find(
            {
                "description_token": self.description_token,
                'deleted': data_utils.get_not_deleted_flag()
            }, {
                '_id': 1,
                'file_location': 1,
                "description.attributes": 1,
                "name": 1,
                "file_hash": 1
            }))

        if not len(datafiles):
            self.__converter_errors.append("No datafiles found in submission!")
            return sra_samples

        # build the file bundle (ids + per-file metadata) for the submission
        df = pd.DataFrame(datafiles)
        df['file_id'] = df._id.astype(str)
        df['file_path'] = df['file_location'].fillna('')
        df['upload_status'] = False

        df = df[['file_id', 'file_path', 'upload_status']]
        bundle = list(df.file_id)
        bundle_meta = df.to_dict('records')

        submission_record = dict(bundle=bundle, bundle_meta=bundle_meta)

        # NOTE(review): pymongo's Collection.update() is deprecated in favour
        # of update_one() — confirm the driver version in use
        ghlper.get_submission_handle().update(
            {"_id": ObjectId(self.submission_id)}, {'$set': submission_record})

        samples_id = list()
        df_attributes = []  # datafiles attributes

        # flatten each datafile's description attributes into one dict per file
        for datafile in datafiles:
            datafile_attributes = [
                v for k, v in datafile.get("description", dict()).get(
                    "attributes", dict()).items()
            ]
            new_dict = dict()
            for d in datafile_attributes:
                new_dict.update(d)

            new_dict['datafile_id'] = str(datafile['_id'])
            new_dict['datafile_name'] = datafile.get('name', str())
            new_dict['datafile_hash'] = datafile.get('file_hash', str())
            new_dict['datafile_location'] = datafile.get(
                'file_location', str())

            df_attributes.append(new_dict)

        # process datafiles attributes
        df_attributes_df = pd.DataFrame(df_attributes)
        df_columns = df_attributes_df.columns

        # replace null values
        for k in df_columns:
            df_attributes_df[k].fillna('', inplace=True)

        # collect the ids of samples referenced by the datafiles;
        # 'study_samples' may be a list or a comma-separated string
        if 'study_samples' in df_columns:
            df_attributes_df['study_samples'] = df_attributes_df[
                'study_samples'].apply(lambda x: x[0] if isinstance(x, list)
                                       else x.split(",")[-1])
            samples_id = list(df_attributes_df['study_samples'].unique())
            samples_id = [x for x in samples_id if x]

        if not samples_id:
            self.__converter_errors.append(
                "No samples associated with datafiles!")
            return sra_samples

        # persist a CSV summary of the datafile attributes for the submission
        file_path = os.path.join(submission_location, "datafiles.csv")
        df_attributes_df.to_csv(path_or_buf=file_path, index=False)

        samples_id_object_list = [
            ObjectId(sample_id) for sample_id in samples_id
        ]

        sample_records = ghlper.get_samples_handle().find(
            {"_id": {
                "$in": samples_id_object_list
            }})

        # get sources
        sources = ghlper.get_sources_handle().find({
            "profile_id":
            self.profile_id,
            'deleted':
            data_utils.get_not_deleted_flag()
        })

        # index SRA-shaped source info by source id for lookup per sample
        sra_sources = dict()

        for source in sources:
            sra_source = dict()
            sra_sources[str(source["_id"])] = sra_source

            sra_source["name"] = source["name"]
            sra_source["taxon_id"] = source.get("organism", dict()).get(
                'termAccession', str())
            # strip the ontology prefix to leave the bare NCBI taxon number
            if 'NCBITaxon_' in sra_source["taxon_id"]:
                sra_source["taxon_id"] = sra_source["taxon_id"].split(
                    'NCBITaxon_')[-1]

            sra_source["scientific_name"] = source.get("organism", dict()).get(
                'annotationValue', str())
            sra_source['attributes'] = self.get_attributes(
                source.get("characteristics", list()))
            sra_source[
                'attributes'] = sra_source['attributes'] + self.get_attributes(
                    source.get("factorValues", list()))

        # assemble one SRA sample dict per stored sample, merging in the
        # taxon/name/attribute information from its source
        for sample in sample_records:
            sra_sample = dict()
            sra_sample['sample_id'] = str(sample['_id'])
            sra_sample['name'] = sample['name']
            sra_sample['attributes'] = self.get_attributes(
                sample.get("characteristics", list()))
            sra_sample[
                'attributes'] = sra_sample['attributes'] + self.get_attributes(
                    sample.get("factorValues", list()))

            # retrieve sample source
            source_id = sample.get("derivesFrom", list())
            source_id = source_id[0] if source_id else ''
            sample_source = sra_sources.get(source_id, dict())

            if sample_source:
                sra_sample['attributes'].append(
                    dict(tag="Source Name",
                         value=sample_source.get("name", str())))
            else:
                self.__converter_errors.append("Sample: " + sample['name'] +
                                               " has no source information")

            if sample_source.get("taxon_id", str()):
                sra_sample['taxon_id'] = sample_source.get("taxon_id", str())
            else:
                self.__converter_errors.append(
                    "Sample: " + sample['name'] +
                    " has no TAXON_ID. Please make sure an organism has "
                    "been set for the source of this sample from the NCBITAXON ontology."
                )

            if sample_source.get("scientific_name", str()):
                sra_sample['scientific_name'] = sample_source.get(
                    "scientific_name", str())
            else:
                self.__converter_errors.append(
                    "Sample: " + sample['name'] +
                    " has no SCIENTIFIC_NAME. Please make sure an organism has "
                    "been set for the source of this sample from an ontology.")

            if sample_source.get("attributes", list()):
                sra_sample['attributes'] = sra_sample[
                    'attributes'] + sample_source.get("attributes", list())

            sra_samples.append(sra_sample)

        return sra_samples
Ejemplo n.º 15
0
 def get_by_ids(self, uids):
     """Look up records whose ids appear in *uids*; empty input yields an empty list."""
     result = list()
     if uids:
         object_ids = [ObjectId(uid) for uid in uids]
         result = self.get_collection_handle().find({"_id": {"$in": object_ids}})
     return cursor_to_list(result)
Ejemplo n.º 16
0
    def get_allsamples(self):
        """
        lookup for all samples irrespective of sample type
        :return: list of dicts with keys accession, label, description,
                 server-side; empty when nothing matched
        """

        import web.apps.web_copo.templatetags.html_tags as htags

        df = pd.DataFrame()

        if self.accession:
            # accession may arrive as a comma-separated string; normalise to a list
            if isinstance(self.accession, str):
                self.accession = self.accession.split(",")

            object_ids = [ObjectId(x) for x in self.accession if x.strip()]
            records = cursor_to_list(Sample().get_collection_handle().find(
                {"_id": {
                    "$in": object_ids
                }}))

            if records:
                df = pd.DataFrame(records)
                df['accession'] = df._id.astype(str)
                df['label'] = df['name']
                # resolve each sample's display attributes via the html_tags helper
                df['desc'] = df['accession'].apply(
                    lambda x: htags.generate_attributes("sample", x))
                df['description'] = df['desc'].apply(
                    lambda x: self.format_description(x))
                df['server-side'] = True  # ...to request callback to server for resolving item description
        elif self.search_term:
            # search by name (case-insensitive substring)
            projection = dict(name=1)
            filter_by = dict()
            filter_by["name"] = {'$regex': self.search_term, "$options": 'i'}

            sort_by = 'name'
            sort_direction = -1

            records = Sample(
                profile_id=self.profile_id).get_all_records_columns(
                    filter_by=filter_by,
                    projection=projection,
                    sort_by=sort_by,
                    sort_direction=sort_direction)
            if not records:
                # try getting all records
                del filter_by['name']
                records = Sample(
                    profile_id=self.profile_id).get_all_records_columns(
                        filter_by=filter_by,
                        projection=projection,
                        sort_by=sort_by,
                        sort_direction=sort_direction)

            if records:
                df = pd.DataFrame(records)
                df['accession'] = df._id.astype(str)
                df['label'] = df['name']
                df['description'] = ''
                df['server-side'] = True  # ...to request callback to server for resolving item description

        result = list()

        if not df.empty:
            # keep only the columns the lookup consumer expects
            df = df[['accession', 'label', 'description', 'server-side']]
            result = df.to_dict('records')

        return result
Ejemplo n.º 17
0
    def cg_dependency_lookup(self):
        """
        lookup for cgcore dependent components
        :return: list of dicts with keys accession, label, description,
                 server-side; empty when nothing matched
        """
        import web.apps.web_copo.templatetags.html_tags as htags

        result = list()
        df = pd.DataFrame()
        # field used both as display label and as the search/sort field
        dependent_record_label = 'copo_name'

        if self.accession:
            # accession may arrive as a comma-separated string; normalise to a list
            if isinstance(self.accession, str):
                self.accession = self.accession.split(",")

            object_ids = [ObjectId(x) for x in self.accession if x.strip()]
            records = cursor_to_list(CGCore().get_collection_handle().find(
                {"_id": {
                    "$in": object_ids
                }}))
            result = list()

            if records:
                for record in records:
                    referenced_field = record.get("dependency_id", str())
                    kwargs = dict()
                    kwargs["referenced_field"] = referenced_field
                    schema = CGCore().get_component_schema(**kwargs)

                    label = record.get(dependent_record_label, str())

                    # modify schema before generating description
                    schema = [
                        x for x in schema
                        if 'dependency' in x and x['dependency'] ==
                        referenced_field and x.get("show_in_table", True)
                    ]
                    resolved = htags.resolve_display_data(schema, record)
                    description = self.format_description(resolved)

                    item_dict = dict(accession=str(record["_id"]),
                                     label=label,
                                     description=description)
                    item_dict[
                        'server-side'] = True  # ...to request callback to server for resolving item description

                    result.append(item_dict)
        elif self.search_term:
            # search dependent components of the referenced field by label
            referenced_field = self.referenced_field
            filter_name = dependent_record_label
            projection = {filter_name: 1}
            filter_by = dict(dependency_id=referenced_field)
            filter_by[filter_name] = {
                '$regex': self.search_term,
                "$options": 'i'
            }

            sort_by = filter_name
            sort_direction = -1

            records = CGCore(
                profile_id=self.profile_id).get_all_records_columns(
                    filter_by=filter_by,
                    projection=projection,
                    sort_by=sort_by,
                    sort_direction=sort_direction)

            if not records:
                # try getting all records
                del filter_by[filter_name]
                records = CGCore(
                    profile_id=self.profile_id).get_all_records_columns(
                        filter_by=filter_by,
                        projection=projection,
                        sort_by=sort_by,
                        sort_direction=sort_direction)
            if records:
                df = pd.DataFrame(records)
                df['accession'] = df._id.astype(str)
                df['label'] = df[filter_name]
                df['description'] = ''
                df['server-side'] = True  # ...to request callback to server for resolving item description

        # the search branch builds a DataFrame; flatten it to the result shape
        if not df.empty:
            df = df[['accession', 'label', 'description', 'server-side']]
            result = df.to_dict('records')

        return result
Ejemplo n.º 18
0
 def get_all_records(self):
     """Return all non-deleted remote-file records for this profile."""
     criteria = {
         'profile_id': self.profile_id,
         'deleted': data_utils.get_not_deleted_flag(),
     }
     return cursor_to_list(self.RemoteFileCollection.find(criteria))
Ejemplo n.º 19
0
def view_groups(request):
    """Render the group page with the user's profiles and owned groups."""
    context = {
        'request': request,
        'profile_list': cursor_to_list(Profile().get_for_user()),
        'group_list': cursor_to_list(CopoGroup().get_by_owner(request.user.id)),
    }
    return render(request, 'copo/copo_group.html', context)
Ejemplo n.º 20
0
 def get_by_datafile(self, datafile_id):
     """Return non-deleted remote-file records linked to the given datafile."""
     criteria = {
         'datafile_id': ObjectId(datafile_id),
         'deleted': data_utils.get_not_deleted_flag(),
     }
     return cursor_to_list(self.RemoteFileCollection.find(criteria))
Ejemplo n.º 21
0
    def execute_query(self, query_dict=None):
        """Run a find query, scoped to the current profile when one is set.

        :param query_dict: query criteria; None means match everything
        :return: list of matching records
        """
        # None default replaces the shared mutable dict() default, which
        # retained the 'profile_id' key between calls.
        query_dict = dict() if query_dict is None else query_dict

        if self.profile_id:
            query_dict["profile_id"] = self.profile_id

        return cursor_to_list(self.get_collection_handle().find(query_dict))
Ejemplo n.º 22
0
    def extract_repo_fields(self, datafile_id=str(), repo=str()):
        """
        given a datafile id, and repository type function returns a list of dictionaries of fields matching the repo
        :param datafile_id: id of the datafile whose description is mined
        :param repo: repository type name (case-insensitive), e.g. from REPO_TYPE_OPTIONS
        :return: list of dicts with keys dc (ref), copo_id, vals, prefix; empty on no match
        """

        from dal.copo_da import DataFile, CGCore
        from dal.copo_base_da import DataSchemas

        if not repo:  # no repository to filter by
            return list()

        # resolve the repo name to its configured option entry
        repo_type_option = lkup.DROP_DOWNS["REPO_TYPE_OPTIONS"]
        repo_type_option = [
            x for x in repo_type_option if x["value"].lower() == repo.lower()
        ]

        if not repo_type_option:
            return list()

        repo_type_option = repo_type_option[0]

        cg_schema = DataSchemas("COPO").get_ui_template_node('cgCore')

        # filter schema items by repo
        cg_schema = [
            x for x in cg_schema
            if x.get("target_repo", str()).strip() != str()
            and repo_type_option.get("abbreviation", str()) in
            [y.strip() for y in x.get("target_repo").split(',')]
        ]

        record = DataFile().get_record(datafile_id)
        description = record.get("description", dict())

        attributes = description.get("attributes", dict())
        stages = description.get("stages", list())

        # index the schema by the short (last dotted segment, lower-cased) id
        schema_df = pd.DataFrame(cg_schema)
        schema_df.id = schema_df.id.str.lower().str.split(".").str[-1]
        schema_df.index = schema_df.id
        schema_df = schema_df[['ref', 'id', 'prefix']]
        schema_df = schema_df[~schema_df['ref'].isna()]

        # get all stage items
        all_items = [item for st in stages for item in st.get("items", list())]

        # filter stage items - stage items should conform to specifications of the repo
        schema_ids = list(schema_df.id)
        items = {
            item.get("id", str()).lower().split(".")[-1]: st.get("ref",
                                                                 "").lower()
            for st in stages for item in st.get("items", list())
            if item.get("id", str()).lower().split(".")[-1] in schema_ids
        }

        # ...also, account for any filtering performed by client agents (e.g., dependencies in COPO Wizard),
        # within the context of the target repo
        schema_df = schema_df[schema_df.index.isin(items.keys())]

        # obtain attributes for filtered stage items
        target_stages = list(set(items.values()))
        datafile_attributes = [
            v for k, v in attributes.items() if k in target_stages
        ]

        # merge the per-stage attribute dicts into one flat mapping
        new_dict = dict()
        for d in datafile_attributes:
            new_dict.update(d)

        # align attribute values to schema rows by lower-cased id
        new_dict_series = pd.Series(new_dict)
        new_dict_series.index = new_dict_series.index.str.lower()
        schema_df['vals'] = new_dict_series
        schema_df['vals'] = schema_df['vals'].fillna('')

        schema_df = schema_df[['ref', 'id', 'vals', 'prefix']]

        # get composite attributes
        composite_attrib = [
            x for x in all_items if x["id"] in list(schema_df.id)
            and x.get("create_new_item", False)
        ]

        # expand composite attributes
        for cattrib in composite_attrib:
            # replace the composite row with one row per child schema item
            comp_series = schema_df.loc[cattrib["id"]]
            schema_df = schema_df[~schema_df.id.isin([cattrib["id"]])]
            children_schemas = [
                x for x in cg_schema if x.get("dependency", str()).lower() ==
                comp_series.ref.lower()
            ]

            # the composite's value holds the referenced record accessions
            accessions = comp_series.vals
            if isinstance(accessions, str):
                accessions = accessions.split(",")

            object_ids = [ObjectId(x) for x in accessions if x.strip()]

            records = list()
            if len(object_ids):
                records = cursor_to_list(CGCore().get_collection_handle().find(
                    {"_id": {
                        "$in": object_ids
                    }}))

            # collect each child's values across the referenced records
            attr_list = list()
            for child in children_schemas:
                child_dict = dict(ref=child["ref"],
                                  id=child["id"].split(".")[-1],
                                  prefix=child["prefix"],
                                  vals=[])
                attr_list.append(child_dict)
                for rec in records:
                    child_dict["vals"].append(rec.get(child_dict["id"], str()))

            if attr_list:
                attr_df = pd.DataFrame(attr_list)
                attr_df.index = attr_df.id
                schema_df = pd.concat([schema_df, attr_df])

        # rename to the output column names expected by callers
        schema_df.rename(index=str,
                         columns={
                             "ref": "dc",
                             "id": "copo_id"
                         },
                         inplace=True)

        dc_list = schema_df.to_dict('records')

        return dc_list
Ejemplo n.º 23
0
 def get_all_datafiles(self):
     """Return every non-deleted datafile under the current profile."""
     return cursor_to_list(self.DataFileCollection.find(
         {'profile_id': self.profile_id, 'deleted': "0"}))
Ejemplo n.º 24
0
 def get_by_datafile(self, datafile_id):
     """List remote-file records tied to *datafile_id* that are not deleted."""
     return cursor_to_list(self.RemoteFileCollection.find(
         {'datafile_id': ObjectId(datafile_id),
          'deleted': data_utils.get_not_deleted_flag()}))
Ejemplo n.º 25
0
 def get_all_descriptions(self):
     """Return every stored description document."""
     all_docs = self.DescriptionCollection.find()
     return cursor_to_list(all_docs)
Ejemplo n.º 26
0
 def get_all_publications(self):
     """Return non-deleted publications for the current profile."""
     criteria = {'profile_id': self.profile_id, 'deleted': "0"}
     return cursor_to_list(self.PubCollection.find(criteria))
Ejemplo n.º 27
0
    def execute_query(self, query_dict=None):
        """Execute a find over this collection, restricted to the active
        profile when one is set.

        :param query_dict: query criteria; None means match everything
        :return: list of matching records
        """
        # A fresh dict per call fixes the mutable-default-argument bug:
        # the old dict() default kept 'profile_id' from earlier calls.
        query_dict = dict() if query_dict is None else query_dict

        if self.profile_id:
            query_dict["profile_id"] = self.profile_id

        return cursor_to_list(self.get_collection_handle().find(query_dict))
Ejemplo n.º 28
0
    def get_all_records(self, sort_by='_id', sort_direction=-1):
        """Return sorted, non-deleted records, optionally scoped to a profile."""
        criteria = dict(deleted=data_utils.get_not_deleted_flag())
        if self.profile_id:
            criteria["profile_id"] = self.profile_id

        cursor = self.get_collection_handle().find(criteria)
        return cursor_to_list(cursor.sort([[sort_by, sort_direction]]))
Ejemplo n.º 29
0
 def get_terms_for_user_alphabetical(self, uid):
     """Return a user's terms sorted alphabetically by label."""
     cursor = self.get_collection_handle().find(
         {"uid": uid}).sort("label", pymongo.ASCENDING)
     return cursor_to_list(cursor)
Ejemplo n.º 30
0
 def get_all_records(self):
     """Return this profile's remote-file records that are not deleted."""
     return cursor_to_list(self.RemoteFileCollection.find(
         {'profile_id': self.profile_id,
          'deleted': data_utils.get_not_deleted_flag()}))
Ejemplo n.º 31
0
 def get_terms_for_user_ranked(self, uid):
     """Return a user's terms ordered by descending usage count."""
     ranked = self.get_collection_handle().find(
         {"uid": uid}).sort("count", pymongo.DESCENDING)
     return cursor_to_list(ranked)
Ejemplo n.º 32
0
 def get_all_descriptions(self):
     """Fetch the full set of description documents."""
     descriptions = self.DescriptionCollection.find()
     return cursor_to_list(descriptions)
Ejemplo n.º 33
0
    def perform_datafile_pairing(self, next_stage_index):
        """
        stage callback function: determines if the pairing of datafiles should be performed given the 'library_layout'
        :param next_stage_index: index of the stage being entered
        :return: False when no pairing applies (pairing info is cleared);
                 otherwise the stage dict, augmented with data/message or error
        """

        description = Description().GET(self.__wzh.description_token)
        stages = description["stages"]
        attributes = description["attributes"]
        meta = description.get("meta", dict())

        # validate stage
        stage = dict()

        if next_stage_index < len(stages):
            stage = stages[next_stage_index]

        # first, target repository
        relevant_repos = [
            "ena"
        ]  # add a repo to this list if it requires datafile pairing

        target_repository = attributes.get("target_repository",
                                           dict()).get("deposition_context",
                                                       str())

        if target_repository not in relevant_repos:
            # no items to pair, clear any previous pairing information
            self.remove_pairing_info(stage["ref"], attributes, meta)

            return False

        # get records in bundle
        records = cursor_to_list(DataFile().get_collection_handle().find(
            {
                "$and": [{
                    "description_token": self.__wzh.description_token,
                    'deleted': d_utils.get_not_deleted_flag()
                }, {
                    'description.attributes': {
                        "$exists": True
                    }
                }]
            }, {
                'description.attributes': 1,
                'name': 1
            }))

        if not records:
            # no items to pair, clear any previous pairing information
            self.remove_pairing_info(stage["ref"], attributes, meta)

            return False

        # flatten each record's description attributes and pull out the
        # library layout (pairing indicator)
        for rec in records:
            datafile_attributes = [
                v for k, v in rec['description'].get('attributes',
                                                     dict()).items()
            ]

            new_dict = dict()
            for d in datafile_attributes:
                new_dict.update(d)

            rec['attributes'] = new_dict
            rec['pairing'] = rec['attributes'].get('library_layout',
                                                   '').upper()

        # index the records by their (stringified) id
        df = pd.DataFrame(records)
        df._id = df['_id'].astype(str)
        df.index = df._id

        # only PAIRED-layout datafiles take part in pairing
        df = df[df['pairing'] == 'PAIRED']

        if not len(df):
            # no items to pair, clear any previous pairing information
            self.remove_pairing_info(stage["ref"], attributes, meta)

            return False

        # remove extraneous columns
        df = df.drop(columns=['description'])

        if not len(df) % 2 == 0:
            stage["error"] = "Pairing requires even number of datafiles!"
            stage["refresh_wizard"] = True
        else:
            # get previously pairing candidates
            paired_candidates_old = meta.get(
                stage["ref"] + "_paired_candidates", list())
            paired_candidates = list(df.index)

            paired_candidates_old.sort()
            paired_candidates.sort()

            # candidate set changed since last time -> wizard must refresh
            if not paired_candidates_old == paired_candidates:
                stage["refresh_wizard"] = True

            # if there's a valid stored map, use it
            stage_data = list()
            saved_copy = attributes.get(stage["ref"], list())

            if saved_copy:
                stored_pairs_df = pd.DataFrame(saved_copy)
                stored_pairs_list = list(stored_pairs_df._id) + list(
                    stored_pairs_df._id2)
                stored_pairs_list.sort()

                # the stored map is valid only if it covers exactly the
                # current candidate ids
                if stored_pairs_list == paired_candidates:
                    df_dict = df.to_dict()
                    df_dict = df_dict["name"]

                    stored_pairs_df["name"] = stored_pairs_df['_id'].apply(
                        lambda x: str(df_dict[x]))
                    stored_pairs_df["name2"] = stored_pairs_df['_id2'].apply(
                        lambda x: str(df_dict[x]))

                    df_result = stored_pairs_df[['name', 'name2']]
                    df_result.columns = ['file1', 'file2']

                    stage_data = df_result.to_dict('records')

            if not stage_data:
                # define fresh pairing map

                # sort by file name to reflect pairing
                df = df.sort_values(by=['name'])

                # alternate rows form the two halves of each pair
                s_even = df._id.iloc[1::2]
                s_odd = df._id.iloc[::2]
                df_odd = df[df.index.isin(s_odd)].copy()
                df_even = df[df.index.isin(s_even)].copy()
                df_even['_id2'] = df_even['_id']
                df_even['name2'] = df_even['name']
                df_even = df_even[['_id2', 'name2']]
                df_odd = df_odd[['_id', 'name']]
                df_odd.index = range(0, len(df_odd))
                df_even.index = range(0, len(df_even))
                df_result = pd.concat([df_odd, df_even],
                                      axis=1).reindex(df_odd.index)
                saved_copy = df_result[['_id', '_id2']].to_dict('records')
                df_result = df_result[['name', 'name2']]
                df_result.columns = ['file1', 'file2']

                stage_data = df_result.to_dict('records')

            stage["data"] = stage_data

            # save state
            attributes[stage["ref"]] = saved_copy
            meta[stage["ref"] + "_paired_candidates"] = paired_candidates

            save_dict = dict(attributes=attributes, meta=meta)
            Description().edit_description(self.__wzh.description_token,
                                           save_dict)

            stage["message"] = self.__wzh.wiz_message[
                "datafiles_pairing_message"]["text"]

        return stage