Example #1
    def inherit_metadata(self, reference_target_id):
        """
        Using the reference target as the basis, copy metadata across to the description targets.
        :param reference_target_id: id of the reference datafile record whose metadata is copied
        :return:
        """

        reference_description = DataFile().get_record(reference_target_id).get("description", dict())

        reference_attributes = reference_description.get("attributes", dict())
        reference_stages = reference_description.get("stages", list())

        for target in self.description_targets:
            # 'focus' on target
            self.set_datafile_id(target["recordID"])

            # use batch stages to update targets
            self.update_datafile_stage(reference_stages)

            # copy attributes from the reference that the target does not already have
            for k, v in reference_attributes.items():
                if k not in self.get_datafile_attributes():
                    self.update_datafile_attributes({'ref': k, 'data': v})

        self.update_targets_datafiles()
        return
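A minimal, self-contained sketch of the merge rule applied above: a target only inherits the attributes it does not already define, and its existing values are never overwritten. The dicts below are hypothetical stand-ins for the DataFile records.

# hypothetical reference and target attribute dicts (stand-ins for DataFile records)
reference_attributes = {"study_type": "genomics", "organism": "wheat"}
target_attributes = {"organism": "barley"}

# inherit only the keys the target lacks; existing target values win
for k, v in reference_attributes.items():
    if k not in target_attributes:
        target_attributes[k] = v

print(target_attributes)  # {'organism': 'barley', 'study_type': 'genomics'}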
Example #2
    def submit(self, sub_id, dataFile_ids):

        # physically transfer files
        path2library = os.path.join(BASE_DIR, REPOSITORIES['ASPERA']['resource_path'])

        # change these to be collected properly
        user_name = REPOSITORIES['ASPERA']['user_token']
        password = REPOSITORIES['ASPERA']['password']

        # create transfer record
        transfer_token = RemoteDataFile().create_transfer(sub_id)['_id']
        self.submission = Submission().get_record(sub_id)

        self.profile = Profile().get_record(self.submission['profile_id'])
        remote_path = d_utils.get_ena_remote_path(sub_id)

        # get each file in the bundle
        file_path = []
        for f_id in dataFile_ids:
            mongo_file = DataFile().get_record(f_id)
            self.d_files.append(mongo_file)
            file_path.append(mongo_file.get("file_location", str()))

        self._do_aspera_transfer(transfer_token=transfer_token,
                                 user_name=user_name,
                                 password=password,
                                 remote_path=remote_path,
                                 file_path=file_path,
                                 path2library=path2library,
                                 sub_id=sub_id)
Example #3
def hash_upload(request):
    # utility method to create an md5 hash of a given file path
    # open uploaded file
    file_id = request.GET['file_id']
    print('hash started ' + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)
    file_name = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)

    # now hash opened file
    md5 = hashlib.md5()
    with open(file_name, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)

    file_obj.hash = md5.hexdigest()
    file_obj.save()

    output_dict = {'output_hash': md5.hexdigest(), 'file_id': file_id}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)
    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_hash")] = file_obj.hash

    profile_id = request.session['profile_id']
    component = "datafile"

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component=component,
             auto_fields=auto_fields).do_save_edit()

    out = jsonpickle.encode(output_dict)
    print('hash complete ' + file_id)
    return HttpResponse(out, content_type='application/json')
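For reference, the chunked-hash pattern used above works on any local file without loading it fully into memory. A standalone sketch (the example path is a placeholder):

import hashlib

def md5_of_file(path, chunk_size=8192):
    # hash the file in fixed-size chunks so large uploads never sit fully in memory
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()

# e.g. print(md5_of_file('example.fastq'))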
Example #4
    def do_sanitise_submissions(self):
        # backfill 'bundle_meta' on submission records that lack it

        records = self.da_object.get_all_records()

        for submission in records:
            if "bundle_meta" not in submission:
                bundle_meta = list()

                for file_id in submission.get("bundle", list()):
                    datafile = DataFile().get_record(file_id)
                    if datafile:
                        upload_status = False

                        if str(submission.get("complete",
                                              False)).lower() == 'true':
                            upload_status = True
                        bundle_meta.append(
                            dict(file_id=file_id,
                                 file_path=datafile.get(
                                     "file_location", str()),
                                 upload_status=upload_status))
                submission["bundle_meta"] = bundle_meta
                submission['target_id'] = str(submission.pop('_id'))
                self.da_object.save_record(dict(), **submission)

        self.context["sanitise_status"] = True

        return self.context
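A stripped-down illustration of the backfill rule above, with hypothetical dicts standing in for the Mongo submission and datafile records:

# hypothetical records standing in for the Mongo documents
submission = {"complete": "true", "bundle": ["f1", "f2"]}
datafiles = {
    "f1": {"file_location": "/data/f1.fastq"},
    "f2": {"file_location": "/data/f2.fastq"},
}

# a bundle's files are marked uploaded only when the submission itself is complete
upload_status = str(submission.get("complete", False)).lower() == "true"

submission["bundle_meta"] = [
    dict(file_id=file_id,
         file_path=datafiles[file_id].get("file_location", ""),
         upload_status=upload_status)
    for file_id in submission.get("bundle", [])
    if file_id in datafiles
]

print(submission["bundle_meta"])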
Example #5
def hash_upload(request):
    # utility method to create an md5 hash of a given file path
    # open uploaded file
    file_id = request.GET['file_id']
    print('hash started ' + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)
    file_name = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)

    # now hash opened file
    md5 = hashlib.md5()
    with open(file_name, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)

    file_obj.hash = md5.hexdigest()
    file_obj.save()

    output_dict = {'output_hash': md5.hexdigest(), 'file_id': file_id}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)
    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_hash")] = file_obj.hash

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component="datafile",
             auto_fields=auto_fields
             ).do_save_edit()

    out = jsonpickle.encode(output_dict)
    print('hash complete ' + file_id)
    return HttpResponse(out, content_type='application/json')
Example #6
    def do_description_summary(self):
        record = DataFile().get_record(self.param_dict.get("target_id"))
        self.context['description'] = htags.resolve_description_data(
            record.get("description", dict()), dict())

        description_token = record.get('description_token', str())
        self.context['description']['description_record'] = dict()

        if description_token:
            description_record = Description().GET(description_token)
            if description_record:
                if not description_record["name"]:
                    description_record["name"] = "N/A"
                self.context['description']['description_record'] = dict(
                    name=description_record["name"],
                    id=str(description_record["_id"]))

        return self.context
Example #7
def zip_file(request):
    # need to get a reference to the file to zip
    file_id = request.GET['file_id']
    print("zip started " + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)

    # get the name of the file to zip and change its suffix to .gz
    output_file_location = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)
    output_file_name = file_obj.filename + '.gz'
    # open a temporary gzip archive...set a low compression level for speed
    temp_name = os.path.join(settings.MEDIA_ROOT, str(uuid.uuid4()) + '.tmp')

    # write the input file to the gzip archive in n byte chunks; the source is opened
    # in binary mode so non-text files compress correctly, and the context managers
    # close both handles even if an error occurs part-way through
    n = 100000000
    with gzip.open(temp_name, 'wb', compresslevel=1) as myzip, \
            open(output_file_location, 'rb') as src:
        for chunk in iter(lambda: src.read(n), b''):
            myzip.write(chunk)

    print('zip complete ' + file_id)
    # now need to delete the old file and update the file record with the new file
    new_file_name = output_file_location + '.gz'
    os.rename(temp_name, new_file_name)
    os.remove(output_file_location)

    # calculate new file size
    stats = os.stat(new_file_name)
    new_file_size = stats.st_size / 1000 / 1000

    # update filename
    file_obj.filename = output_file_name
    file_obj.file.name = new_file_name

    # update file size
    file_obj.offset = stats.st_size
    file_obj.save()

    out = {'zipped': True, 'file_name': output_file_name, 'file_size': new_file_size}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)
    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_size")] = u.filesize_toString(file_obj.offset)
    auto_fields[DataFile().get_qualified_field("name")] = output_file_name
    auto_fields[DataFile().get_qualified_field("file_location")] = new_file_name

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component="datafile",
             auto_fields=auto_fields
             ).do_save_edit()

    out = jsonpickle.encode(out)
    return HttpResponse(out, content_type='text/plain')
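As a standalone illustration of the streaming-compression step above, this sketch gzips an arbitrary file in fixed-size binary chunks; the paths and chunk size are placeholders.

import gzip

def gzip_in_chunks(src_path, dest_path, chunk_size=100000000):
    # stream the source into a gzip archive without loading it all into memory
    with open(src_path, 'rb') as src, \
            gzip.open(dest_path, 'wb', compresslevel=1) as dest:
        for chunk in iter(lambda: src.read(chunk_size), b''):
            dest.write(chunk)

# e.g. gzip_in_chunks('upload.fastq', 'upload.fastq.gz')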
Example #8
    def submit(self, sub_id, dataFile_ids):
        submission_record = Submission().get_record(sub_id)

        # bundle_meta, if present, should provide a better picture of what datafiles need to be uploaded
        if "bundle_meta" in submission_record:
            pending_files = [
                x["file_id"] for x in submission_record['bundle_meta']
                if not x["upload_status"]
            ]
            dataFile_ids = pending_files

        # physically transfer files
        path2library = os.path.join(BASE_DIR,
                                    REPOSITORIES['ASPERA']['resource_path'])

        # change these to be collected properly
        user_name = REPOSITORIES['ASPERA']['user_token']
        password = REPOSITORIES['ASPERA']['password']

        # create transfer record
        transfer_token = RemoteDataFile().create_transfer(sub_id)['_id']
        self.submission = Submission().get_record(sub_id)

        self.profile = Profile().get_record(self.submission['profile_id'])
        remote_path = d_utils.get_ena_remote_path(sub_id)

        # get each file in the bundle
        file_path = []
        for f_id in dataFile_ids:
            mongo_file = DataFile().get_record(ObjectId(f_id))
            self.d_files.append(mongo_file)
            file_path.append(mongo_file.get("file_location", str()))

        case = self._do_aspera_transfer(transfer_token=transfer_token,
                                        user_name=user_name,
                                        password=password,
                                        remote_path=remote_path,
                                        file_path=file_path,
                                        path2library=path2library,
                                        sub_id=sub_id)
        return case
Example #9
    def resolve_deposition_context(self):
        """
        Return an inferred deposition destination for a datafile.
        We assume here that the file's target destination can be inferred from its type.
        :return: string destination
        """

        # get file details
        datafile = DataFile().get_record(self.datafile_id)
        ft = datafile.get("file_type", "unknown")

        if ft == '':
            ft = 'unknown'

        deposition_context = 'default'

        # match against documented destinations
        for k, v in lkup.REPO_FILE_EXTENSIONS.items():
            if ft in v:
                deposition_context = k
                break

        return deposition_context
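The lookup above keys each documented repository to the file types it accepts. A minimal sketch of that pattern, with a hypothetical stand-in for lkup.REPO_FILE_EXTENSIONS:

# hypothetical mapping of repository name to accepted file types
REPO_FILE_EXTENSIONS = {
    "ena": ["fastq", "bam", "cram"],
    "figshare": ["csv", "pdf", "png"],
}

def resolve_context(file_type, default="default"):
    # return the first repository whose accepted types include this file type
    for repo, extensions in REPO_FILE_EXTENSIONS.items():
        if file_type in extensions:
            return repo
    return default

# e.g. resolve_context('bam') -> 'ena'; resolve_context('xyz') -> 'default'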
Example #10
    def extract_repo_fields(self, datafile_id=str(), repo=str()):
        """
        Given a datafile id and a repository type, return a list of dictionaries of fields matching the repo.
        :param datafile_id: id of the datafile record to extract fields from
        :param repo: repository type to filter schema items by
        :return: list of dictionaries of matched fields
        """

        from dal.copo_da import DataFile, CGCore
        from dal.copo_base_da import DataSchemas

        if not repo:  # no repository to filter by
            return list()

        repo_type_option = lkup.DROP_DOWNS["REPO_TYPE_OPTIONS"]
        repo_type_option = [
            x for x in repo_type_option if x["value"].lower() == repo.lower()
        ]

        if not repo_type_option:
            return list()

        repo_type_option = repo_type_option[0]

        cg_schema = DataSchemas("COPO").get_ui_template_node('cgCore')

        # filter schema items by repo
        cg_schema = [
            x for x in cg_schema
            if x.get("target_repo", str()).strip() != str()
            and repo_type_option.get("abbreviation", str()) in
            [y.strip() for y in x.get("target_repo").split(',')]
        ]

        record = DataFile().get_record(datafile_id)
        description = record.get("description", dict())

        attributes = description.get("attributes", dict())
        stages = description.get("stages", list())

        schema_df = pd.DataFrame(cg_schema)
        schema_df.id = schema_df.id.str.lower().str.split(".").str[-1]
        schema_df.index = schema_df.id
        schema_df = schema_df[['ref', 'id', 'prefix']]
        schema_df = schema_df[~schema_df['ref'].isna()]

        # get all stage items
        all_items = [item for st in stages for item in st.get("items", list())]

        # filter stage items - stage items should conform to specifications of the repo
        schema_ids = list(schema_df.id)
        items = {
            item.get("id", str()).lower().split(".")[-1]: st.get("ref",
                                                                 "").lower()
            for st in stages for item in st.get("items", list())
            if item.get("id", str()).lower().split(".")[-1] in schema_ids
        }

        # ...also, account for any filtering performed by client agents (e.g., dependencies in COPO Wizard),
        # within the context of the target repo
        schema_df = schema_df[schema_df.index.isin(items.keys())]

        # obtain attributes for filtered stage items
        target_stages = list(set(items.values()))
        datafile_attributes = [
            v for k, v in attributes.items() if k in target_stages
        ]

        new_dict = dict()
        for d in datafile_attributes:
            new_dict.update(d)

        new_dict_series = pd.Series(new_dict)
        new_dict_series.index = new_dict_series.index.str.lower()
        schema_df['vals'] = new_dict_series
        schema_df['vals'] = schema_df['vals'].fillna('')

        schema_df = schema_df[['ref', 'id', 'vals', 'prefix']]

        # get composite attributes
        composite_attrib = [
            x for x in all_items if x["id"] in list(schema_df.id)
            and x.get("create_new_item", False)
        ]

        # expand composite attributes
        for cattrib in composite_attrib:
            comp_series = schema_df.loc[cattrib["id"]]
            schema_df = schema_df[~schema_df.id.isin([cattrib["id"]])]
            children_schemas = [
                x for x in cg_schema if x.get("dependency", str()).lower() ==
                comp_series.ref.lower()
            ]

            accessions = comp_series.vals
            if isinstance(accessions, str):
                accessions = accessions.split(",")

            object_ids = [ObjectId(x) for x in accessions if x.strip()]

            records = list()
            if object_ids:
                records = cursor_to_list(CGCore().get_collection_handle().find(
                    {"_id": {
                        "$in": object_ids
                    }}))

            attr_list = list()
            for child in children_schemas:
                child_dict = dict(ref=child["ref"],
                                  id=child["id"].split(".")[-1],
                                  prefix=child["prefix"],
                                  vals=[])
                attr_list.append(child_dict)
                for rec in records:
                    child_dict["vals"].append(rec.get(child_dict["id"], str()))

            if attr_list:
                attr_df = pd.DataFrame(attr_list)
                attr_df.index = attr_df.id
                schema_df = pd.concat([schema_df, attr_df])

        schema_df.rename(index=str,
                         columns={
                             "ref": "dc",
                             "id": "copo_id"
                         },
                         inplace=True)

        dc_list = schema_df.to_dict('records')

        return dc_list
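The pandas steps above hinge on index alignment: attribute values keyed by field id are dropped onto the schema DataFrame wherever the index ids match. A minimal sketch under hypothetical schema and attribute data:

import pandas as pd

# hypothetical schema items (stand-ins for the cgCore schema entries)
cg_schema = [
    {"ref": "dc.title", "id": "copo.datafile.title", "prefix": "dc"},
    {"ref": "dc.creator", "id": "copo.datafile.creator", "prefix": "dc"},
]

# hypothetical attribute values keyed by the short, lower-cased field id
attributes = {"title": "Wheat RNA-Seq run 1"}

schema_df = pd.DataFrame(cg_schema)
# reduce dotted ids to their short, lower-cased form and use them as the index
schema_df.id = schema_df.id.str.lower().str.split(".").str[-1]
schema_df.index = schema_df.id

# index alignment fills values where ids match and leaves NaN elsewhere
schema_df["vals"] = pd.Series(attributes)
schema_df["vals"] = schema_df["vals"].fillna("")

print(schema_df[["ref", "id", "vals"]].to_dict("records"))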
Example #11
def zip_file(request):
    # need to get a reference to the file to zip
    file_id = request.GET['file_id']
    print("zip started " + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)

    # get the name of the file to zip and change its suffix to .gz
    output_file_location = os.path.join(settings.MEDIA_ROOT,
                                        file_obj.file.name)
    output_file_name = file_obj.filename + '.gz'
    # open a temporary gzip archive...set a low compression level for speed
    temp_name = os.path.join(settings.MEDIA_ROOT, str(uuid.uuid4()) + '.tmp')

    # write the input file to the gzip archive in n byte chunks; the source is opened
    # in binary mode so non-text files compress correctly, and the context managers
    # close both handles even if an error occurs part-way through
    n = 100000000
    with gzip.open(temp_name, 'wb', compresslevel=1) as myzip, \
            open(output_file_location, 'rb') as src:
        for chunk in iter(lambda: src.read(n), b''):
            myzip.write(chunk)

    print('zip complete ' + file_id)
    # now need to delete the old file and update the file record with the new file
    new_file_name = output_file_location + '.gz'
    os.rename(temp_name, new_file_name)
    os.remove(output_file_location)

    # calculate new file size
    stats = os.stat(new_file_name)
    new_file_size = stats.st_size / 1000 / 1000

    # update filename
    file_obj.filename = output_file_name
    file_obj.file.name = new_file_name

    # update file size
    file_obj.offset = stats.st_size
    file_obj.save()

    out = {
        'zipped': True,
        'file_name': output_file_name,
        'file_size': new_file_size
    }

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)
    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field(
        "file_size")] = u.filesize_toString(file_obj.offset)
    auto_fields[DataFile().get_qualified_field("name")] = output_file_name
    auto_fields[DataFile().get_qualified_field(
        "file_location")] = new_file_name

    profile_id = request.session['profile_id']
    component = "datafile"

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component=component,
             auto_fields=auto_fields).do_save_edit()

    out = jsonpickle.encode(out)
    return HttpResponse(out, content_type='application/json')