Example 1
    def inherit_metadata(self, reference_target_id):
        """
        using reference_target as the basis, copy metadata across to description targets
        :param reference_target_id:
        :return:
        """

        reference_description = DataFile().get_record(reference_target_id).get("description", dict())

        reference_attributes = reference_description.get("attributes", dict())
        reference_stages = reference_description.get("stages", list())

        for target in self.description_targets:
            # 'focus' on target
            self.set_datafile_id(target["recordID"])

            # use batch stages to update targets
            self.update_datafile_stage(reference_stages)

            # find and add any missing attributes from the reference
            for k, v in reference_attributes.items():
                if k not in self.get_datafile_attributes():
                    self.update_datafile_attributes({'ref': k, 'data': v})

        self.update_targets_datafiles()
        return
Example 2
    def get_datafiles_rating(self):
        """
        function handles the evaluation of metadata rating for datafiles
        :return: dictionary of datafiles with associated metadata rating
        """

        datafiles_rating = list()

        for df_id in self.item_ids:
            default_rating = \
                d_utils.json_to_pytype(lkup.METADATA_RATING_TEMPLATE_LKUPS["rating_template"])["properties"][-1]
            item_rating = dict()
            item_rating["rating_level"] = default_rating["rating_level"]
            item_rating["rating_level_description"] = default_rating[
                "rating_level_description"]

            d_r = dict(item_id=df_id, item_rating=item_rating)

            attributes = DataFile().get_record_property(
                df_id, "description_attributes")
            deposition_context = DataFile().get_record_property(
                df_id, "target_repository")

            if deposition_context:
                d_r["item_rating"] = self.rate_metadata(
                    attributes, deposition_context)

            datafiles_rating.append(d_r)

        return datafiles_rating
Example 3
def hash_upload(request):
    # utility method to create an md5 hash of a given file path
    # open uploaded file
    file_id = request.GET['file_id']
    print('hash started ' + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)
    file_name = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)

    # now hash opened file
    md5 = hashlib.md5()
    with open(file_name, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)

    file_obj.hash = md5.hexdigest()
    file_obj.save()

    output_dict = {'output_hash': md5.hexdigest(), 'file_id': file_id}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)
    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_hash")] = file_obj.hash

    profile_id = request.session['profile_id']
    component = "datafile"

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component=component,
             auto_fields=auto_fields).do_save_edit()

    out = jsonpickle.encode(output_dict)
    print('hash complete ' + file_id)
    return HttpResponse(out, content_type='json')
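The chunked-hash pattern above (feeding the file to hashlib in fixed-size pieces rather than reading it whole) is the part worth reusing. A minimal standalone sketch of that pattern, independent of Django and of COPO's DataFile model (the helper name and chunk size are illustrative):

import hashlib


def md5_of_file(path, chunk_size=8192):
    # hash the file in chunks so large uploads never have to fit in memory
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()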
Example 4
def get_datafiles_json(target_id=None):
    """
    returns all datafile records
    :return:
    """
    from dal.copo_da import DataFile
    profile_id = get_current_request().session['profile_id']

    if target_id:
        datafiles = list()
        datafiles.append(DataFile().get_record(target_id))
    else:
        datafiles = DataFile(profile_id).get_all_records()

    value_field = str("id")
    label_field = str("datafile_name")
    search_field = ["id", "datafile_name"]
    secondary_label_field = ["meta_datafile_name"]

    elem_json = dict(value_field=value_field,
                     label_field=label_field,
                     secondary_label_field=secondary_label_field,
                     search_field=search_field,
                     options=list())

    for sd in datafiles:
        elem_json.get("options").append({
            value_field: str(sd["_id"]),
            label_field: sd["name"],
            secondary_label_field[0]: sd["name"]
        })

    return elem_json
Example 5
def generate_copo_datafiles_data(profile_id, data_file=None):
    d = DataFile(profile_id)

    # branch out, if a single record is provided
    if data_file:
        chunked_upload = ChunkedUpload.objects.get(id=int(data_file["file_id"]))
        row = [chunked_upload.filename, str(data_file["_id"])]
        return {"row_data": row, "table_id": "datafile_table"}

    datafiles = d.get_all_datafiles()

    # headers
    columns = [{"title": "File"}, {"title": " "}]
    dataSet = []

    # data
    for df in datafiles:
        # get details of the file from the file object
        chunked_upload = ChunkedUpload.objects.get(id=int(df["file_id"]))
        row = [chunked_upload.filename, str(df["_id"])]
        dataSet.append(row)

    # define action buttons for the table. ALWAYS include the class 'copo-dt' in className!!!
    action_buttons = [
        {'text': 'Describe', 'className': 'copo-dt btn btn-primary', 'iconClass': 'fa fa-tags'},
        {'text': 'Delete', 'className': 'copo-dt btn btn-danger', 'iconClass': 'fa fa-trash-o'}
    ]
    return {"columns": columns, "dataSet": dataSet, "table_id": "datafile_table", "action_buttons": action_buttons}
Example 6
def hash_upload(request):
    # utility method to create an md5 hash of a given file path
    # open uploaded file
    file_id = request.GET['file_id']
    print('hash started ' + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)
    file_name = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)

    # now hash opened file
    md5 = hashlib.md5()
    with open(file_name, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)

    file_obj.hash = md5.hexdigest()
    file_obj.save()

    output_dict = {'output_hash': md5.hexdigest(), 'file_id': file_id}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)
    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_hash")] = file_obj.hash

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component="datafile",
             auto_fields=auto_fields
             ).do_save_edit()

    out = jsonpickle.encode(output_dict)
    print('hash complete ' + file_id)
    return HttpResponse(out, content_type='json')
Example 7
    def submit(self, sub_id, dataFile_ids):

        # physically transfer files
        path2library = os.path.join(BASE_DIR, REPOSITORIES['ASPERA']['resource_path'])

        # change these to be collected properly
        user_name = REPOSITORIES['ASPERA']['user_token']
        password = REPOSITORIES['ASPERA']['password']

        # create transfer record
        transfer_token = RemoteDataFile().create_transfer(sub_id)['_id']
        self.submission = Submission().get_record(sub_id)

        self.profile = Profile().get_record(self.submission['profile_id'])
        remote_path = d_utils.get_ena_remote_path(sub_id)

        # get each file in the bundle
        file_path = []
        for idx, f_id in enumerate(dataFile_ids):
            mongo_file = DataFile().get_record(f_id)
            self.d_files.append(mongo_file)
            file_path.append(mongo_file.get("file_location", str()))

        self._do_aspera_transfer(transfer_token=transfer_token,
                                 user_name=user_name,
                                 password=password,
                                 remote_path=remote_path,
                                 file_path=file_path,
                                 path2library=path2library,
                                 sub_id=sub_id)
Example 8
def save_stage_data(auto_fields):
    d = DataFile()
    datafile_id = auto_fields["datafile"]
    current_stage = auto_fields["current_stage"]

    description_stages = d.GET(datafile_id)["description"]["stages"]

    stage = [elem for elem in description_stages if elem["ref"] == current_stage]

    # get schema for resolving ontology terms
    onto_schema = d_utils.json_to_pytype(lkup.DB_TEMPLATES["ONTOLOGY_ANNOTATION"])

    data = {}
    if stage:
        stage_items = stage[0]["items"]
        if stage_items:
            for sti in stage_items:
                # handle ontology term
                if sti["control"].lower() == "ontology term":
                    a = {}
                    for k in onto_schema["properties"]:
                        if sti["id"] + "." + k in auto_fields.keys():
                            a[k] = auto_fields[sti["id"] + "." + k]
                    data[sti["id"]] = a
                else:
                    data[sti["id"]] = auto_fields[sti["id"]]

    d.save_description_stage(datafile_id, {"ref": current_stage, "data": data})
Example 9
    def do_sanitise_submissions(self):

        records = self.da_object.get_all_records()

        for submission in records:
            if "bundle_meta" not in submission:
                bundle_meta = list()

                for file_id in submission.get("bundle", list()):
                    datafile = DataFile().get_record(file_id)
                    if datafile:
                        upload_status = False

                        if str(submission.get("complete",
                                              False)).lower() == 'true':
                            upload_status = True
                        bundle_meta.append(
                            dict(file_id=file_id,
                                 file_path=datafile.get(
                                     "file_location", str()),
                                 upload_status=upload_status))
                submission["bundle_meta"] = bundle_meta
                submission['target_id'] = str(submission.pop('_id'))
                self.da_object.save_record(dict(), **submission)

        self.context["sanitise_status"] = True

        return self.context
Example 10
def zip_file(request):
    # need to get a reference to the file to zip
    file_id = request.GET['file_id']
    print("zip started " + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)

    # get the name of the file to zip and change its suffix to .gz
    output_file_location = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)
    output_file_name = file_obj.filename + '.gz'
    # open the file as a gzip archive...set compression level
    temp_name = os.path.join(settings.MEDIA_ROOT, str(uuid.uuid4()) + '.tmp')
    # read the source in binary mode so arbitrary (non-text) uploads are not corrupted,
    # and let the context managers close both handles even if an error occurs
    with gzip.open(temp_name, 'wb', compresslevel=1) as myzip, \
            open(output_file_location, 'rb') as src:
        # write input file to gzip archive in n byte chunks
        n = 100000000
        for chunk in iter(lambda: src.read(n), b''):
            myzip.write(chunk)

    print('zip complete ' + file_id)
    # now need to delete the old file and update the file record with the new file
    new_file_name = output_file_location + '.gz'
    os.rename(temp_name, new_file_name)
    os.remove(output_file_location)

    # calculate new file size
    stats = os.stat(new_file_name)
    new_file_size = stats.st_size / 1000 / 1000

    # update filename
    file_obj.filename = output_file_name
    file_obj.file.name = new_file_name

    # update file size
    file_obj.offset = stats.st_size
    file_obj.save()

    out = {'zipped': True, 'file_name': output_file_name, 'file_size': new_file_size}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)
    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_size")] = u.filesize_toString(file_obj.offset)
    auto_fields[DataFile().get_qualified_field("name")] = output_file_name
    auto_fields[DataFile().get_qualified_field("file_location")] = new_file_name

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component="datafile",
             auto_fields=auto_fields
             ).do_save_edit()

    out = jsonpickle.encode(out)
    return HttpResponse(out, content_type='text/plain')
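The chunked copy loop above can also be written with shutil.copyfileobj, which handles the buffering. A minimal sketch of just the compression step, under the assumption that only a source path and a destination path are needed (the helper name and buffer size are illustrative):

import gzip
import shutil


def gzip_copy(src_path, dest_path, compresslevel=1):
    # stream src_path into a gzip archive at dest_path without loading it whole
    with open(src_path, 'rb') as src, \
            gzip.open(dest_path, 'wb', compresslevel=compresslevel) as dst:
        shutil.copyfileobj(src, dst, length=16 * 1024 * 1024)
    return dest_path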
Example 11
def ena_description(auto_fields):
    # get current stage, output next stage
    stage_dict = {}
    datafile_id = auto_fields["datafile"]
    current_stage = auto_fields["current_stage"]
    static_list = d_utils.json_to_pytype(lkup.WIZARD_FILES["ena"])["properties"]
    unified_list = static_list

    # get stages from the db if they exist. stages (both static and dynamic) are held in the db,
    # to provide a single, unified point of reference

    d = DataFile()
    description = d.GET(datafile_id)["description"]
    description_stages = description["stages"]

    if len(description_stages) > 0:
        unified_list = description_stages
    else:
        description["stages"] = unified_list
        fields = {"description": description}
        d.edit_datafile(datafile_id, fields)

    # generate and merge dynamic stages with static if not already generated
    if unified_list == static_list:  # only static stages exist, generate dynamic

        dynamic_elements = get_dynamic_elements_ena(datafile_id)  # ENA dynamic stages, contingent upon study_type

        if dynamic_elements:
            unified_list = unified_list + dynamic_elements  # merge and save stages

            description["stages"] = unified_list
            fields = {"description": description}
            d.edit_datafile(datafile_id, fields)

    # now, resolve stages for the wizard
    next_stage_indx = 0
    listed_stage = [indx for indx, elem in enumerate(unified_list) if elem["ref"] == current_stage]
    if listed_stage:
        next_stage_indx = listed_stage[0] + 1

    try:
        elem = unified_list[next_stage_indx]
        if not is_stage_present(datafile_id, elem["ref"]):
            stage_dict = get_stage_display(elem, datafile_id)
    except:
        pass

    if not stage_dict and current_stage == unified_list[-1]["ref"]:  # reached last stage of wizard, 'review' now
        # stage_dict = wtags.get_review_html(get_stages_display(datafile_id))
        pass

    return stage_dict
Example 12
    def setUpClass(cls):
        settings.UNIT_TESTING = True
        # create user
        cls.user = User.objects.create_user(username='******', first_name="jonny", last_name="appleseed",
                                            email='*****@*****.**', password='******')
        cls.user.save()

        # create profile
        p_dict = {"copo_id": "000000000", "description": "Test Description", "user_id": 1, "title": "Test Title"}
        cls.pid = Profile().save_record(dict(), **p_dict)

        # create datafile
        p = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "dummy_datafile_cgcore.json")
        with open(p) as f:
            p_dict = json.loads(f.read())
        p_dict["file_location"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "fish.png")
        p_dict["name"] = "fish.png"
        profile = Profile().get_collection_handle().find_one({"copo_id": "000000000"})
        p_dict["profile_id"] = str(cls.pid["_id"])
        cls.d = DataFile().get_collection_handle().insert(p_dict)

        # create submission
        p = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures",
                         "dummy_cgcore_dataverse_submission_existing.json")
        with open(p) as f:
            p_dict = json.loads(f.read())
        p_dict["bundle_meta"][0]["file_path"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures",
                                                             "fish.png")
        p_dict["bundle_meta"][0]["file_id"] = str(cls.d)
        p_dict["profile_id"] = str(cls.pid["_id"])
        p_dict["bundle"].append(str(cls.d))
        cls.s_dv = Submission().get_collection_handle().insert(p_dict)
Example 13
def get_info_for_new_dataverse(request):
    # method to prepopulate dataverse creation form with currently available metadata values
    out = dict()
    p_id = request.session['profile_id']
    profile = Profile().get_record(p_id)
    out['dvAlias'] = str(profile['title']).lower()
    person_list = list(Person(p_id).get_people_for_profile())
    out['dvPerson'] = person_list
    orcid = Orcid().get_orcid_profile(request.user)
    try:
        affiliation = orcid.get('op', {}).get('activities_summary', {}).get('employments', {}) \
            .get('employment_summary', {})[0].get('organization', "").get('name', "")
    except:
        affiliation = ""
    out['dsAffiliation'] = affiliation
    df = list(DataFile().get_for_profile(p_id))
    file = df[0]
    out['dvName'] = profile.get('title', "")
    out['dsTitle'] = file.get('description', {}).get('attributes', {}) \
        .get('title_author_contributor', {}).get('dcterms:title', "")
    out['dsDescriptionValue'] = file.get('description', {}).get('attributes', {}) \
        .get('subject_description', {}).get('dcterms:description', "")
    out['dsSubject'] = file.get('description', {}).get('attributes', {}) \
        .get('subject_description', {}).get('dcterms:subject', "")
    return HttpResponse(json_util.dumps(out))
Example 14
def annotate_meta(request, file_id):
    if "ss_data" in request.session:
        del request.session["ss_data"]
    if "ss_sheet_names" in request.session:
        del request.session["ss_sheet_names"]
    df = DataFile().get_record(ObjectId(file_id))
    name = df["name"]
    if name.endswith(('xls', 'xlsx')):
        return render(request, 'copo/copo_annotate_spreadsheet.html',
                      {'file_id': file_id, 'file_name': name, 'file_type': "ss"})
    elif name.endswith("csv"):
        return render(request, 'copo/copo_annotate_spreadsheet.html',
                      {'file_id': file_id, 'file_name': name, 'file_type': "csv"})
    elif name.endswith(("txt", "tsv")):
        return render(request, 'copo/copo_annotate_spreadsheet.html',
                      {'file_id': file_id, 'file_name': name, 'file_type': "tab"})
    elif name.endswith(('pdf')):
        html = ""
        records = Annotation().get_all_records()
        if "annotation_html" not in request.session:
            # if True:
            folder_name = str(uuid.uuid1())
            full_path = os.path.join(settings.MEDIA_ROOT, folder_name)
            os.makedirs(full_path)
            run("ebook-convert  " + df[
                "file_location"] + " " + full_path + " --no-images --pretty-print --insert-blank-line")
            with open(os.path.join(full_path, "index.html"), 'r') as f:
                html = f.read()
            shutil.rmtree(full_path)
            request.session["annotation_html"] = html
        else:
            print("using session text data")
            html = request.session["annotation_html"]
        return render(request, 'copo/copo_annotate_pdf.html',
                      {'html': html, 'file_id': file_id, 'file_name': name, "file_type": "pdf"})
Example 15
    def _make_dataset_xml(self, sub):
        meta = sub['meta']

        # iterate through meta to get fields
        d = dict()
        datafile = DataFile().get_record(ObjectId(sub['bundle'][0]))
        df = datafile['description']['attributes']

        xml = '<?xml version="1.0"?>'
        xml = xml + '<entry xmlns="http://www.w3.org/2005/Atom" xmlns:dcterms="http://purl.org/dc/terms/">'
        xml = xml + '<dcterms:contributor>' + "*****@*****.**" + '</dcterms:contributor>'

        for item in meta["fields"]:

            if isinstance(item["vals"], str):
                tail = item["dc"].split(".")[1]
                xml = xml + '<dcterms:' + tail + '>' + item["vals"] + '</dcterms:' + tail + '>'

            elif isinstance(item["vals"], list):
                for val in item["vals"]:
                    tail = item["dc"].split(".")[1]
                    xml = xml + '<dcterms:' + tail + '>' + val + '</dcterms:' + tail + '>'

        xml = xml + "</entry>"
        path = os.path.dirname(datafile['file_location'])
        xml_path = os.path.join(path, 'xml.xml')
        with open(xml_path, 'w+') as f:
            f.write(xml)
        return xml_path
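Note that building XML by string concatenation, as above, does not escape the metadata values; a title containing '&' or '<' would yield invalid XML. If that matters, the standard library's escape helper can wrap each value before it is embedded. A small illustrative sketch (not part of the original code):

from xml.sax.saxutils import escape


def dc_element(tail, value):
    # wrap one Dublin Core value in a dcterms element, escaping XML special characters
    return '<dcterms:' + tail + '>' + escape(value) + '</dcterms:' + tail + '>'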
Example 16
def do_submission_xml(sub_id):
    sub = Submission().get_record(sub_id)
    dfs = list()
    for d in sub["bundle"]:
        dfs.append(DataFile().get_record(d))
    df = dfs[0]

    submission = Element("SUBMISSION")
    # get names of files in bundle and append here
    # do alias
    alias = make_alias(sub)
    submission.set("alias", alias + "_sub")
    submission.set(
        "broker_name",
        df["description"]["attributes"]["study_type"]["study_broker"])
    submission.set(
        "center_name", df["description"]["attributes"]["study_type"]
        ["study_analysis_center_name"])
    submission_date = datetime.datetime.now().isoformat()
    submission.set("submission_date", submission_date)
    submission.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
    submission.set(
        "xsi:noNamespaceSchemaLocation",
        "ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.submission.xsd")

    contacts = Element("CONTACTS")
    copo_contact = Element("CONTACT")
    copo_contact.set("inform_on_error", "*****@*****.**")
    copo_contact.set("inform_on_status", "*****@*****.**")
    copo_contact.set("name", "COPO Support")
    contacts.append(copo_contact)

    people = Person(sub["profile_id"]).get_people_for_profile()
    for p in people:
        c = Element("CONTACT")
        c.set("name", p["firstName"] + " " + p["lastName"])
        if [
                x for x in p["roles"]
                if x["annotationValue"] == "SRA Inform On Status"
        ]:
            c.set("inform_on_status", p["email"])
        if [
                x for x in p["roles"]
                if x["annotationValue"] == "SRA Inform On Error"
        ]:
            c.set("inform_on_error", p["email"])
        contacts.append(c)
    submission.append(contacts)

    actions = Element("ACTIONS")
    action = Element("ACTION")
    add = Element("ADD")
    add.set("schema", "analysis")
    add.set("source", "analysis.xml")
    action.append(add)
    actions.append(action)
    submission.append(actions)

    return prettify(submission)
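prettify is not shown in this excerpt. It is presumably a small helper that serialises the ElementTree element with indentation; a typical implementation, offered here as an assumption rather than the project's actual code, looks like this:

from xml.dom import minidom
from xml.etree.ElementTree import tostring


def prettify(elem):
    # serialise the Element, then re-parse with minidom to get indented output
    rough_string = tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent="    ")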
Example 17
def make_alias(sub):
    bundle = sub['bundle']
    filenames = ""
    for b in bundle:
        file = DataFile().get_record(b)
        filenames = filenames + "-" + file['name']
    alias = str(sub["_id"]) + ':' + sub['repository'] + ":" + filenames
    return alias
Example 18
    def tearDownClass(cls):
        u = User.objects.get(pk=1)
        u.delete()
        Profile().get_collection_handle().remove({"copo_id": "000000000"})
        DataFile().get_collection_handle().remove({"_id": cls.d})
        # Submission().get_collection_handle().remove({"_id": cls.s_dv})
        Submission().get_collection_handle().remove({"_id": cls.s_ckan_new})
        Submission().get_collection_handle().remove({"_id": cls.s_ckan_existing})
Example 19
    def tearDownClass(cls):
        u = User.objects.get(username=settings.TEST_USER_NAME)
        u.delete()
        Profile().get_collection_handle().remove({"copo_id": "000000000"})
        DataFile().get_collection_handle().remove({"test_file": True})
        Repository().get_collection_handle().remove({"_id": cls.r["_id"]})
        Submission().get_collection_handle().remove({"_id": cls.s_dv})
        Submission().get_collection_handle().remove({"_id": cls.s_ds_new})
        Submission().get_collection_handle().remove({"_id": cls.s_ds_existing})
Example 20
    def send_files(self, sub, ds):

        for id in sub['bundle']:
            file = DataFile().get_record(ObjectId(id))
            file_location = file['file_location']
            file_name = file['name']
            with open(file_location, 'rb') as f:
                contents = f.read()
                ds.upload_file(file_name, contents, zip_files=False)
Example 21
    def do_description_summary(self):
        record = DataFile().get_record(self.param_dict.get("target_id"))
        self.context['description'] = htags.resolve_description_data(
            record.get("description", dict()), dict())

        description_token = record.get('description_token', str())
        self.context['description']['description_record'] = dict()

        if description_token:
            description_record = Description().GET(description_token)
            if description_record:
                if not description_record["name"]:
                    description_record["name"] = "N/A"
                self.context['description']['description_record'] = dict(
                    name=description_record["name"],
                    id=str(description_record["_id"]))

        return self.context
Example 22
def automate_num_cols(request):
    file_id = request.GET.get("file_id", "")
    file_obj = DataFile().get_record(file_id)
    try:
        d = pandas.read_csv(file_obj["file_location"], nrows=4)
    except UnicodeDecodeError as e:
        d = pandas.read_excel(file_obj["file_location"], nrows=4)
    headers = d.columns.values.tolist()
    cols = len(d.columns)
    output = {"num": cols, "headers": headers}
    return HttpResponse(json.dumps(output))
Example 23
    def submit(self, sub_id, dataFile_ids):
        submission_record = Submission().get_record(sub_id)

        # bundle_meta, if present, should provide a better picture of what datafiles need to be uploaded
        if "bundle_meta" in submission_record:
            pending_files = [
                x["file_id"] for x in submission_record['bundle_meta']
                if not x["upload_status"]
            ]
            dataFile_ids = pending_files

        # physically transfer files
        path2library = os.path.join(BASE_DIR,
                                    REPOSITORIES['ASPERA']['resource_path'])

        # change these to be collected properly
        user_name = REPOSITORIES['ASPERA']['user_token']
        password = REPOSITORIES['ASPERA']['password']

        # create transfer record
        transfer_token = RemoteDataFile().create_transfer(sub_id)['_id']
        self.submission = Submission().get_record(sub_id)

        self.profile = Profile().get_record(self.submission['profile_id'])
        remote_path = d_utils.get_ena_remote_path(sub_id)

        # get each file in the bundle
        file_path = []
        for idx, f_id in enumerate(dataFile_ids):
            mongo_file = DataFile().get_record(ObjectId(f_id))
            self.d_files.append(mongo_file)
            file_path.append(mongo_file.get("file_location", str()))

        case = self._do_aspera_transfer(transfer_token=transfer_token,
                                        user_name=user_name,
                                        password=password,
                                        remote_path=remote_path,
                                        file_path=file_path,
                                        path2library=path2library,
                                        sub_id=sub_id)
        return case
Example 24
def delete_annotation(request):
    col_idx = request.GET["col_idx"]
    sheet_name = request.GET["sheet_name"]
    file_id = request.GET["file_id"]
    iri = request.GET["iri"]
    uid = request.user.id

    doc = Annotation().decrement_or_delete_annotation(uid, iri)
    doc = DataFile().delete_annotation(col_idx=col_idx,
                                       sheet_name=sheet_name,
                                       file_id=file_id)
    return HttpResponse("Hello World")
Example 25
def do_study_xml(sub_id):
    # get submission object from mongo
    sub = Submission().get_record(sub_id)
    # get datafile objects
    dfs = list()
    for d in sub["bundle"]:
        dfs.append(DataFile().get_record(d))
    df = dfs[0]
    # get profile object
    p = Profile().get_record(df["profile_id"])

    # Do STUDY_SET
    study_set = Element("STUDY_SET")
    study_set.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
    study_set.set("xsi:noNamespaceSchemaLocation",
                  "ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.study.xsd")

    # Do STUDY
    study = Element("STUDY")
    study.set("alias", str(sub["_id"]))
    study.set(
        "center_name", df["description"]["attributes"]["study_type"]
        ["study_analysis_center_name"])
    study_set.append(study)

    # Do DESCRIPTOR
    descriptor = Element("DESCRIPTOR")
    # create element, append to parent and add text
    SubElement(descriptor, "STUDY_TITLE").text = p["title"]
    study_type = Element("STUDY_TYPE")
    es = get_study_type_enumeration(
        df["description"]["attributes"]["study_type"]["study_type"])
    # es = df["description"]["attributes"]["study_type"]["study_type"]
    study_type.set("existing_study_type", es)
    descriptor.append(study_type)
    SubElement(descriptor, "STUDY_ABSTRACT").text = p["description"]
    study.append(descriptor)

    # Do STUDY_ATTRIBUTES
    study_attributes = Element("STUDY_ATTRIBUTES")
    # do attribute for date
    study_attribute = Element("STUDY_ATTRIBUTE")
    SubElement(study_attribute, "TAG").text = "Submission Date"
    SubElement(study_attribute,
               "VALUE").text = datetime.datetime.now().strftime('%Y-%m-%d')
    study_attributes.append(study_attribute)

    # here we can loop to add other STUDY_ATTRIBUTES

    study.append(study_attributes)

    return prettify(study_set)
Example 26
def refresh_display(request):
    file_id = request.GET["file_id"]
    file = DataFile().get_record(file_id)
    path = file["file_location"]
    data = list()
    filetype = None
    if file["name"].endswith("csv"):
        filetype = "csv"
    elif file["name"].endswith(("txt", "tsv")):
        filetype = "tab"
    elif file["name"].endswith(("xls", "xlsx")):
        filetype = "xls"
    if "ss_data" in request.session:
        # if data previously loaded then just load from session
        data = json_util.loads(request.session["ss_data"])
        sheet_names = json_util.loads(request.session["ss_sheet_names"])
    else:
        try:
            sheet_names = pandas.ExcelFile(path).sheet_names
        except Exception as e:
            # support CSV here (N.B. CSV does not support multiple sheets)
            sheet_names = [file["name"]]

        # read entire spreadsheet
        if filetype == "xls":
            for name in sheet_names:
                d = pandas.read_excel(path, sheet_name=name, nrows=4).fillna(0)
                out = list()
                out.append(d.columns.tolist())
                out.extend(d.values.tolist())
                data.append(out)
            try:
                request.session["ss_data"] = json_util.dumps(data)
                request.session["ss_sheet_names"] = json_util.dumps(
                    sheet_names)
            except:
                pass
        elif filetype == "csv":
            d = pandas.read_csv(path, nrows=4)
            d = d.fillna('')
            out = list()
            out.append(d.columns.tolist())
            out.extend(d.values.tolist())
            data.append(out)
        elif filetype == "tab":
            d = pandas.read_csv(path, sep='\t', nrows=4)
            d = d.fillna('')
            out = list()
            out.append(d.columns.tolist())
            out.extend(d.values.tolist())
            data.append(out)
    return HttpResponse(json_util.dumps({"data": data, "names": sheet_names}))
Example 27
    def do_un_describe(self):
        datafile_ids = [
            ObjectId(i) for i in self.param_dict.get("datafile_ids")
        ]

        DataFile().get_collection_handle().update_many(
            {"_id": {
                "$in": datafile_ids
            }}, {"$set": {
                "description": dict()
            }})

        return self.context
Example 28
def resolve_submission_id(request, submission_id):
    sub = Submission().get_record(submission_id)
    # get all file metadata
    output = dict()
    files = list()
    for f in sub.get("bundle", list()):
        file = DataFile().get_record(f)
        files.append(file["description"]["attributes"])
    output["files"] = files
    output["accessions"] = sub["accessions"]
    output["metadata"] = {}
    output["metadata"]["dc"] = sub["meta"]["fields"]
    return HttpResponse(j.dumps(output))
Example 29
    def do_assembly_submission(self, sub_id, remote_path, transfer_token):
        # make dir for manifest file
        conv_dir = os.path.join(self._dir, sub_id)
        if not os.path.exists(conv_dir):
            os.makedirs(conv_dir)
        # file for metadata
        sub = Submission().get_record(sub_id)
        datafile = DataFile().get_record(sub["bundle"][0])
        metadata = datafile["description"]["attributes"]["study_type"]
        # make manifest
        with open(os.path.join(conv_dir, "manifest.manifest"),
                  'w+') as manifest:
            for key in metadata.keys():
                line = key.upper() + "\t" + metadata[key] + "\n"
                manifest.write(line)
            agp_flag = False
            fasta_flag = False
            for f in sub["bundle"]:
                file = DataFile().get_record(ObjectId(f))
                if file["name"].endswith("fasta"):
                    fasta = "FASTA\t" + file["file_location"] + "\n"
                    fasta_flag = True
                    manifest.write(fasta)
                if file["name"].endswith('agp'):
                    agp = "AGP\t" + file["file_location"] + "\n"
                    agp_flag = True
                    manifest.write(agp)
        if agp_flag and fasta_flag:

            #  proceed to submission
            pass
        else:
            return {
                "status": 428,
                "message": "You must supply an AGP file and a FASTA file"
            }
Example 30
    def resolve_deposition_context(self):
        """
        this returns an inferred deposition destination for a datafile.
        we assume here that the target destination of the file can be inferred based on its type
        :param:
        :return string destination:
        """

        # get file details
        datafile = DataFile().get_record(self.datafile_id)
        ft = datafile.get("file_type", "unknown")

        if ft == '':
            ft = 'unknown'

        deposition_context = 'default'

        # match against documented destinations
        for k, v in lkup.REPO_FILE_EXTENSIONS.items():
            if ft in v:
                deposition_context = k
                break

        return deposition_context
Example 31
def send_file_annotation(request):
    col_idx = request.POST["col_idx"]
    sheet_name = request.POST["sheet_name"]
    col_header = request.POST["col_header"]
    iri = request.POST["iri"]
    label = request.POST["label"]
    id = request.POST["id"]
    obo_id = request.POST.get("obo_id", "")
    ontology_name = request.POST["ontology_name"]
    ontology_prefix = request.POST["ontology_prefix"]
    short_form = request.POST["short_form"]
    type = request.POST["type"]
    file_id = request.POST["file_id"]
    file_name = request.POST["file_name"]
    description = request.POST["description"]
    data = {
        "column_idx": col_idx,
        "column_header": col_header,
        "sheet_name": sheet_name,
        "iri": iri,
        "obo_id": obo_id,
        "label": label,
        "id": id,
        "ontology_name": ontology_name,
        "ontology_prefix": ontology_prexfix,
        "short_form": short_form,
        "type": type,
        "description": description,
        "uid": request.user.id,
        "file_id": file_id,
        "file_name": file_name
    }
    if Annotation().add_or_increment_term(data):
        annotations = DataFile().update_file_level_metadata(file_id, data)
    else:
        annotations = {"status": 500, "message": "Could not add annotation"}
    return HttpResponse(json_util.dumps({"annotation": annotations}))
Example 32
def data_wiz(request):
    context = {}
    step = int(request.POST['step'])
    datafile = request.POST['datafile']
    d = DataFile()

    dispatch_description = {
        'ena': wizh.ena_description,
        'figshare': wizh.figshare_description,
        'default': wizh.default_description
    }

    if step == 1:
        # first stage in the process where the wizard has only just been initiated
        # infer the deposition context (target repository) based on the file
        # if we can infer the context, switch immediately to that, else request destination from user

        description_attributes = d.GET(datafile)['description']['attributes']

        try:
            deposition_context = description_attributes['deposition_context']['deposition_context']
        except:
            deposition_context = None

        if not deposition_context:
            # try to resolve, and save deposition context as an implicit stage
            deposition_context = wizh.get_deposition_context(datafile)

            if deposition_context:
                d.save_description_stage(datafile, {'ref': 'deposition_context',
                                                    'data': {'deposition_context': deposition_context}})

        if deposition_context:
            # as there might be previous description data,
            # we want to be able to load this, else follow the step-wise wizard path
            if len(description_attributes) > 1:
                context['stages'] = wizh.get_stages_display(datafile)
            else:  # follow the step-wise wizard path
                auto_fields = {"current_stage": "deposition_context", "datafile": datafile}
                context['stage'] = dispatch_description[deposition_context](auto_fields)

        else:  # if we couldn't infer the deposition context, ask for the user's intervention
            df_wizard_dict = copy.deepcopy(lkup.DF_WIZARD)
            context['stage'] = {
                "title": "Verify Destination",
                "content": wtags.get_deposition_html(deposition_context)
            }

    elif step >= 2:
        auto_fields = ast.literal_eval(request.POST['auto_fields'])
        deposition_context = auto_fields["deposition_context"]

        if deposition_context == "":
            deposition_context = "default"

        auto_fields["datafile"] = datafile
        context['stage'] = dispatch_description[deposition_context](auto_fields)
    elif step == -1:  # save all stages
        auto_fields = ast.literal_eval(request.POST['auto_fields'])
        # get the deposition context from the first element
        for a_f in auto_fields:
            auto_field = ast.literal_eval(a_f)
            auto_field["datafile"] = datafile
            wizh.save_stage_data(auto_field)

    out = jsonpickle.encode(context)
    return HttpResponse(out, content_type='json')
Example 33
def refresh_annotations(request):
    file_id = request.GET["file_id"]
    sheet_name = request.GET["sheet_name"]
    annotations = DataFile().get_file_level_metadata_for_sheet(
        file_id, sheet_name)
    return HttpResponse(json_util.dumps({"annotations": annotations}))
Example 34
    def _submit(self, sub_id, dataFile_ids):

        for f_id in dataFile_ids:

            mongo_file = DataFile().get_record(f_id)

            c = ChunkedUpload.objects.get(pk=int(mongo_file["file_id"]))

            file_path = os.path.join(self.MEDIA_ROOT, str(c.file))
            orig_name = c.filename

            sub = mongo_file['description']['attributes']
            data = dict()
            data['defined_type'] = sub.get('type_category', dict()).get('type')
            data['title'] = sub.get('title_author_description',
                                    dict()).get('title')
            authors = sub.get('title_author_description',
                              dict()).get('author').split(',')
            lst = list()
            for x in authors:
                lst.append({'name': x})
            data['authors'] = lst
            data['description'] = sub.get('title_author_description',
                                          dict()).get('description')
            cat = sub.get('type_category', dict()).get('categories')
            if cat:
                cat = cat.split(',')
                cat = list(map(int, cat))
                data['categories'] = cat
            else:
                data['categories'] = list()
            data['tags'] = sub.get('tags', dict()).get('keywords').split(',')
            for idx, t in enumerate(data['tags']):
                if len(t) < 3:
                    if len(t) == 1:
                        t = t + (2 * t)
                    elif len(t) == 2:
                        t = t + t
                    data['tags'][idx] = t

            data['references'] = sub.get('tags',
                                         dict()).get('references').split(',')
            for idx, x in enumerate(data['references']):
                if x != '':
                    if not x.startswith(('http', 'https')):
                        if not x.startswith('www'):
                            data['references'][idx] = 'http://www.' + x
                        else:
                            data['references'][idx] = 'http://' + x
            if len(data['references']) == 1 and data['references'][0] == '':
                # if blank ref, pop
                data.pop('references')
            data['funding'] = sub.get('tags', dict()).get('funding')
            data['licenses'] = sub.get('tags', dict()).get('licenses')
            data['publish'] = sub.get('figshare_publish',
                                      dict()).get('should_publish')

            # Create article
            #data = json.dumps({'title': orig_name, 'defined_type': 'figure'})
            endpoint = 'account/articles'
            resp = requests.post(self.BASE_URL.format(endpoint=endpoint),
                                 headers=self.HEADERS,
                                 data=json.dumps(data))

            article_id = json.loads(
                resp.content.decode('utf8'))['location'].rsplit('/', 1)[1]

            # Get file info
            #with open(file_path, 'rb') as fin:
            #    fin.seek(0, 2)  # Go to end of file
            #    size = fin.tell()
            size = c.offset
            info = json.dumps({'name': orig_name, 'size': size})

            # Initiate upload
            endpoint = 'account/articles/{}/files'.format(article_id)
            resp = requests.post(self.BASE_URL.format(endpoint=endpoint),
                                 headers=self.HEADERS,
                                 data=info)

            file_id = json.loads(
                resp.content.decode('utf-8'))['location'].rsplit('/', 1)[1]

            # Get upload/parts info
            endpoint = 'account/articles/{}/files/{}'.format(
                article_id, file_id)
            resp = requests.get(self.BASE_URL.format(endpoint=endpoint),
                                headers=self.HEADERS)

            url = '{upload_url}'.format(
                **json.loads(resp.content.decode('utf-8')))
            parts = json.loads(
                requests.get(url).content.decode('utf-8'))['parts']

            # start upload timer
            t = datetime.datetime.now()

            # Upload parts
            with open(file_path, 'rb') as fin:
                for idx, part in enumerate(parts):

                    percent_done = idx / len(parts) * 100
                    size = part['endOffset'] - part['startOffset'] + 1

                    address = '{}/{}'.format(url, part['partNo'])
                    x = datetime.datetime.now()
                    requests.put(address, data=fin.read(size))
                    delta = datetime.datetime.now() - x
                    # calculate current upload rate in MB per second
                    bw = (size / delta.total_seconds()) / 1000 / 1000
                    fields = {
                        'transfer_rate': bw,
                        'pct_completed': percent_done
                    }
                    RemoteDataFile().update_transfer(self.transfer_token,
                                                     fields)

            # Mark file upload as completed
            upload_time = datetime.datetime.now() - t
            requests.post(self.BASE_URL.format(endpoint=endpoint),
                          headers=self.HEADERS)

            fields = {
                'pct_completed': 100,
                'transfer_status': 'success',
                'completed_on': str(datetime.datetime.now()),
                'article_id': article_id
            }
            RemoteDataFile().update_transfer(self.transfer_token, fields)

            if data['publish'] == 'True':
                # publish api
                endpoint = 'account/articles/{}/publish'.format(article_id)
                resp = requests.post(self.BASE_URL.format(endpoint=endpoint),
                                     headers=self.HEADERS)
                location = json.loads(resp.content.decode('utf8'))['location']
                # get accession data
                endpoint = 'articles/{}'.format(article_id)
                resp = requests.get(self.BASE_URL.format(endpoint=endpoint),
                                    headers=self.HEADERS)
                # save accessions to mongo profile record
                s = Submission().get_record(sub_id)
                s['article_id'] = json.loads(
                    resp.content.decode('utf8'))['figshare_url']
                s['complete'] = True
                s['status'] = 'published'
                s['target_id'] = str(s.pop('_id'))
                Submission().save_record(dict(), **s)
            else:
                # save accessions to mongo profile record
                s = Submission().get_record(sub_id)
                s['article_id'] = article_id
                s['complete'] = True
                s['status'] = 'not published'
                s['target_id'] = str(s.pop('_id'))
                Submission().save_record(dict(), **s)

        # mark submission as complete
        Submission().mark_submission_complete(sub_id, article_id=article_id)
        Submission().mark_submission_complete(sub_id)
        Submission().mark_figshare_article_id(sub_id=sub_id,
                                              article_id=article_id)
Example 35
    def test_get_datafile(self):
        df = DataFile().get_record(self.d)
        self.assertEqual(df["name"], "fish.png")
Example 36
    def setUpClass(cls):
        cls.factory = RequestFactory()
        settings.UNIT_TESTING = True

        # create user
        cls.user = User.objects.create_user(username='******',
                                            first_name=settings.TEST_USER_NAME,
                                            last_name="appleseed",
                                            email='*****@*****.**',
                                            password='******')
        cls.user.save()

        # create profile
        p_dict = {
            "copo_id": "000000000",
            "description": "Test Description",
            "user_id": cls.user.id,
            "title": "Test Title"
        }
        cls.pid = Profile().save_record(dict(), **p_dict)

        # create datafile
        p = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "fixtures", "dummy_datafile.json")
        with open(p) as f:
            p_dict = json.loads(f.read())
        p_dict["file_location"] = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "fixtures",
            "fish.png")
        p_dict["name"] = "fish.png"
        profile = Profile().get_collection_handle().find_one(
            {"copo_id": "000000000"})
        p_dict["profile_id"] = str(cls.pid["_id"])
        cls.d = DataFile().get_collection_handle().insert(p_dict)

        # create dataverse repository
        p = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "fixtures", "dummy_dataverse_repo.json")
        with open(p) as f:
            p_dict = json.loads(f.read())
        cls.r = Repository().save_record(dict(), **p_dict)

        # create submission record for dataverse
        p = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "fixtures", "dummy_dataverse_submission.json")
        with open(p) as f:
            p_dict = json.loads(f.read())
        p_dict["bundle_meta"][0]["file_path"] = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "fixtures",
            "fish.png")
        p_dict["bundle_meta"][0]["file_id"] = str(cls.d)
        p_dict["profile_id"] = str(cls.pid["_id"])
        p_dict["bundle"].append(str(cls.d))
        cls.s_dv = Submission().get_collection_handle().insert(p_dict)

        # create submission record for new dspace
        p = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "fixtures", "dummy_dspace_submission.json")
        with open(p) as f:
            p_dict = json.loads(f.read())
        p_dict["bundle_meta"][0]["file_path"] = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "fixtures",
            "fish.png")
        p_dict["bundle_meta"][0]["file_id"] = str(cls.d)
        p_dict["profile_id"] = str(cls.pid["_id"])
        p_dict["bundle"].append(str(cls.d))
        p_dict["meta"]["new_or_existing"] = "new"
        # query for item id
        resp = requests.post("http://demo.dspace.org/rest/collections")
        collections = json.loads(resp.content.decode("utf-8"))
        collection = collections[0]
        p_dict["meta"]["identifier"] = collection["uuid"]
        cls.s_ds_new = Submission().get_collection_handle().insert(p_dict)

        # create submission record for existing dspace
        p = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "fixtures", "dummy_dspace_submission.json")
        with open(p) as f:
            p_dict = json.loads(f.read())
        p_dict["bundle_meta"][0]["file_path"] = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "fixtures",
            "fish.png")
        p_dict["bundle_meta"][0]["file_id"] = str(cls.d)
        p_dict["profile_id"] = str(cls.pid["_id"])
        p_dict["bundle"].append(str(cls.d))
        p_dict["meta"]["new_or_existing"] = "existing"
        # query for item id
        resp = requests.post("http://demo.dspace.org/rest/items")
        items = json.loads(resp.content.decode("utf-8"))
        item = items[0]
        p_dict["meta"]["identifier"] = item["uuid"]
        p_dict["item_id"] = item["uuid"]
        cls.s_ds_existing = Submission().get_collection_handle().insert(p_dict)
        cls.ckan_api = "http://demo.ckan.org/api/3/action/"
Example 37
    def update_targets_datafiles(self):
        bulk = DataFile().get_collection_handle().initialize_unordered_bulk_op()
        for k, v in self.targets_datafiles.items():
            bulk.find({'_id': ObjectId(k)}).update({'$set': {"description": v.get("description", dict())}})
        bulk.execute()
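initialize_unordered_bulk_op was deprecated in PyMongo 3 and removed in PyMongo 4; the same batched update can be expressed with bulk_write. A minimal sketch under the assumption that get_collection_handle() returns a standard PyMongo collection (the function signature here is illustrative):

from bson import ObjectId
from pymongo import UpdateOne


def update_description_bulk(collection, targets_datafiles):
    # build one UpdateOne per datafile and send them in a single unordered batch
    requests = [
        UpdateOne({'_id': ObjectId(k)},
                  {'$set': {'description': v.get('description', dict())}})
        for k, v in targets_datafiles.items()
    ]
    if requests:
        collection.bulk_write(requests, ordered=False)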
Example 38
    def extract_repo_fields(self, datafile_id=str(), repo=str()):
        """
        given a datafile id and a repository type, returns a list of dictionaries of fields matching the repo
        :param datafile_id:
        :param repo:
        :return:
        """

        from dal.copo_da import DataFile, CGCore
        from dal.copo_base_da import DataSchemas

        if not repo:  # no repository to filter by
            return list()

        repo_type_option = lkup.DROP_DOWNS["REPO_TYPE_OPTIONS"]
        repo_type_option = [
            x for x in repo_type_option if x["value"].lower() == repo.lower()
        ]

        if not repo_type_option:
            return list()

        repo_type_option = repo_type_option[0]

        cg_schema = DataSchemas("COPO").get_ui_template_node('cgCore')

        # filter schema items by repo
        cg_schema = [
            x for x in cg_schema
            if x.get("target_repo", str()).strip() != str()
            and repo_type_option.get("abbreviation", str()) in
            [y.strip() for y in x.get("target_repo").split(',')]
        ]

        record = DataFile().get_record(datafile_id)
        description = record.get("description", dict())

        attributes = description.get("attributes", dict())
        stages = description.get("stages", list())

        schema_df = pd.DataFrame(cg_schema)
        schema_df.id = schema_df.id.str.lower().str.split(".").str[-1]
        schema_df.index = schema_df.id
        schema_df = schema_df[['ref', 'id', 'prefix']]
        schema_df = schema_df[~schema_df['ref'].isna()]

        # get all stage items
        all_items = [item for st in stages for item in st.get("items", list())]

        # filter stage items - stage items should conform to specifications of the repo
        schema_ids = list(schema_df.id)
        items = {
            item.get("id", str()).lower().split(".")[-1]: st.get("ref",
                                                                 "").lower()
            for st in stages for item in st.get("items", list())
            if item.get("id", str()).lower().split(".")[-1] in schema_ids
        }

        # ...also, account for any filtering performed by client agents (e.g., dependencies in COPO Wizard),
        # within the context of the target repo
        schema_df = schema_df[schema_df.index.isin(items.keys())]

        # obtain attributes for filtered stage items
        target_stages = list(set(items.values()))
        datafile_attributes = [
            v for k, v in attributes.items() if k in target_stages
        ]

        new_dict = dict()
        for d in datafile_attributes:
            new_dict.update(d)

        new_dict_series = pd.Series(new_dict)
        new_dict_series.index = new_dict_series.index.str.lower()
        schema_df['vals'] = new_dict_series
        schema_df['vals'] = schema_df['vals'].fillna('')

        schema_df = schema_df[['ref', 'id', 'vals', 'prefix']]

        # get composite attributes
        composite_attrib = [
            x for x in all_items if x["id"] in list(schema_df.id)
            and x.get("create_new_item", False)
        ]

        # expand composite attributes
        for cattrib in composite_attrib:
            comp_series = schema_df.loc[cattrib["id"]]
            schema_df = schema_df[~schema_df.id.isin([cattrib["id"]])]
            children_schemas = [
                x for x in cg_schema if x.get("dependency", str()).lower() ==
                comp_series.ref.lower()
            ]

            accessions = comp_series.vals
            if isinstance(accessions, str):
                accessions = accessions.split(",")

            object_ids = [ObjectId(x) for x in accessions if x.strip()]

            records = list()
            if len(object_ids):
                records = cursor_to_list(CGCore().get_collection_handle().find(
                    {"_id": {
                        "$in": object_ids
                    }}))

            attr_list = list()
            for child in children_schemas:
                child_dict = dict(ref=child["ref"],
                                  id=child["id"].split(".")[-1],
                                  prefix=child["prefix"],
                                  vals=[])
                attr_list.append(child_dict)
                for rec in records:
                    child_dict["vals"].append(rec.get(child_dict["id"], str()))

            if attr_list:
                attr_df = pd.DataFrame(attr_list)
                attr_df.index = attr_df.id
                schema_df = pd.concat([schema_df, attr_df])

        schema_df.rename(index=str,
                         columns={
                             "ref": "dc",
                             "id": "copo_id"
                         },
                         inplace=True)

        dc_list = schema_df.to_dict('records')

        return dc_list
Example 39
def zip_file(request):
    # need to get a reference to the file to zip
    file_id = request.GET['file_id']
    print("zip started " + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)

    # get the name of the file to zip and change its suffix to .gz
    output_file_location = os.path.join(settings.MEDIA_ROOT,
                                        file_obj.file.name)
    output_file_name = file_obj.filename + '.gz'
    # open the output as a gzip archive with a fast compression level; read the
    # source in binary mode so non-text uploads are not corrupted
    temp_name = os.path.join(settings.MEDIA_ROOT, str(uuid.uuid4()) + '.tmp')

    # write the input file to the gzip archive in n byte chunks; the context
    # managers guarantee both handles are closed even if compression fails
    n = 100000000
    with gzip.open(temp_name, 'wb', compresslevel=1) as myzip, \
            open(output_file_location, 'rb') as src:
        for chunk in iter(lambda: src.read(n), b''):
            myzip.write(chunk)

    print('zip complete ' + file_id)
    # now need to delete the old file and update the file record with the new file
    new_file_name = output_file_location + '.gz'
    os.rename(temp_name, new_file_name)
    os.remove(output_file_location)

    # calculate new file size
    stats = os.stat(new_file_name)
    new_file_size = stats.st_size / 1000 / 1000

    # update filename
    file_obj.filename = output_file_name
    file_obj.file.name = new_file_name

    # update file size
    file_obj.offset = stats.st_size
    file_obj.save()

    out = {
        'zipped': True,
        'file_name': output_file_name,
        'file_size': new_file_size
    }

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)
    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field(
        "file_size")] = u.filesize_toString(file_obj.offset)
    auto_fields[DataFile().get_qualified_field("name")] = output_file_name
    auto_fields[DataFile().get_qualified_field(
        "file_location")] = new_file_name

    profile_id = request.session['profile_id']
    component = "datafile"

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component=component,
             auto_fields=auto_fields).do_save_edit()

    out = jsonpickle.encode(out)
    return HttpResponse(out, content_type='json')
Esempio n. 40
0
    def submit(self, sub_id, dataFile_ids=None):
        s = Submission().get_record(ObjectId(sub_id))

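        # a new submission requires a CKAN package (dataset) to be created first;
        # an existing one reuses the package id stored in the submission metadata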
        if s["meta"]["new_or_existing"] == "new":
            # create and get item_id
            data = self._create_ckan_metadata(s)
            fullurl = self.host["url"] + "package_create"
            resp = requests.post(fullurl, json=data, headers=self.headers)
            if resp.status_code == 200:
                # package was created normally
                data = json.loads(resp.content.decode("utf-8"))
                dataset_id = data["result"]["id"]
                data = {"package_id": dataset_id}
                fullurl = self.host["url"] + "resource_create"
            elif resp.status_code == 400:
                # retry over https if the original request went out over plain http
                if "https" not in fullurl:
                    fullurl = fullurl.replace("http", "https")
                resp = requests.post(fullurl, json=data, headers=self.headers)
                if resp.status_code != 200:
                    details = json.loads(resp.content.decode("utf-8"))
                    try:
                        msg = details["error"]["message"]
                    except KeyError:
                        msg = details["error"]["name"][0]

                    return json.dumps({
                        "status": resp.status_code,
                        "message": msg
                    })
                else:
                    data = json.loads(resp.content.decode("utf-8"))
                    dataset_id = data["result"]["id"]
                    data = {"package_id": dataset_id}
                    fullurl = self.host["url"] + "resource_create"
            elif resp.status_code == 409:
                # there is a conflict so update rather than create
                print(resp.reason)
                fullurl = self.host["url"] + "package_show"
                resp = requests.post(fullurl,
                                     json={"name_or_id": data["name"]})
                data = json.loads(resp.content.decode("utf-8"))
                dataset_id = data["result"]["id"]
                data = {"package_id": dataset_id}
                fullurl = self.host["url"] + "resource_create"
            else:
                return json.dumps({
                    "status": resp.status_code,
                    "message": resp.reason + " - " + resp.text
                })
        else:
            data = {"package_id": s["meta"]["identifier"]}

        # now we have a dataset id to which to add the datafile
        for f in s["bundle"]:

            # data = dict()
            df = DataFile().get_record(ObjectId(f))
            # upload file

            # get correct bitstream file extension lookup
            try:
                filename, file_extension = os.path.splitext(df["name"])
                if "." in file_extension:
                    file_extension = file_extension.replace(".", "")
                ext = self.get_media_type_from_file_ext(file_extension)
            except Exception:
                # default to an empty media type if the extension lookup fails
                ext = ""
            now = str(datetime.date.today())
            print(df["name"])
            data["name"] = df["name"]
            data["created"] = now
            data["mimetype"] = ext

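            # each datafile in the bundle is attached to the dataset as a CKAN resource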
            fullurl = self.host["url"] + "resource_create"
            url = parse.urlparse(self.host["url"])

            #data["url"] = urljoin(self.hostname, "dataset/" + str(uuid.uuid4()))

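            # stream the file to the repository as a multipart upload together
            # with the resource metadata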
            with open(df["file_location"], 'rb') as f:
                files = [('upload', (df["name"], f, ext))]
                # data["upload"] = files
                try:
                    print(self.headers)

                    resp = requests.post(fullurl,
                                         data=data,
                                         files=files,
                                         headers=self.headers)
                    # print(resp.json()['headers'])
                except (TypeError, ValueError) as e:
                    print(e)
                    # for some reason this fails the first time, so retry once
                    resp = requests.post(fullurl,
                                         data=data,
                                         files=files,
                                         headers=self.headers)
                if resp.status_code == 200:
                    req = ThreadLocal.get_current_request()
                    details = json.loads(resp.content.decode("utf-8"))
                    details["result"]["repo_url"] = self.host["url"]
                    #details["result"]["url"] = req.build_absolute_uri("/") + "rest/get_accession_data?sub_id=" + sub_id
                    self._update_and_complete_submission(details, sub_id)
                elif resp.status_code == 400:
                    # try again checking for https
                    if "https" not in fullurl:
                        fullurl = fullurl.replace("http", "https")
                    resp = requests.post(fullurl,
                                         data=data,
                                         files=files,
                                         headers=self.headers)
                    if resp.status_code != 200:
                        msg = json.loads(
                            resp.content.decode("utf-8"))["error"]["message"]
                        return {"status": resp.status_code, "message": msg}
                    details = json.loads(resp.content.decode("utf-8"))
                    details["result"]["repo_url"] = self.host["url"]
                    self._update_and_complete_submission(details, sub_id)
                elif resp.status_code == 409:
                    fullurl = self.host["url"] + "package_show"
                    resp = requests.post(fullurl, data={"id": dataset_id})
                    #  now iterate through resources to get matching name
                    resources = json.loads(
                        resp.content.decode("utf-8"))["result"]["resources"]
                    fullurl = self.host["url"] + "resource_update"
                    # Submission().mark_submission_complete(ObjectId(sub_id))
                else:
                    return json.dumps({
                        "status": resp.status_code,
                        "message": resp.reason + " - " + resp.text
                    })

        Submission().mark_submission_complete(ObjectId(sub_id))
        return True