def releasinator(self, name, identifier, status):
    '''releases objects into their equivalent released states'''
    patch_dict = {}
    if name in self.current:
        log = "UPDATING: {} {} with status {} ".format(
            name, identifier, status) + "is now current"
        patch_dict = {"status": "current"}
    elif name in self.finished:
        log = "UPDATING: {} {} with status {} ".format(
            name, identifier, status) + "is now finished"
        patch_dict = {"status": "finished"}
    else:
        log = "UPDATING: {} {} with status {} ".format(
            name, identifier, status) + "is now released"
        patch_dict = {"status": "released"}
    if name in self.date_released:
        # if the object would have a date_released give it one
        now = datetime.datetime.now().date()
        patch_dict = {"date_released": str(now), "status": "released"}
        log += " with date {}".format(now)
    logger.info('%s' % log)
    if self.PRINTALL:
        print(log)
    encodedcc.patch_ENCODE(identifier, self.connection, patch_dict)
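
# The branching above reduces to a small lookup: the object's type name
# selects its released-equivalent status, and types that carry a
# date_released also get today's date. A minimal, runnable sketch of that
# mapping; the CURRENT/FINISHED/DATE_RELEASED sets are hypothetical
# stand-ins for the class attributes self.current etc.
import datetime

CURRENT = {"user", "lab"}                     # hypothetical type names
FINISHED = {"award"}
DATE_RELEASED = {"experiment", "annotation"}

def released_patch(name):
    """Build the patch dict releasinator would send for an object type."""
    if name in CURRENT:
        patch = {"status": "current"}
    elif name in FINISHED:
        patch = {"status": "finished"}
    else:
        patch = {"status": "released"}
    if name in DATE_RELEASED:
        # objects that track a release date also get today's date
        patch = {"date_released": str(datetime.datetime.now().date()),
                 "status": "released"}
    return patch

print(released_patch("user"))        # {'status': 'current'}
print(released_patch("experiment"))  # released, with date_released set
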
def main():
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    profiles = encodedcc.get_ENCODE('/profiles/', connection)
    for object_type in profiles.keys():
        profile_properties = encodedcc.get_ENCODE(
            '/profiles/' + object_type, connection).get('properties')
        # we should fix only objects that have alternate accessions property
        if profile_properties and profile_properties.get(
                'alternate_accessions'):
            uuid_2_alternate_accessions = {}
            objects = encodedcc.get_ENCODE('search/?type=' + object_type,
                                           connection)['@graph']
            for entry in objects:
                if entry.get('alternate_accessions'):
                    replaced_objects_accessions = []
                    for acc in entry.get('alternate_accessions'):
                        replaced_objects_accessions.extend(
                            retreive_list_of_replaced(acc,
                                                      connection))
                    if sorted(set(replaced_objects_accessions)) != \
                            sorted(entry.get('alternate_accessions')):
                        uuid_2_alternate_accessions[entry['uuid']] = \
                            set(replaced_objects_accessions)

            for uuid in uuid_2_alternate_accessions:
                uuid_sets_counter = 0
                # use other_uuid to avoid rebinding `key` (the ENC_Key above)
                for other_uuid in uuid_2_alternate_accessions:
                    if uuid_2_alternate_accessions[uuid] <= \
                       uuid_2_alternate_accessions[other_uuid]:
                        uuid_sets_counter += 1
                if uuid_sets_counter == 1:
                    for acc in list(uuid_2_alternate_accessions[uuid]):
                        to_clean_objects = encodedcc.get_ENCODE(
                            'search/?type=Item&accession=' + acc,
                            connection)['@graph']
                        for object_to_clean in to_clean_objects:
                            print(object_to_clean['uuid'] +
                                  ' alternate accessions list ' +
                                  str(object_to_clean[
                                      'alternate_accessions']) +
                                  ' is removed')
                            encodedcc.patch_ENCODE(
                                object_to_clean['uuid'],
                                connection,
                                {"alternate_accessions": []})

                    print(uuid + ' is patched with ' +
                          str({"alternate_accessions": list(
                              uuid_2_alternate_accessions[uuid])}))
                    encodedcc.patch_ENCODE(
                        uuid,
                        connection,
                        {"alternate_accessions": list(
                            uuid_2_alternate_accessions[uuid])})
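
# The double loop above is a maximality test: `a <= b` is Python's subset
# operator, so uuid_sets_counter counts how many recorded accession sets
# contain the current one (including itself). Only sets contained in no
# other set (a count of exactly 1) get patched. A toy run with
# hypothetical uuids and accessions:
sets = {
    "uuid-a": {"ENCSR000AAA", "ENCSR000BBB"},
    "uuid-b": {"ENCSR000AAA"},     # contained in uuid-a's set: skipped
    "uuid-c": {"ENCSR000CCC"},
}
for uuid, accs in sets.items():
    containing = sum(accs <= other for other in sets.values())
    if containing == 1:
        print(uuid, "would be patched with", sorted(accs))
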
 def updater(self, exp, con):
     ''' helper function runs the update step'''
     temp = encodedcc.get_ENCODE(exp, self.connection).get("controlled_by", [])
     if con not in temp:
         control = temp + [con]
         patch_dict = {"controlled_by": control}
         print("patching experiment file {} with controlled_by {}".format(exp, con))
         encodedcc.patch_ENCODE(exp, self.connection, patch_dict)
     else:
         print("ERROR: controlled_by for experiment file {} already contains {}".format(exp, con))
def file_manager(key, value, connection, obj_type):
    filename = key.split("/")[-1]
    print("Downloading {}".format(filename))
    r = requests.get(key)
    with open(filename, "wb") as outfile:
        outfile.write(r.content)
    if obj_type == "Biosample":
        filepart = filename.split("-")[0]
    else:
        filepart = filename.split("-")[1]

    attach = attachment(filename)
    temp = "_".join(key.split("/")[-2:])
    aliases = ["brenton-graveley:" + temp]

    if (encodedcc.get_ENCODE(quote(aliases[0]),
                             connection)['status']) != 'error':

        removing_patch = {'status': 'deleted', 'aliases': []}
        print('DELETING ' + aliases[0] + ' ' + str(removing_patch))
        encodedcc.patch_ENCODE(quote(aliases[0]), connection, removing_patch)

    upload = {
        "aliases": aliases,
        "attachment": attach,
        "award": "U54HG007005",
        "document_type": "general protocol",
        "lab": "/labs/brenton-graveley/",
        "status": "released",
        "description": "{obj_type} protocol for {filepart} shRNA followed "
                       "by RNA-seq".format(obj_type=obj_type,
                                           filepart=filepart),
    }

    print("Uploading {} as {}".format(filename, aliases[0]))

    encodedcc.new_ENCODE(connection, "Document", upload)

    print("Patching {} with document {}".format(value, aliases[0]))
    if obj_type == "Biosample":
        docs = {"protocol_documents": aliases}
    else:
        docs = {"documents": aliases}

    encodedcc.patch_ENCODE(quote(value), connection, docs)

    print("Removing document {}".format(filename))
    subprocess.run(["rm", filename])
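
# file_manager derives the document alias from the last two path segments
# of the download URL, so a re-run regenerates the same alias and can find
# (and delete) the previous upload before posting a fresh one. A quick
# sketch of that derivation with a hypothetical URL:
key = "https://example.org/documents/Biosample/HepG2-ABC1.pdf"

filename = key.split("/")[-1]          # 'HepG2-ABC1.pdf'
temp = "_".join(key.split("/")[-2:])   # 'Biosample_HepG2-ABC1.pdf'
alias = "brenton-graveley:" + temp     # stable alias across re-runs

print(filename, alias)
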
def check_ENCODE(self, idList, connection, otherIdList=[], bothDicts={}):
    for pmid in idList:
        extraData = bothDicts.get(pmid)
        ENCODEvalue = encodedcc.get_ENCODE("/search/?type=publication&searchTerm=PMID:" + pmid, connection)
        if ENCODEvalue.get("@graph"):
            log = "PMID " + pmid + " is listed in ENCODE"
            logger.info('%s' % log)
            uuid = ENCODEvalue.get("@graph")[0].get("uuid")
            if not self.CREATE_ONLY:
                self.compare_entrez_ENCODE(uuid, pmid, connection, extraData)
        else:
            if self.CREATE_ONLY:
                self.get_entrez([pmid])
            titleEntrez = self.entrezDict[pmid].get("title")
            found = False
            for otherID in otherIdList:
                titleENCODE = encodedcc.get_ENCODE("/search/?type=publication&searchTerm=" + otherID, connection)
                if titleENCODE.get("title") == titleEntrez:
                    log = pmid + " is in ENCODE by a different name " + titleENCODE.get("uuid")
                    logger.warning('%s' % log)
                    self.compare_entrez_ENCODE(titleENCODE.get("uuid"), pmid, connection, extraData)
                    if self.UPDATE:
                        newIdent = titleENCODE.get("identifiers")
                        newIdent.append("PMID:" + pmid)
                        patch_dict = {"identifiers": newIdent}
                        encodedcc.patch_ENCODE(titleENCODE.get("uuid"), connection, patch_dict)
                    found = True
            if found is False:
                log = "This publication is not listed in ENCODE " + pmid
                logger.warning('%s' % log)
                if self.CREATE:
                    self.POST_COUNT += 1
                    pmidData = self.entrezDict[pmid]
                    log = "POSTing the new object: " + pmid
                    logger.info('%s' % log)
                    post_dict = {
                        "title": pmidData.get("title"),
                        "abstract": pmidData.get("abstract"),
                        "submitted_by": "/users/8b1f8780-b5d6-4fb7-a5a2-ddcec9054288/",
                        "lab": "/labs/encode-consortium/",
                        "award": "/awards/ENCODE/",
                        "categories": extraData.get("categories"),
                        "published_by": extraData.get("published_by"),
                        "date_published": pmidData.get("date_published"),
                        "authors": pmidData.get("authors"),
                        "identifiers": ["PMID:" + pmid],
                        "journal": pmidData.get("journal"),
                        "volume": pmidData.get("volume"),
                        "issue": pmidData.get("issue"),
                        "page": pmidData.get("page"),
                        "status": "published"
                    }
                    if extraData.get("data_used"):
                        post_dict["data_used"] = extraData.get("data_used")
                    encodedcc.new_ENCODE(connection, "publications", post_dict)
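
# When check_ENCODE finds the publication under another identifier, it
# extends that record's identifiers list with the PMID and patches it
# back. The list manipulation as a pure function; unlike the code above it
# also guards against adding a duplicate entry (identifiers hypothetical):
def add_pmid(identifiers, pmid):
    """Return identifiers extended with the PMID tag, if not present."""
    tag = "PMID:" + pmid
    return identifiers if tag in identifiers else identifiers + [tag]

print(add_pmid(["doi:10.1000/example"], "12345"))
print(add_pmid(["PMID:12345"], "12345"))  # unchanged
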
def excel_reader(datafile, sheet, update, connection, patchall):
    row = reader(datafile, sheetname=sheet)
    keys = next(row)  # grab the first row of headers
    total = 0
    error = 0
    success = 0
    patch = 0
    for values in row:
        total += 1
        post_json = dict(zip(keys, values))
        post_json = dict_patcher(post_json)
        # add attachments here
        if post_json.get("attachment"):
            attach = attachment(post_json["attachment"])
            post_json["attachment"] = attach
        print(post_json)
        temp = {}
        if post_json.get("uuid"):
            temp = encodedcc.get_ENCODE(post_json["uuid"], connection)
        elif post_json.get("aliases"):
            temp = encodedcc.get_ENCODE(quote(post_json["aliases"][0]),
                                        connection)
        elif post_json.get("accession"):
            temp = encodedcc.get_ENCODE(post_json["accession"], connection)
        elif post_json.get("@id"):
            temp = encodedcc.get_ENCODE(post_json["@id"], connection)
        if temp.get("uuid"):
            if patchall:
                e = encodedcc.patch_ENCODE(temp["uuid"], connection, post_json)
                if e["status"] == "error":
                    error += 1
                elif e["status"] == "success":
                    success += 1
                    patch += 1
            else:
                print("Object {} already exists.  Would you like to patch it instead?".format(temp["uuid"]))
                i = input("PATCH? y/n ")
                if i.lower() == "y":
                    e = encodedcc.patch_ENCODE(temp["uuid"], connection, post_json)
                    if e["status"] == "error":
                        error += 1
                    elif e["status"] == "success":
                        success += 1
                        patch += 1
        else:
            if update:
                print("POSTing data!")
                e = encodedcc.new_ENCODE(connection, sheet, post_json)
                if e["status"] == "error":
                    error += 1
                elif e["status"] == "success":
                    success += 1
    print("{sheet}: {success} out of {total} posted, {error} errors, {patch} patched".format(
        sheet=sheet.upper(), success=success, total=total, error=error, patch=patch))
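
# The heart of excel_reader is pairing the header row with each data row
# via zip() to build the JSON for POST/PATCH. A self-contained sketch,
# with a hypothetical generator standing in for reader(datafile,
# sheetname=sheet):
def rows():
    yield ["accession", "status", "lab"]                   # header row
    yield ["ENCSR000AAA", "released", "/labs/example/"]
    yield ["ENCSR000BBB", "in progress", "/labs/example/"]

row = rows()
keys = next(row)  # grab the first row of headers
for values in row:
    post_json = dict(zip(keys, values))
    print(post_json)
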
def updater(self, exp, con):
    '''helper function runs the update step'''
    temp = encodedcc.get_ENCODE(exp,
                                self.connection).get("controlled_by", [])
    if con not in temp:
        control = temp + [con]
        patch_dict = {"controlled_by": control}
        print("patching experiment file {} with controlled_by {}".format(
            exp, con))
        encodedcc.patch_ENCODE(exp, self.connection, patch_dict)
    else:
        print(
            "ERROR: controlled_by for experiment file {} already contains {}"
            .format(exp, con))
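
# updater is an idempotent append: it re-fetches controlled_by and only
# patches when the control file is not already listed, so re-running the
# script cannot create duplicate entries. The guard as a pure function,
# with hypothetical file IDs:
def appended_controls(existing, con):
    """Return the new controlled_by list, or None if con is present."""
    return None if con in existing else existing + [con]

print(appended_controls(["/files/ENCFF001AAA/"], "/files/ENCFF002BBB/"))
print(appended_controls(["/files/ENCFF002BBB/"], "/files/ENCFF002BBB/"))
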
def renamer(file, connection, update):
    patch_dict = {}
    aliases = file.get("aliases", [])
    submitted = file.get("submitted_file_name", "")
    # str.rstrip strips a character set, not a suffix, so remove any
    # existing "_rm" suffix explicitly before re-appending it
    if submitted.endswith("_rm"):
        submitted = submitted[:-len("_rm")]
    submitted = submitted + "_rm"
    patch_dict["submitted_file_name"] = submitted
    if any(aliases):
        alias = aliases[0]
        if alias.endswith("_replaced"):
            alias = alias[:-len("_replaced")]
        patch_dict["aliases"] = [alias + "_replaced"]
    else:
        print("skipping {} with no aliases".format(file["@id"]))
    print("file {} with data {}".format(file["@id"], patch_dict))
    if update:
        encodedcc.patch_ENCODE(file["@id"], connection, patch_dict)
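
# str.rstrip strips a *character set*, not a suffix, which is why the
# rstrip("_rm") calls were replaced above with explicit suffix handling.
# A quick demonstration of the difference:
name = "sample_form"        # happens to end in characters from {_, r, m}
print(name.rstrip("_rm"))   # 'sample_fo' -- strips characters, not "_rm"

def strip_suffix(s, suffix):
    # spelled-out equivalent of str.removesuffix (Python 3.9+)
    return s[:-len(suffix)] if s.endswith(suffix) else s

print(strip_suffix(name, "_rm"))    # 'sample_form' -- no suffix, unchanged
print(strip_suffix("a_rm", "_rm"))  # 'a'
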
def replacer(file, connection, update):
    if file.get("aliases"):
        # this has aliases
        if file["aliases"][0].endswith("_replaced"):
            # this is one of the old ones; slice off the suffix, since
            # str.rstrip would strip characters rather than "_replaced"
            alias = file["aliases"][0][:-len("_replaced")]
            old_acc = file["accession"]
            old_date = file["date_created"]
            print(old_acc)
            new = encodedcc.get_ENCODE(quote(alias), connection)
            new_acc = new["accession"]
            new_date = new["date_created"]
            patch_dict = {
                "status": "replaced",
                "alternate_accessions": [alias]
            }
            print("file {} with date {} replaces file {} with date {}".format(
                new_acc, new_date, old_acc, old_date))
            if update:
                encodedcc.patch_ENCODE(file["@id"], connection, patch_dict)
    else:
        print("file {} has no aliases".format(file["@id"]))
def uploader(file_object, update):
    # NOTE: relies on a module-level `connection` (ENC_Connection) being set
    aws_return_code = encodedcc.upload_file(file_object, update)
    if aws_return_code:
        logger.warning('Non-zero AWS upload return code %d' % aws_return_code)
        print("Retrying upload to S3...")
        creds = file_object["upload_credentials"]
        expire = parse(creds["expiration"]).date()
        now = datetime.datetime.now().date()
        if now > expire:
            new_file_object = encodedcc.ENC_Item(
                connection, file_object["@id"])
            print("Your upload credentials are stale.  Getting new credentials.")
            file_object = new_file_object.new_creds()

        aws_retry = encodedcc.upload_file(file_object, update)
        if aws_retry:
            logger.warning('Non-zero AWS upload return code %d' % aws_retry)
            encodedcc.patch_ENCODE(file_object["@id"], connection,
                                   {"status": "upload failed"})
    return aws_return_code
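
# uploader retries once, refreshing credentials only when their expiration
# date has passed. The date check itself is plain stdlib logic; a sketch
# with a hypothetical ISO-format expiration string (the real code uses a
# parse() helper, likely dateutil's, on the credentials blob):
import datetime

creds = {"expiration": "2020-01-01T00:00:00"}  # hypothetical credentials

expire = datetime.datetime.fromisoformat(creds["expiration"]).date()
now = datetime.datetime.now().date()
if now > expire:
    print("Credentials are stale; request new ones before retrying.")
else:
    print("Credentials valid until", expire)
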
def excel_reader(datafile, sheet, update, connection, patchall):
    row = reader(datafile, sheetname=sheet)
    keys = next(row)  # grab the first row of headers
    total = 0
    error = 0
    success = 0
    patch = 0
    json_properties = encodedcc.get_ENCODE('/profiles/{}.json'.format(sheet),
                                           connection)['properties']
    new_accessions_aliases = []
    failed_postings = []
    for values in row:
        total += 1
        post_json = dict(zip(keys, values))
        post_json = dict_patcher(post_json)
        post_json = expose_objects(post_json, json_properties)
        # add attachments here
        if post_json.get("attachment"):
            attach = attachment(post_json["attachment"])
            post_json["attachment"] = attach
        print(post_json)
        temp = {}
        # Silence get_ENCODE failures.
        with encodedcc.print_muted():
            if post_json.get("uuid"):
                temp = encodedcc.get_ENCODE(post_json["uuid"], connection)
            elif post_json.get("aliases"):
                temp = encodedcc.get_ENCODE(quote(post_json["aliases"][0]),
                                            connection)
            elif post_json.get("accession"):
                temp = encodedcc.get_ENCODE(post_json["accession"], connection)
            elif post_json.get("@id"):
                temp = encodedcc.get_ENCODE(post_json["@id"], connection)
        if temp.get("uuid"):
            if patchall:
                e = encodedcc.patch_ENCODE(temp["uuid"], connection, post_json)
                if e["status"] == "error":
                    error += 1
                elif e["status"] == "success":
                    success += 1
                    patch += 1
            else:
                print(
                    "Object {} already exists.  Would you like to patch it instead?"
                    .format(temp["uuid"]))
                i = input("PATCH? y/n ")
                if i.lower() == "y":
                    e = encodedcc.patch_ENCODE(temp["uuid"], connection,
                                               post_json)
                    if e["status"] == "error":
                        error += 1
                    elif e["status"] == "success":
                        success += 1
                        patch += 1
        else:
            if update:
                print("POSTing data!")
                e = encodedcc.new_ENCODE(connection, sheet, post_json)
                if e["status"] == "error":
                    error += 1
                    failed_postings.append(
                        post_json.get('aliases', 'alias not specified'))
                elif e["status"] == "success":
                    new_object = e['@graph'][0]
                    # Print now and later.
                    print('New accession/UUID: {}'.format(
                        (new_object.get('accession', new_object.get('uuid')))))
                    new_accessions_aliases.append(
                        (new_object.get('accession', new_object.get('uuid')),
                         new_object.get('aliases')))
                    success += 1
    print(
        "{sheet}: {success} out of {total} posted, {error} errors, {patch} patched"
        .format(sheet=sheet.upper(),
                success=success,
                total=total,
                error=error,
                patch=patch))
    if new_accessions_aliases:
        print('New accession/UUID and alias:'
              if len(new_accessions_aliases) == 1
              else 'New accessions/UUIDs and aliases:')
        for (accession, alias) in new_accessions_aliases:
            # aliases may be missing (None) as well as empty
            if not alias:
                alias = 'alias not specified'
            else:
                alias = ', '.join(alias) if isinstance(alias, list) else alias
            print(accession, alias)
    if failed_postings:
        print('Posting failed for {} object(s):'.format(len(failed_postings)))
        for alias in failed_postings:
            print(', '.join(alias) if isinstance(alias, list) else alias)
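
# Compared with the earlier excel_reader, this version also records what
# each successful POST created. That bookkeeping is independent of the
# connection; a sketch with hypothetical response dicts shaped like the
# new_ENCODE return value:
responses = [
    {"status": "success", "@graph": [{"accession": "ENCSR000AAA",
                                      "aliases": ["lab:exp-1"]}]},
    {"status": "success", "@graph": [{"uuid": "0e5e1c63-0000-4000-8000-000000000000",
                                      "aliases": []}]},
]
new_accessions_aliases = []
for e in responses:
    new_object = e["@graph"][0]
    new_accessions_aliases.append(
        (new_object.get("accession", new_object.get("uuid")),
         new_object.get("aliases")))
print(new_accessions_aliases)
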
def main():

    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on", connection.server)
    if args.update:
        assert args.user, "A user must be provided to run this script!"
        user = encodedcc.get_ENCODE(args.user, connection).get("@id")
        assert user, "{} was not found in the ENCODE database as a registered user. Please try again".format(args.user)

    data = []
    idList = []
    with open(args.infile, "r") as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            data.append(row)
    for item in data:
        lanes = item.get("lanes", "")
        lanes = list(set(lanes.split(",")))
        item["lanes"] = lanes
        if not item["notes"]:
            item.pop("notes")
        if item.get("@id") not in idList:
            idList.append(item["@id"])
    objDict = {key: [] for key in idList}
    for item in data:
        objDict[item["@id"]].append(item)

    for idNum in objDict.keys():
        antibody = encodedcc.get_ENCODE(idNum, connection, frame="edit")
        new_antibody = {}
        if antibody.get("primary_characterization_method"):
            reviews = antibody.get("characterization_reviews", [])
            enc_docs = antibody.get("documents", [])
            file_docs = []
            for obj in objDict[idNum]:
                if obj.get("documents"):
                    for doc in obj["documents"].split(","):
                        file_docs.append(doc)
                if obj.get("notes"):
                    new_antibody["notes"] = obj["notes"]
            for doc in file_docs:
                if ":" in doc:
                    doc = quote(doc)
                link = encodedcc.get_ENCODE(doc, connection).get("@id")
                if link:
                    if link not in enc_docs:
                        enc_docs.append(link)

            #######################
            # begin lanes checking
            #######################
            enc_lanes_check = []
            file_lanes_check = []
            flag = False
            for r in reviews:
                enc_lanes_check.append(r["lane"])
            for item in objDict[idNum]:
                for l in item["lanes"]:
                    file_lanes_check.append(int(l))
            if len(set(enc_lanes_check)) < len(enc_lanes_check):
                # duplicate lanes in ENCODE
                print("Possible duplicate lanes in ENCODE")
                flag = True
            if len(set(file_lanes_check)) < len(file_lanes_check):
                # duplicate lanes in file
                print("Possible duplicate lanes in file")
                flag = True
            if len(set(enc_lanes_check) - set(file_lanes_check)) > 0:
                # more lanes in ENCODE than in file
                print("Found lanes in ENCODE not in the file")
                flag = True
            if len(set(file_lanes_check) - set(enc_lanes_check)) > 0:
                # more lanes in file than in ENCODE
                print("Found lanes in the file not in ENCODE")
                flag = True
            if flag:
                print("Some problem was found with the number of lanes in the file as compared to ENCODE")
                print("Do you want to continue running the program or exit and check the data?")
                i = input("Continue? y/n ")
                assert i.upper() == "Y"  # any answer but "Y" exits the script here
            for r in reviews:
                for line in objDict[idNum]:
                    for lane in line["lanes"]:
                        if int(lane) == r["lane"]:
                            if line["lane_status"].lower() == "pending dcc review":
                                print("can't set to pending review, need manual override")
                                fin = input("Change the status to 'pending dcc review'? y/n ")
                                if fin.upper() == "Y":
                                    r["lane_status"] = line["lane_status"].lower()
                                    # iterate over a copy; list.pop() takes an
                                    # index, so drop by value with remove()
                                    for link in list(enc_docs):
                                        if encodedcc.get_ENCODE(link, connection).get("document_type", "") == "standards document":
                                            enc_docs.remove(link)
                                else:
                                    pass
                            else:
                                r["lane_status"] = line["lane_status"].lower()
            # now all lanes in reviews should be updated to document
            enc_comp = 0
            enc_ncomp = 0
            other = 0

            for r in reviews:
                if r.get("lane_status", "") == "compliant":
                    enc_comp = enc_comp + 1
                elif r.get("lane_status", "") == "not compliant":
                    enc_ncomp = enc_ncomp + 1
                else:
                    other = other + 1
            if other > 0:
                print("not all lanes have allowed status, antibody characterization status set to not compliant")
                new_antibody["status"] = "not compliant"
            elif enc_comp > 0:
                new_antibody["status"] = "compliant"
            elif other == 0 and enc_comp == 0 and enc_ncomp > 0:
                new_antibody["status"] = "not compliant"
            ######################
            # end lanes checking
            ######################

            if antibody.get("lab", "") == "/labs/michael-snyder/":
                # make sure special document is added if not in the file
                if "michael-snyder:biorad_protein_standard" not in file_docs:
                    file_docs.append("michael-snyder:biorad_protein_standard")
                if antibody["primary_characterization_method"] == "immunoprecipitation":
                    if len(reviews) == 1:
                        # fix lane number
                        reviews[0]["lane"] = 3

            new_antibody["characterization_reviews"] = reviews
            new_antibody["documents"] = enc_docs
            if args.update:
                new_antibody["reviewed_by"] = user

        if args.update:
            print("PATCHing antibody characterization", idNum)
            encodedcc.patch_ENCODE(idNum, connection, new_antibody)
        else:
            print("PATCH data:", new_antibody)
def main():
    args = getArgs()
    outfile = args.outfile
    CREATE_ONLY = args.createonly
    UPDATE_ONLY = args.updateonly
    Entrez.email = args.email
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)

    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    print("Running on ", connection.server)

    publication = PublicationUpdate(args)

    if not UPDATE_ONLY:
        publication.setup_publication()
        pmidList = publication.consortium_ids + publication.community_ids
        mergeDicts = publication.consortium_dict.copy()
        mergeDicts.update(publication.community_dict)  # holds published_by, categories, and data_used

        if not CREATE_ONLY:
            publication.get_entrez(pmidList)

        community_ENCODE_Only, communityOtherID, consortium_ENCODE_Only, consortiumOtherID = publication.find_ENCODE_extras(publication.community_ids, publication.consortium_ids, connection)
        total_ENCODE_only = len(community_ENCODE_Only) + len(consortium_ENCODE_Only)
        allOtherIDs = communityOtherID + consortiumOtherID
        publication.check_ENCODE(pmidList, connection, allOtherIDs, mergeDicts)
        log = str(total_ENCODE_only) + " items in ENCODE but not in files"
        logger.info('%s' % log)
        log = str(publication.PATCH_COUNT) + " publication files PATCHed"
        logger.info('%s' % log)
        log = str(publication.POST_COUNT) + " publication files POSTed"
        logger.info('%s' % log)
        print("Results printed to", outfile)
    else:
        infile = UPDATE_ONLY
        with open(infile, 'r') as readfile:
            uuidList = [x.rstrip('\n') for x in readfile]
        # check each publication to see if it has a PMID, if it does add it to the PMIDlist
        # if it does not have one look it up on Entrez
        pmid_uuid_dict = {}
        for uuid in uuidList:
            pub = encodedcc.get_ENCODE(uuid, connection)
            title = pub.get("title", "")
            identifiers = pub.get("identifiers", [])
            found = False
            for i in identifiers:
                if "PMID:" in i:
                    p = i.split(":")[1]
                    found = True
            if found:
                pmid_uuid_dict[p] = uuid
            else:
                # search Entrez for publication by title
                handle = Entrez.esearch(db="pubmed", term=title)
                record = Entrez.read(handle)
                idlist = record["IdList"]
                if len(idlist) > 1:
                    log = "More than one possible PMID found for " + uuid
                    logger.error('%s' % log)
                    log = str(idlist) + " are possible PMIDs"
                    logger.error('%s' % log)
                elif len(idlist) == 0:
                    log = "No possible PMID found for " + uuid
                    logger.error('%s' % log)
                else:
                    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
                    records = Medline.parse(handle)
                    # save the records, you can convert them to a list
                    records = list(records)
                    for record in records:
                        pm = record.get("PMID")
                        ti = record.get("TI")
                        log = "Publication " + uuid + " with title \"" + title + "\" matches PMID:" + pm + " with title \"" + ti + "\""
                        logger.info('%s' % log)
                        identifiers.append("PMID:" + pm)
                        encodedcc.patch_ENCODE(uuid, connection, {"identifiers": identifiers})
                        pmid_uuid_dict[pm] = uuid
        pmidList = list(pmid_uuid_dict.keys())
        publication.get_entrez(pmidList)
        with open("pub_update.txt", "w") as f:
            for pmid in pmid_uuid_dict.keys():
                publication.compare_entrez_ENCODE(pmid_uuid_dict[pmid], pmid, connection)
            f.write(str(len(pmid_uuid_dict.keys())) + " publications checked " + str(publication.PATCH_COUNT) + " publications PATCHed")
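
# In the update-only branch a publication's PMID is recovered from its
# identifiers list before falling back to an Entrez title search. The
# extraction is a simple scan (identifiers hypothetical):
identifiers = ["doi:10.1000/example", "PMID:12345"]

pmid = None
for i in identifiers:
    if "PMID:" in i:
        pmid = i.split(":")[1]
print(pmid)  # '12345'
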
def compare_entrez_ENCODE(self, uuid, pmid, connection, extraData={}):
    '''compares value in ENCODE database to results from Entrez'''
    encode = encodedcc.get_ENCODE(uuid, connection)
    entrez = self.entrezDict.get(pmid)
    patch = False
    if not entrez:
        log = "PMID " + pmid + " was not found in Entrez database!!"
        logger.warning('%s' % log)
    else:
        log = "PMID " + pmid
        logger.info('%s' % log)
        for key in entrez.keys():
            if key in encode.keys():
                if entrez[key] == encode[key]:
                    log = "entrez key \"" + key + "\" matches encode key"
                    logger.info('%s' % log)
                else:
                    log = "\"" + key + "\" value in encode database does not match value in entrez database"
                    logger.warning('%s' % log)
                    log = "\tENTREZ: " + entrez[key] + "\n\tENCODE: " + encode[key]
                    logger.warning('%s' % log)
                    if self.UPDATE or self.UPDATE_ONLY:
                        log = "PATCH in the new value for \"" + key + "\""
                        logger.info('%s' % log)
                        patch_dict = {key: entrez[key]}
                        encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                        patch = True
            else:
                log = "ENCODE missing \"" + key + "\" from Entrez.  New key and value must be added"
                logger.warning('%s' % log)
                if self.UPDATE or self.UPDATE_ONLY:
                    log = "PATCHing in new key \"" + key + "\""
                    logger.info('%s' % log)
                    patch_dict = {key: entrez[key]}
                    encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                    patch = True
        if not self.UPDATE_ONLY:
            for key in extraData.keys():
                if type(extraData.get(key)) is list:
                    if set(encode.get(key, [])) == set(extraData.get(key, [])):
                        log = "encode \"" + key + "\" matches data in file"
                        logger.info('%s' % log)
                    else:
                        log = "encode \"" + key + "\" value " + str(encode.get(key, [])) + " does not match file"
                        logger.warning('%s' % log)
                        if self.UPDATE:
                            if any(extraData[key]):
                                patch_dict = {key: extraData[key]}
                                encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                                patch = True
                            else:
                                log = "No value in file to input for \"" + key + "\""
                                logger.warning('%s' % log)
                if type(extraData.get(key)) is str:
                    if encode.get(key, "") == extraData.get(key, ""):
                        log = "encode \"" + key + "\" matches data in file"
                        logger.info('%s' % log)
                    else:
                        log = "encode \"" + key + "\" value " + str(encode.get(key, "")) + " does not match file"
                        logger.warning('%s' % log)
                        if self.UPDATE:
                            patch_dict = {key: extraData[key]}
                            encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                            patch = True
        if encode.get("status", "") != "published" and (self.UPDATE or self.UPDATE_ONLY):
            log = "Setting status to published"
            logger.info('%s' % log)
            encodedcc.patch_ENCODE(uuid, connection, {"status": "published"})
            patch = True
        if patch is True:
            self.PATCH_COUNT += 1
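
# compare_entrez_ENCODE is a field-by-field diff: keys present in both
# records are compared, keys missing from ENCODE are added, and each
# mismatch becomes a one-key patch. The core decision as a pure function
# (record contents hypothetical):
def diff_records(entrez, encode):
    """Yield (key, new_value) pairs that would be PATCHed into ENCODE."""
    for key, value in entrez.items():
        if key not in encode or encode[key] != value:
            yield key, value

entrez = {"title": "Example paper", "journal": "Nature Methods"}
encode = {"title": "Example paper", "journal": "Nat. Methods"}
print(dict(diff_records(entrez, encode)))  # {'journal': 'Nature Methods'}
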
def main():

    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        '--infile',
        '-i',
        help="File containing the JSON object as a JSON string.")
    parser.add_argument('--server', help="Full URL of the server.")
    parser.add_argument('--key',
                        default='default',
                        help="The keypair identifier from the keyfile.  \
                        Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file.  Default is --keyfile\
                        =%s" % (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--authid', help="The HTTP auth ID.")
    parser.add_argument('--authpw', help="The HTTP auth PW.")
    parser.add_argument(
        '--force-put',
        default=False,
        action='store_true',
        help="Force the object to be PUT rather than PATCHed.  \
                        Default is False.")
    parser.add_argument('--get-only',
                        default=False,
                        action='store_true',
                        help="Do nothing but get the object and print it.  \
                        Default is False.")
    parser.add_argument('--id', help="URI for an object")
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages.  Default is False.")
    parser.add_argument(
        '--frame',
        help="define a frame to get back the JSON object, for use with --id. "
             "Default is frame=object",
        default="object")
    parser.add_argument('--type', help="the object's type")
    parser.add_argument(
        '--update',
        default=False,
        action='store_true',
        help="Let the script PATCH/POST the data.  Default is False")
    args = parser.parse_args()

    global DEBUG_ON
    DEBUG_ON = args.debug

    if args.get_only:
        GET_ONLY = True
    else:
        GET_ONLY = False

    key = encodedcc.ENC_Key(args.keyfile, args.key)
    if args.server and args.authpw and args.authid:
        key.server = args.server
        key.authid = args.authid
        key.authpw = args.authpw
        print("Creating authorization data from command line inputs")
    connection = encodedcc.ENC_Connection(key)
    print("Running on {}".format(connection.server))
    if args.update:
        print(
            "This is an UPDATE run! Data will be PATCHed or POSTed accordingly"
        )
    else:
        print("This is a dry run, no data will be changed")

    new_object = False
    if args.id:
        GET_ONLY = True
        print("Taking id to get from --id")
        new_json = {}
        uuid_response = {}
        accession_response = {}
        try:
            id_response = encodedcc.get_ENCODE(args.id,
                                               connection,
                                               frame=args.frame)
        except Exception:
            id_response = {}
            new_object = True
    else:

        if args.infile:
            infile = open(args.infile, 'r')
        else:
            infile = sys.stdin

        new_json_string = infile.read()

        new_json = json.loads(new_json_string)
        if args.debug:
            encodedcc.pprint_ENCODE(new_json)
        if '@id' in new_json:
            id_response = encodedcc.get_ENCODE(new_json['@id'], connection)
            if id_response.get("code") == 404:
                id_response = {}
                new_object = True
        else:
            id_response = {}
            new_object = True
        if 'uuid' in new_json:
            uuid_response = encodedcc.get_ENCODE(new_json['uuid'], connection)
            if uuid_response.get("code") == 404:
                uuid_response = {}
                new_object = True
        else:
            uuid_response = {}
            new_object = True
        if 'accession' in new_json:
            accession_response = encodedcc.get_ENCODE(new_json['accession'],
                                                      connection)
            if accession_response.get("code") == 404:
                accession_response = {}
                new_object = True
        else:
            accession_response = {}
            new_object = True

        if new_object:
            print(
                "No identifier in new JSON object.  Assuming POST or PUT with auto-accessioning."
            )

    object_exists = False
    if id_response:
        object_exists = True
        print("Found matching @id:")
        encodedcc.pprint_ENCODE(id_response)
    if uuid_response:
        object_exists = True
        print("Found matching uuid:")
        encodedcc.pprint_ENCODE(uuid_response)
    if accession_response:
        object_exists = True
        print("Found matching accession")
        encodedcc.pprint_ENCODE(accession_response)

    if id_response and uuid_response and (id_response != uuid_response):
        print("Existing id/uuid mismatch")
    if id_response and accession_response and (id_response !=
                                               accession_response):
        print("Existing id/accession mismatch")
    if uuid_response and accession_response and (uuid_response !=
                                                 accession_response):
        print("Existing uuid/accession mismatch")

    if new_object and object_exists:
        print(
            "Conflict:  At least one identifier already exists and at least one does not exist"
        )

    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    supported_collections = list(profiles.keys())
    if "Dataset" not in supported_collections:
        supported_collections.append("Dataset")

    type_list = new_json.pop('@type', [])
    if args.type:
        type_list = [args.type]
    if any(type_list):
        findit = False
        for x in supported_collections:
            if x.lower() == type_list[0].lower():
                type_list = [x]
                findit = True
        if findit:
            if args.debug:
                print("Object will have type of", type_list[0])
        else:
            print(
                "Error! JSON object does not contain one of the supported types"
            )
            print("Provided type:", type_list[0])
            print(
                "Please either change the JSON file or define the type with the --type feature"
            )
            sys.exit(1)
    else:
        print("No type found for JSON object!")
        sys.exit(1)

    possible_collections = [x for x in type_list if x in supported_collections]
    if possible_collections:
        # collection = possible_collections[0] + 's/'
        collection = possible_collections[0]
    else:
        collection = []
    if '@id' in new_json:
        identifier = new_json.pop('@id')
    elif 'uuid' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['uuid'] + '/'
        else:
            identifier = '/' + new_json['uuid'] + '/'
    elif 'accession' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['accession'] + '/'
        else:
            identifier = '/' + new_json['accession'] + '/'
    if 'attachment' in new_json:
        if 'href' in new_json['attachment']:
            pass
        else:
            try:
                filename = new_json['attachment']['download']
                print("Setting filename to %s" % (filename))
            except KeyError:
                print("Must specify either href or filename for attachment",
                      file=sys.stderr)
            if new_json['attachment'].get('type'):
                mime_type = new_json['attachment'].get('type')
            else:
                try:
                    mime_type, encoding = mimetypes.guess_type(filename)
                    major, minor = mime_type.split('/')
                    #detected_type = magic.from_file(filename, mime=True)
                    print("Detected mime type %s" % (mime_type))
                except Exception:
                    print("Failed to detect mime type in file %s" % (filename),
                          file=sys.stderr)
            try:
                with open(filename, 'rb') as stream:
                    print("opened")
                    newvalue = {
                        # just echoes the given filename as the download name
                        'download': filename,
                        'type': mime_type,
                        # b64encode returns bytes; decode so the data URI does
                        # not embed a literal b'...' representation
                        'href': 'data:%s;base64,%s' % (
                            mime_type,
                            b64encode(stream.read()).decode('ascii'))
                    }
                # dump the constructed attachment for inspection
                with open('tmp', 'w') as f:
                    print(newvalue, file=f)
                new_json.update({'attachment': newvalue})  # add
            except OSError:
                print("Cannot open file %s" % (filename), file=sys.stderr)
    if object_exists:
        if args.force_put:
            if not GET_ONLY:
                print("Replacing existing object")
                if args.update:
                    e = encodedcc.replace_ENCODE(identifier, connection,
                                                 new_json)
                    print(e)
        else:
            if not GET_ONLY:
                print("PATCHing existing object")
                if args.update:
                    e = encodedcc.patch_ENCODE(identifier, connection,
                                               new_json)
                    print(e)
    elif new_object:
        if args.force_put:
            if not GET_ONLY:
                print("PUT'ing new object")
                if args.update:
                    e = encodedcc.replace_ENCODE(identifier, connection,
                                                 new_json)
                    print(e)
        else:
            if not GET_ONLY:
                print("POST'ing new object")
                if not collection:
                    print(
                        "ERROR: Unable to POST to non-existing collection {}".
                        format(collection))
                    sys.exit(1)
                if args.update:
                    e = encodedcc.new_ENCODE(connection, collection, new_json)
                    print(e)
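
# When the attachment JSON carries only a filename, the script builds the
# href itself as a base64 data URI. With the bytes-decoding fix applied
# above, the construction is plain stdlib; a runnable sketch that writes a
# hypothetical stub file first:
import mimetypes
from base64 import b64encode

filename = "protocol.pdf"            # hypothetical local file
with open(filename, "wb") as fh:     # stub content so the sketch runs
    fh.write(b"%PDF-1.4 example")

mime_type, _ = mimetypes.guess_type(filename)
with open(filename, "rb") as stream:
    href = "data:%s;base64,%s" % (
        mime_type,
        # b64encode returns bytes; decode before embedding in the URI
        b64encode(stream.read()).decode("ascii"))
attachment = {"download": filename, "type": mime_type, "href": href}
print(attachment["type"], attachment["href"][:40])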
 def compare_entrez_ENCODE(self, uuid, pmid, connection, extraData={}):
     '''compares value in ENCODE database to results from Entrez
     '''
     encode = encodedcc.get_ENCODE(uuid, connection)
     entrez = self.entrezDict.get(pmid)
     patch = False
     if not entrez:
         log = "PMID " + pmid + " was not found in Entrez database!!"
         logger.warning('%s' % log)
     else:
         log = "PMID " + pmid
         logger.info('%s' % log)
         for key in entrez.keys():
             if key in encode.keys():
                 if entrez[key] == encode[key]:
                     log = "entrez key \"" + key + "\" matches encode key"
                     logger.info('%s' % log)
                 else:
                     log = "\"" + key + "\" value in encode database does not match value in entrez database"
                     logger.warning('%s' % log)
                     log = "\tENTREZ: " + entrez[key] + "\n\tENCODE: " + encode[key]
                     logger.warning('%s' % log)
                     if self.UPDATE or self.UPDATE_ONLY:
                         log = "PATCH in the new value for \"" + key + "\""
                         logger.info('%s' % log)
                         patch_dict = {key: entrez[key]}
                         encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                         patch = True
             else:
                 log = "ENCODE missing \"" + key + "\" from Entrez.  New key and value must be added"
                 logger.warning('%s' % log)
                 if self.UPDATE or self.UPDATE_ONLY:
                     log = "PATCHing in new key \"" + key + "\""
                     logger.info('%s' % log)
                     patch_dict = {key: entrez[key]}
                     encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                     patch = True
         if not self.UPDATE_ONLY:
             for key in extraData.keys():
                 if type(extraData.get(key)) is list:
                     if set(encode.get(key, [])) == set(extraData.get(key, [])):
                         log = "encode \"" + key + "\" matches data in file"
                         logger.info('%s' % log)
                     else:
                         log = "encode \"" + key + "\" value" + str(encode.get(key, [])) + "does not match file"
                         logger.warning('%s' % log)
                         if self.UPDATE:
                             if any(extraData[key]):
                                 patch_dict = {key: extraData[key]}
                                 encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                                 patch = True
                             else:
                                 log = "No value in file to input for \"" + key + "\""
                                 logger.warning('%s' % log)
                 if type(extraData.get(key)) is str:
                     if encode.get(key, "") == extraData.get(key, ""):
                         log = "encode \"" + key + "\" matches data in file"
                         logger.info('%s' % log)
                     else:
                         log = "encode \"" + key + "\" value" + str(encode.get(key, "")) + "does not match file"
                         logger.warning('%s' % log)
                         if self.UPDATE:
                             patch_dict = {key: extraData[key]}
                             encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                             patch = True
         if encode.get("status", "") != "published" and (self.UPDATE or self.UPDATE_ONLY):
             log = "Setting status to published"
             logger.info('%s' % log)
             encodedcc.patch_ENCODE(uuid, connection, {"status": "published"})
             patch = True
         if patch is True:
             self.PATCH_COUNT += 1
def main():
    args = getArgs()
    outfile = args.outfile
    CREATE_ONLY = args.createonly
    UPDATE_ONLY = args.updateonly
    Entrez.email = args.email
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)

    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    print("Running on ", connection.server)

    publication = PublicationUpdate(args)

    if not UPDATE_ONLY:
        publication.setup_publication()
        pmidList = publication.consortium_ids + publication.community_ids
        mergeDicts = publication.consortium_dict.copy()
        mergeDicts.update(publication.community_dict)  # holds published_by, categories, and data_used

        if not CREATE_ONLY:
            publication.get_entrez(pmidList)

        community_ENCODE_Only, communityOtherID, consortium_ENCODE_Only, consortiumOtherID = publication.find_ENCODE_extras(publication.community_ids, publication.consortium_ids, connection)
        total_ENCODE_only = len(community_ENCODE_Only) + len(consortium_ENCODE_Only)
        allOtherIDs = communityOtherID + consortiumOtherID
        publication.check_ENCODE(pmidList, connection, allOtherIDs, mergeDicts)
        log = str(total_ENCODE_only) + " items in ENCODE but not in files"
        logger.info('%s' % log)
        log = str(publication.PATCH_COUNT) + " publication files PATCHed"
        logger.info('%s' % log)
        log = str(publication.POST_COUNT) + " publication files POSTed"
        logger.info('%s' % log)
        print("Results printed to", outfile)
    else:
        infile = UPDATE_ONLY
        with open(infile, 'r') as readfile:
            uuidList = [x.rstrip('\n') for x in readfile]
        # check each publication for a PMID; if one is present, add it to the
        # PMID list, otherwise look the publication up on Entrez by title
        pmid_uuid_dict = {}
        for uuid in uuidList:
            pub = encodedcc.get_ENCODE(uuid, connection)
            title = pub.get("title", "")
            identifiers = pub.get("identifiers", [])
            found = False
            for i in identifiers:
                if "PMID:" in i:
                    p = i.split(":")[1]
                    found = True
                    break
            if found:
                pmid_uuid_dict[p] = uuid
            else:
                # search Entrez for publication by title
                handle = Entrez.esearch(db="pubmed", term=title)
                record = Entrez.read(handle)
                idlist = record["IdList"]
                if len(idlist) > 1:
                    log = "More than one possible PMID found for " + uuid
                    logger.error('%s' % log)
                    log = str(idlist) + " are possible PMIDs"
                    logger.error('%s' % log)
                elif len(idlist) == 0:
                    log = "No possible PMID found for " + uuid
                    logger.error('%s' % log)
                else:
                    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
                    records = Medline.parse(handle)
                    # save the records, you can convert them to a list
                    records = list(records)
                    for record in records:
                        pm = record.get("PMID")
                        ti = record.get("TI")
                        log = "Publication " + uuid + " with title \"" + title + "\" matches PMID:" + pm + " with title \"" + ti + "\""
                        logger.info('%s' % log)
                        identifiers.append("PMID:" + pm)
                        encodedcc.patch_ENCODE(uuid, connection, {"identifiers": identifiers})
                        pmid_uuid_dict[pm] = uuid
        pmidList = list(pmid_uuid_dict.keys())
        publication.get_entrez(pmidList)
        with open("pub_update.txt", "w") as f:
            for pmid in pmid_uuid_dict.keys():
                publication.compare_entrez_ENCODE(pmid_uuid_dict[pmid], pmid, connection)
            f.write(str(len(pmid_uuid_dict.keys())) + " publications checked " + str(publication.PATCH_COUNT) + " publications PATCHed")
# Example 24
def excel_reader(datafile, sheet, update, connection, patchall):
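    '''reads one worksheet into a JSON object per row, then PATCHes any object
    that already exists (looked up by uuid, alias, accession, or @id) and
    POSTs the rest, keeping success/error/patch tallies'''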
    row = reader(datafile, sheetname=sheet)
    keys = next(row)  # grab the first row of headers
    total = 0
    error = 0
    success = 0
    patch = 0
    for values in row:
        total += 1
        post_json = dict(zip(keys, values))
        post_json = dict_patcher(post_json)
        # add attachments here
        if post_json.get("attachment"):
            attach = attachment(post_json["attachment"])
            post_json["attachment"] = attach
        print(post_json)
        temp = {}
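        # look up an existing object by whichever identifier the row provides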
        if post_json.get("uuid"):
            temp = encodedcc.get_ENCODE(post_json["uuid"], connection)
        elif post_json.get("aliases"):
            temp = encodedcc.get_ENCODE(quote(post_json["aliases"][0]),
                                        connection)
        elif post_json.get("accession"):
            temp = encodedcc.get_ENCODE(post_json["accession"], connection)
        elif post_json.get("@id"):
            temp = encodedcc.get_ENCODE(post_json["@id"], connection)
        if temp.get("uuid"):
            if patchall:
                e = encodedcc.patch_ENCODE(temp["uuid"], connection, post_json)
                if e["status"] == "error":
                    error += 1
                elif e["status"] == "success":
                    success += 1
                    patch += 1
            else:
                print(
                    "Object {} already exists.  Would you like to patch it instead?"
                    .format(temp["uuid"]))
                i = input("PATCH? y/n ")
                if i.lower() == "y":
                    e = encodedcc.patch_ENCODE(temp["uuid"], connection,
                                               post_json)
                    if e["status"] == "error":
                        error += 1
                    elif e["status"] == "success":
                        success += 1
                        patch += 1
        else:
            if update:
                print("POSTing data!")
                e = encodedcc.new_ENCODE(connection, sheet, post_json)
                if e["status"] == "error":
                    error += 1
                elif e["status"] == "success":
                    success += 1
    print(
        "{sheet}: {success} out of {total} posted, {error} errors, {patch} patched"
        .format(sheet=sheet.upper(),
                success=success,
                total=total,
                error=error,
                patch=patch))
def main():
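    '''matches each object's replacement files against their "_replaced"
    predecessors (same replicate, file type, run type, and paired end) and
    swaps their statuses'''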
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on {}".format(connection.server))
    accessions = []
    if args.object:
        if os.path.isfile(args.object):
            accessions = [line.strip() for line in open(args.object)]
        else:
            accessions = args.object.split(",")
    elif args.query:
        if "search" in args.query:
            temp = encodedcc.get_ENCODE(args.query,
                                        connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        if any(temp):
            for obj in temp:
                if obj.get("accession"):
                    accessions.append(obj["accession"])
                elif obj.get("uuid"):
                    accessions.append(obj["uuid"])
                elif obj.get("@id"):
                    accessions.append(obj["@id"])
                else:
                    print("ERROR: object has no identifier", file=sys.stderr)
    if len(accessions) == 0:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        files = encodedcc.get_ENCODE(acc, connection).get("original_files", [])
        new_files = {}
        old_files = {}
        for f in files:
            file = encodedcc.get_ENCODE(f, connection)
            #renamer(file, connection, args.update)
            #replacer(file, connection, args.update)
            if any(file.get("aliases", [])):
                # this has aliases
                if file["aliases"][0].endswith("_replaced"):
                    # this is one of the old ones
                    dict_maker(file, old_files)
                else:
                    # this is a new file
                    dict_maker(file, new_files)
            else:
                print("file {} has no aliases".format(file["@id"]))

        for new in new_files.keys():
            new_temp = new_files[new]
            for old in old_files.keys():
                old_temp = old_files[old]

                if new_temp["replicate"] == old_temp["replicate"]:
                    #print(new_temp["replicate"], old_temp["replicate"])

                    if new_temp["file_type"] == old_temp["file_type"]:
                        #print(new_temp["file_type"], old_temp["file_type"])

                        if new_temp["run_type"] == old_temp["run_type"]:
                            #print(new_temp["run_type"], old_temp["run_type"])

                            if new_temp["paired_end"] == old_temp[
                                    "paired_end"]:
                                #print(new_temp["paired_end"], old_temp["paired_end"])
                                print(
                                    "New file {} with date {} replacing old file {} with date {}"
                                    .format(new, new_temp["date"], old,
                                            old_temp["date"]))
                                if args.update:
                                    #replace old file
                                    encodedcc.patch_ENCODE(
                                        old, connection,
                                        {"status": "replaced"})
                                    # release and update new file
                                    patch_dict = {
                                        "status": "released",
                                        "alternate_accessions": [old]
                                    }
                                    encodedcc.patch_ENCODE(
                                        new, connection, patch_dict)
# Example 26
def main():
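    '''applies lane statuses from a tab-separated review file to antibody
    characterization objects, adding or pruning documents and deriving the
    overall characterization status from the per-lane results'''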

    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on", connection.server)
    if args.update:
        assert args.user, "A user must be provided to run this script!"
        user = encodedcc.get_ENCODE(args.user, connection).get("@id")
        assert user, "{} was not found in the ENCODE database as a registered user. Please try again".format(
            args.user)

    data = []
    idList = []
    with open(args.infile, "r") as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            data.append(row)
    for item in data:
        lanes = item.get("lanes", "")
        lanes = list(set(lanes.split(",")))
        item["lanes"] = lanes
        if not any(item["notes"]):
            item.pop("notes")
        if item.get("@id") not in idList:
            idList.append(item["@id"])
    objDict = {key: [] for key in idList}
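    # bucket the rows from the file by the characterization @id they describe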
    for item in data:
        objDict.get(item.get("@id", ""), "").append(item)

    for idNum in objDict.keys():
        antibody = encodedcc.get_ENCODE(idNum, connection, frame="edit")
        new_antibody = {}
        if antibody.get("primary_characterization_method"):
            reviews = antibody.get("characterization_reviews", [])
            enc_docs = antibody.get("documents", [])
            file_docs = []
            for obj in objDict[idNum]:
                if obj.get("documents"):
                    for doc in obj["documents"].split(","):
                        file_docs.append(doc)
                if obj.get("notes"):
                    new_antibody["notes"] = obj["notes"]
            for doc in file_docs:
                if ":" in doc:
                    doc = quote(doc)
                link = encodedcc.get_ENCODE(doc, connection).get("@id")
                if link:
                    if link not in enc_docs:
                        enc_docs.append(link)

            #######################
            # begin lanes checking
            #######################
            enc_lanes_check = []
            file_lanes_check = []
            flag = False
            for r in reviews:
                enc_lanes_check.append(r["lane"])
            for item in objDict[idNum]:
                for l in item["lanes"]:
                    file_lanes_check.append(int(l))
            if len(set(enc_lanes_check)) < len(enc_lanes_check):
                # duplicate lanes in ENCODE
                print("Possible duplicate lanes in ENCODE")
                flag = True
            if len(set(file_lanes_check)) < len(file_lanes_check):
                # duplicate lanes in file
                print("Possible duplicate lanes in file")
                flag = True
            if len(set(enc_lanes_check) - set(file_lanes_check)) > 0:
                # more lanes in ENCODE than in file
                print("Found lanes in ENCODE not in the file")
                flag = True
            if len(set(file_lanes_check) - set(enc_lanes_check)) > 0:
                # more lanes in file than in ENCODE
                print("Found lanes in the file not in ENCODE")
                flag = True
            if flag:
                print(
                    "A problem was found comparing the lanes in the file to the lanes in ENCODE"
                )
                print(
                    "Do you want to continue running the program or exit and check the data?"
                )
                i = input("Continue? y/n ")
                assert i.upper() == "Y"
                # exit the script
            for r in reviews:
                for line in objDict[idNum]:
                    for lane in line["lanes"]:
                        if int(lane) == r["lane"]:
                            if line["lane_status"].lower(
                            ) == "pending dcc review":
                                print(
                                    "can't set to pending review, need manual override"
                                )
                                fin = input(
                                    "Change the status to 'pending dcc review'? y/n "
                                )
                                if fin.upper() == "Y":
                                    r["lane_status"] = line[
                                        "lane_status"].lower()
                                    # iterate over a copy so entries can be removed safely
                                    for link in list(enc_docs):
                                        if encodedcc.get_ENCODE(
                                                link, connection
                                        ).get("document_type",
                                              "") == "standards document":
                                            enc_docs.remove(link)
                                else:
                                    pass
                            else:
                                r["lane_status"] = line["lane_status"].lower()
            # now all lanes in reviews should be updated to document
            enc_comp = 0
            enc_ncomp = 0
            other = 0

            for r in reviews:
                if r.get("lane_status", "") == "compliant":
                    enc_comp = enc_comp + 1
                elif r.get("lane_status", "") == "not compliant":
                    enc_ncomp = enc_ncomp + 1
                else:
                    other = other + 1
            if other > 0:
                print(
                    "not all lanes have allowed status, antibody characterization status set to not compliant"
                )
                new_antibody["status"] = "not compliant"
            elif enc_comp > 0:
                new_antibody["status"] = "compliant"
            elif other == 0 and enc_comp == 0 and enc_ncomp > 0:
                new_antibody["status"] = "not compliant"
            ######################
            # end lanes checking
            ######################

            if antibody.get("lab", "") == "/labs/michael-snyder/":
                # make sure special document is added if not in the file
                if "michael-snyder:biorad_protein_standard" not in file_docs:
                    file_docs.append("michael-snyder:biorad_protein_standard")
                if antibody[
                        "primary_characterization_method"] == "immunoprecipitation":
                    if len(reviews) == 1:
                        # fix lane number
                        reviews[0]["lane"] = 3

            new_antibody["characterization_reviews"] = reviews
            new_antibody["documents"] = enc_docs
            if args.update:
                new_antibody["reviewed_by"] = user

        if args.update:
            print("PATCHing antibody characterization", idNum)
            encodedcc.patch_ENCODE(idNum, connection, new_antibody)
        else:
            print("PATCH data:", new_antibody)
def main():
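    '''general-purpose GET/POST/PATCH/PUT tool: reads a JSON object from
    --infile or stdin (or fetches one with --id), reconciles it with the
    server by @id, uuid, and accession, then writes it back'''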

    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument('--infile', '-i',
                        help="File containing the JSON object as a JSON string.")
    parser.add_argument('--server',
                        help="Full URL of the server.")
    parser.add_argument('--key',
                        default='default',
                        help="The keypair identifier from the keyfile.  \
                        Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file.  Default is --keyfile\
                        =%s" % (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--authid',
                        help="The HTTP auth ID.")
    parser.add_argument('--authpw',
                        help="The HTTP auth PW.")
    parser.add_argument('--force-put',
                        default=False,
                        action='store_true',
                        help="Force the object to be PUT rather than PATCHed.  \
                        Default is False.")
    parser.add_argument('--get-only',
                        default=False,
                        action='store_true',
                        help="Do nothing but get the object and print it.  \
                        Default is False.")
    parser.add_argument('--id',
                        help="URI for an object"),
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages.  Default is False.")
    parser.add_argument('--frame',
                        help="define a frame to get back the JSON object, for use with --id. Default is frame=object",
                        default="object")
    parser.add_argument('--type',
                        help="the object's type")
    args = parser.parse_args()

    global DEBUG_ON
    DEBUG_ON = args.debug

    GET_ONLY = args.get_only

    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)

    new_object = False
    if args.id:
        GET_ONLY = True
        print("Taking id to get from --id")
        new_json = {}
        uuid_response = {}
        accession_response = {}
        try:
            id_response = encodedcc.get_ENCODE(args.id, connection, frame=args.frame)
        except Exception:
            id_response = {}
            new_object = True
    else:
        if args.infile:
            infile = open(args.infile, 'r')
        else:
            infile = sys.stdin

        new_json_string = infile.read()

        new_json = json.loads(new_json_string)
        if '@id' in new_json:
            try:
                id_response = encodedcc.get_ENCODE(new_json['@id'], connection)
            except Exception:
                id_response = {}
                new_object = True
        else:
            id_response = {}
        if 'uuid' in new_json:
            try:
                uuid_response = encodedcc.get_ENCODE(new_json['uuid'], connection)
            except Exception:
                uuid_response = {}
                new_object = True
        else:
            uuid_response = {}
        if 'accession' in new_json:
            try:
                accession_response = encodedcc.get_ENCODE(new_json['accession'], connection)
            except Exception:
                accession_response = {}
                new_object = True
        else:
            print("No identifier in new JSON object.  Assuming POST or PUT with auto-accessioning.")
            new_object = True
            accession_response = {}

    object_exists = False
    if id_response:
        object_exists = True
        print("Found matching @id:")
        encodedcc.pprint_ENCODE(id_response)
    if uuid_response:
        object_exists = True
        print("Found matching uuid:")
        encodedcc.pprint_ENCODE(uuid_response)
    if accession_response:
        object_exists = True
        print("Found matching accession")
        encodedcc.pprint_ENCODE(accession_response)

    if id_response and uuid_response and (id_response != uuid_response):
        print("Existing id/uuid mismatch")
    if id_response and accession_response and (id_response != accession_response):
        print("Existing id/accession mismatch")
    if uuid_response and accession_response and (uuid_response != accession_response):
        print("Existing uuid/accession mismatch")

    if new_object and object_exists:
        print("Conflict:  At least one identifier already exists and at least one does not exist")

    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    supported_collections = list(profiles.keys())
    if "Dataset" not in supported_collections:
        supported_collections.append("Dataset")

    type_list = new_json.pop('@type', [])
    if args.type:
        type_list = [args.type]
    if any(type_list):
        findit = False
        for x in supported_collections:
            if x.lower() == type_list[0].lower():
                type_list = [x]
                findit = True
        if findit:
            if args.debug:
                print("Object will have type of", type_list[0])
        else:
            print("Error! JSON object does not contain one of the supported types")
            print("Provided type:", type_list[0])
            print("Please either change the JSON file or define the type with the --type feature")
            sys.exit(1)
    else:
        print("No type found for JSON object!")
        sys.exit(1)

    possible_collections = [x for x in type_list if x in supported_collections]
    if possible_collections:
        # collection = possible_collections[0] + 's/'
        collection = possible_collections[0]
    else:
        collection = []
    if '@id' in new_json:
        identifier = new_json.pop('@id')
    elif 'uuid' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['uuid'] + '/'
        else:
            identifier = '/' + new_json['uuid'] + '/'
    elif 'accession' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['accession'] + '/'
        else:
            identifier = '/' + new_json['accession'] + '/'
    else:
        identifier = ''  # no identifier yet; the server will assign one on POST
    if 'attachment' in new_json:
        if 'href' in new_json['attachment']:
            pass
        else:
            try:
                filename = new_json['attachment']['download']
                print("Setting filename to %s" % (filename))
            except KeyError:
                print("Must specify either href or filename for attachment", file=sys.stderr)
                sys.exit(1)
            if new_json['attachment'].get('type'):
                mime_type = new_json['attachment'].get('type')
            else:
                try:
                    mime_type, encoding = mimetypes.guess_type(filename)
                    major, minor = mime_type.split('/')
                    #detected_type = magic.from_file(filename, mime=True)
                    print("Detected mime type %s" % (mime_type))
                except (AttributeError, ValueError):
                    print("Failed to detect mime type in file %s" % (filename), file=sys.stderr)
                    sys.exit(1)
            try:
                with open(filename, 'rb') as stream:
                    print("opened")
                    newvalue = {
                        'download': filename,  # Just echoes the given filename as the download name
                        'type': mime_type,
                        'href': 'data:%s;base64,%s' % (mime_type, b64encode(stream.read()).decode('ascii'))
                    }
                # dump the encoded attachment to a scratch file for inspection
                with open('tmp', 'w') as f:
                    print(newvalue, file=f)
                new_json.update({'attachment': newvalue})  # add
            except IOError:
                print("Cannot open file %s" % (filename), file=sys.stderr)
    if object_exists:
        if args.force_put:
            if not GET_ONLY:
                print("Replacing existing object")
                e = encodedcc.replace_ENCODE(identifier, connection, new_json)
                print(e)
        else:
            if not GET_ONLY:
                print("Patching existing object")
                e = encodedcc.patch_ENCODE(identifier, connection, new_json)
                print(e)
    elif new_object:
        if args.force_put:
            if not GET_ONLY:
                print("PUT'ing new object")
                e = encodedcc.replace_ENCODE(identifier, connection, new_json)
                print(e)
        else:
            if not GET_ONLY:
                print("POST'ing new object")
                if not collection:
                    print("ERROR: Unable to POST to non-existing collection {}".format(collection))
                    sys.exit(1)
                e = encodedcc.new_ENCODE(connection, collection, new_json)
                print(e)
# Example 29
def main():
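    '''sweeps the objects returned by the query and re-points any link fields
    that still reference replaced objects'''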
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    keypair = (key.authid, key.authpw)
    server = key.server
    query = args.query

    objects = \
        encoded_get(server + 'search/?type=AntibodyLot' +
                    '&type=Donor&type=Biosample' +
                    '&type=File&type=Library' +
                    '&type=Dataset&type=Pipeline' +
                    '&type=Replicate' +
                    '&type=Treatment&format=json&' +
                    'frame=object&limit=all&' + query, keypair)['@graph']
    print('There are ' + str(len(objects)) +
          ' objects that should be inspected on the portal')
    counter = 0
    for obj in objects:
        counter += 1
        if counter % 1000 == 0:
            print('Script processed ' + str(counter) + ' objects')
        if obj['status'] not in ['replaced']:
            patching_data = {}

            # link fields that may still point at replaced objects,
            # grouped by the object type they belong to
            reference_fields = (
                # donor
                ['parent_strains', 'identical_twin', 'outcrossed_strain',
                 'littermates', 'fraternal_twin', 'parents', 'children',
                 'siblings'] +
                # file/experiment/biosample
                ['derived_from', 'paired_with', 'controlled_by',
                 'possible_controls', 'supersedes', 'dataset',
                 'related_files', 'related_datasets'] +
                # biosample
                ['host', 'part_of', 'originated_from', 'pooled_from',
                 'donor'] +
                # library
                ['biosample'] +
                # treatment
                ['biosamples_used', 'antibodies_used'] +
                # replicate
                ['antibody', 'experiment', 'library']
            )
            for field in reference_fields:
                fix_replaced_references(obj, field, patching_data, keypair,
                                        server)
            if patching_data:
                print('Patching object ' + obj['@type'][0] + '\t' +
                      obj['uuid'])
                print('OLD DATA:')
                for k in patching_data:
                    print('\t' + k + '\t' + str(obj[k]))
                print('---------')
                print('NEW DATA:')
                for k in patching_data:
                    print('\t' + k + '\t' + str(patching_data[k]))
                print('---------')
                encodedcc.patch_ENCODE(obj['uuid'], connection, patching_data)