コード例 #1
0
def main():
    """Collect object identifiers from --object or --query and GET each one.

    Identifiers may come from a file (one per line), a comma-separated
    string, or an ENCODE search/query result.  Exits with status 1 when no
    identifiers could be collected.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.object:
        if os.path.isfile(args.object):
            # close the file handle deterministically (original leaked it)
            with open(args.object) as infile:
                accessions = [line.strip() for line in infile]
        else:
            accessions = args.object.split(",")
    elif args.query:
        if "search" in args.query:
            temp = encodedcc.get_ENCODE(args.query,
                                        connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        for obj in temp:
            # prefer the most specific identifier available
            for field in ("accession", "uuid", "@id"):
                if obj.get(field):
                    accessions.append(obj[field])
                    break
            else:
                if obj.get("aliases"):
                    accessions.append(obj["aliases"][0])
                else:
                    print("ERROR: object has no identifier", file=sys.stderr)
    if len(accessions) == 0:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        encodedcc.get_ENCODE(acc, connection)
コード例 #2
0
def main():
    """Authenticate with the ENCODE server and run a patch_set batch."""
    cli_args = getArgs()
    auth = encodedcc.ENC_Key(cli_args.keyfile, cli_args.key)
    conn = encodedcc.ENC_Connection(auth)
    encodedcc.patch_set(cli_args, conn)
コード例 #3
0
def main():
    """Open a connection and run the Data_Release workflow."""
    cli_args = getArgs()
    auth = encodedcc.ENC_Key(cli_args.keyfile, cli_args.key)
    conn = encodedcc.ENC_Connection(auth)
    print("Running on", auth.server)
    # build the PROFILES reference dictionary
    Data_Release(cli_args, conn).run_script()
コード例 #4
0
def main():
    """Repair stale ``alternate_accessions`` links across ENCODE objects.

    For every object type whose schema declares an ``alternate_accessions``
    property, compare each object's stored alternate accessions against the
    set of replaced accessions reachable from them.  Where they disagree,
    clear the property on any other object still holding one of those
    accessions, then patch this object with the recomputed set.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    profiles = encodedcc.get_ENCODE('/profiles/', connection)
    for object_type in profiles.keys():
        profile_properties = encodedcc.get_ENCODE(
            '/profiles/' + object_type, connection).get('properties')
        # we should fix only objects that have alternate accessions property
        if profile_properties and profile_properties.get(
                'alternate_accessions'):
            uuid_2_alternate_accessions = {}
            objects = encodedcc.get_ENCODE('search/?type=' + object_type,
                                           connection)['@graph']
            for entry in objects:
                if entry.get('alternate_accessions'):
                    # expand each alternate accession into the accessions of
                    # the objects it replaced
                    replaced_objects_accessions = []
                    for acc in entry.get('alternate_accessions'):
                        replaced_objects_accessions.extend(
                            retreive_list_of_replaced(acc,
                                                      connection))
                    # record only entries whose stored list disagrees with
                    # the recomputed (deduplicated) set
                    if sorted(list(set(
                        replaced_objects_accessions))) != sorted(
                       entry.get('alternate_accessions')):
                        uuid_2_alternate_accessions[entry['uuid']] = \
                            set(replaced_objects_accessions)

            # only patch a set that is not a proper subset of any other
            # recorded set: the subset count includes the set itself, so a
            # count of exactly 1 means "maximal / unambiguous".
            # NOTE(review): the inner loop variable below shadows the outer
            # `key` (ENC_Key); harmless here because `key` is not used again,
            # but worth renaming.
            for uuid in uuid_2_alternate_accessions.keys():
                uuid_sets_counter = 0
                for key in uuid_2_alternate_accessions.keys():
                    if uuid_2_alternate_accessions[uuid] <= \
                       uuid_2_alternate_accessions[key]:
                        uuid_sets_counter += 1
                if uuid_sets_counter == 1:
                    for acc in list(uuid_2_alternate_accessions[uuid]):
                        # strip this accession from any other object that
                        # still lists it as an alternate
                        to_clean_objects = encodedcc.get_ENCODE(
                            'search/?type=Item&accession=' + acc,
                            connection)['@graph']
                        for object_to_clean in to_clean_objects:
                            print(object_to_clean['uuid'] +
                                  ' alternate accessions list ' +
                                  str(object_to_clean[
                                      'alternate_accessions']) +
                                  ' is removed')
                            encodedcc.patch_ENCODE(
                                object_to_clean['uuid'],
                                connection,
                                {"alternate_accessions": []})

                    print(uuid + ' is patched with ' +
                          str({"alternate_accessions": list(
                              uuid_2_alternate_accessions[uuid])}))
                    encodedcc.patch_ENCODE(
                        uuid,
                        connection,
                        {"alternate_accessions": list(
                            uuid_2_alternate_accessions[uuid])})
コード例 #5
0
def main():
    """Dispatch to the CHIP or RNA report generator based on --datatype."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    if args.datatype == 'CHIP':
        make_chip_report(connection)
    elif args.datatype == 'RNA':
        make_rna_report(connection)
    else:
        # fixed misspelled message (was 'unimplimented')
        print('unimplemented')
コード例 #6
0
def main():
    """Print one TSV line per object: its @id and the value of --field."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--infile',
                        '-i',
                        default='obList',
                        help="File containing a list of ENCSRs.")
    parser.add_argument('--search',
                        default='NULL',
                        help="The search parameters.")
    parser.add_argument(
        '--key',
        default='default',
        help="The keypair identifier from the keyfile.  Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file.  Default is --keyfile=%s" %
                        (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages.  Default is False.")
    parser.add_argument('--field',
                        default='accession',
                        help="The field to report.  Default is accession.")
    args = parser.parse_args()

    # NOTE: the original assigned args.debug to an unused local DEBUG_ON;
    # that dead assignment has been removed.

    myKey = encodedcc.ENC_Key(args.keyfile, args.key)
    myConnect = encodedcc.ENC_Connection(myKey)

    # Get the list of objects we are interested in
    objList = get_experiment_list(args.infile, args.search, myConnect)
    for obj_ref in objList:  # iterate items directly (no index arithmetic)
        field = ''
        if obj_ref != '':
            ob = encodedcc.get_ENCODE(obj_ref, myConnect)
            obj_id = ob.get('@id')  # renamed from `id` (shadowed builtin)
            if args.field in ob:
                field = str(ob[args.field])
        else:
            obj_id = obj_ref
        # NOTE(review): if '@id' is missing, obj_id is None and join raises
        # TypeError — same as the original behavior.
        print('\t'.join([obj_id, field]))
コード例 #7
0
def main():
    """Report non-released experiments whose first control is released.

    Prints a TSV of experiment accession/status and the first control's
    accession/status for every experiment with possible_controls.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    # default to an empty list so a result without "@graph" doesn't crash
    data = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
    print("Experiment\tStatus\tControl\tStatus")
    for exp in data:
        if exp.get("possible_controls") and exp["status"] != "released":
            # only the first listed control is inspected
            control = encodedcc.get_ENCODE(exp["possible_controls"][0],
                                           connection)
            if control["status"] == "released":
                print("{}\t{}\t{}\t{}".format(exp["accession"],
                                              exp["status"],
                                              control["accession"],
                                              control["status"]))
    '''
コード例 #8
0
def main():
    """Export selected biosample fields for a set of accessions as TSV."""
    headers = [
        "accession", "description", "organism", "age_display", "life_stage",
        "sex", "biosample_term_name", "biosample_type",
        "depleted_in_term_name", "phase", "subcellular_fraction_term_name",
        "post_synchronization_time", "post_synchronization_time_units",
        "synchronization", "model_organism_mating_status", "treatments",
        "donor", "transfection_type", "talens", "constructs",
        "model_organism_donor_constructs", "rnais", "part_of", "pooled_from",
        "derived_from", "status", "culture_harvest_date", "culture_start_date",
        "date_obtained", "lab", "source", "note", "notes", "health_status",
        "starting_amount", "starting_amount_units"
    ]
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.query:
        temp = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        for obj in temp:
            accessions.append(obj.get("accession"))
    elif args.infile:
        # close the input file deterministically (original leaked the handle)
        with open(args.infile) as infile:
            accessions = [line.strip() for line in infile]
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    data = []
    for acc in accessions:
        obj = encodedcc.get_ENCODE(acc, connection)
        # BUG FIX: the original used any(x), which raises TypeError for
        # non-iterable values such as a numeric starting_amount; plain
        # truthiness handles strings, lists, and numbers alike.
        data.append({h: (obj.get(h, "") or "") for h in headers})
    writer = csv.DictWriter(sys.stdout, delimiter='\t', fieldnames=headers)
    writer.writeheader()
    for d in data:
        writer.writerow(d)
コード例 #9
0
def main():
    """Round-trip a get_fields report: fetch, normalize numbers, re-patch.

    Writes args.outfile via encodedcc.get_fields, rewrites every
    non-accession column through format_number, then feeds the file back
    to encodedcc.patch_set.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    encodedcc.get_fields(args, connection)
    with open(args.outfile, "r") as tsvfile:
        data = list(csv.DictReader(tsvfile, delimiter='\t'))
    for row in data:
        # loop variable renamed from `key`, which shadowed the ENC_Key above
        for column in row:
            if column != "accession":
                row[column] = format_number(row[column])
    header = ["accession", args.onefield]
    with open(args.outfile, "w") as tsvfile:
        writer = csv.DictWriter(tsvfile, delimiter='\t', fieldnames=header)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
    # patch_set reads its accession list from args.infile
    args.infile = args.outfile
    encodedcc.patch_set(args, connection)
コード例 #10
0
def main():
    """Print read lengths (and optionally headers) for fastq files.

    Accessions come from --infile (a file path or comma-separated list) or
    from a query whose experiments' fastq files are collected; each fastq
    is streamed and one line per read is printed.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.infile:
        if os.path.isfile(args.infile):
            # close the file handle deterministically (original leaked it)
            with open(args.infile) as infile:
                accessions = [line.strip() for line in infile]
        else:
            accessions = args.infile.split(",")
    elif args.query:
        if "search" in args.query:
            data = encodedcc.get_ENCODE(args.query,
                                        connection).get("@graph", [])
        else:
            data = [encodedcc.get_ENCODE(args.query, connection)]
        for exp in data:
            for f in exp.get("files", []):
                res = encodedcc.get_ENCODE(f, connection)
                if res.get("file_format", "") == "fastq":
                    accessions.append(res["accession"])
    else:
        print("No accessions to check")
        sys.exit(1)
    for acc in accessions:
        link = "/files/" + acc + "/@@download/" + acc + ".fastq.gz"
        for header, sequence, qual_header, quality in encodedcc.fastq_read(
                connection, uri=link):
            # reads arrive as bytes; decode only what is printed
            if args.header:
                header = header.decode("UTF-8")
                print(acc + "\t" + str(len(sequence)) + "\t" + header)
            else:
                sequence = sequence.decode("UTF-8")
                print(acc + "\t" + str(len(sequence)))
コード例 #11
0
def main():
    """Attach protocol documents to Library and/or Biosample objects.

    Reads protocol URLs from --lib/--bio (one per line), derives a
    "brenton-graveley:<file-stem>" alias for each, and hands each pair to
    file_manager.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    if args.lib:
        # close the file handle deterministically (original leaked it)
        with open(args.lib) as fh:
            libraries = [line.strip() for line in fh]
        # map each protocol URL to a lab alias built from its file stem,
        # e.g. ".../L-AKAP1-LV08-3.pdf" -> "brenton-graveley:L-AKAP1-LV08-3"
        lib_alias = {
            url: "brenton-graveley:" + url.split("/")[-1].split(".")[0]
            for url in libraries
        }
        for url, alias in lib_alias.items():
            file_manager(url, alias, connection, "Library")

    if args.bio:
        with open(args.bio) as fh:
            biosamples = [line.strip() for line in fh]
        bio_alias = {
            url: "brenton-graveley:" + url.split("/")[-1].split(".")[0]
            for url in biosamples
        }
        for url, alias in bio_alias.items():
            file_manager(url, alias, connection, "Biosample")
コード例 #12
0
def main():
    """Validate sheet names in an Excel workbook and dispatch excel_reader.

    Each sheet whose lowercased name matches a supported profile/object
    type is processed; unsupported names are reported on stderr.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on {server}".format(server=connection.server))
    if not os.path.isfile(args.infile):
        # BUG FIX: the original template had no placeholder and always
        # printed the literal text "(unknown)"
        print("File {filename} not found!".format(filename=args.infile))
        sys.exit(1)
    if args.type:
        names = [args.type]
    else:
        book = xlrd.open_workbook(args.infile)
        names = book.sheet_names()
    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    # the original first built an unused mixed-case list and immediately
    # overwrote it; only the lowercased list is needed for comparison
    supported_collections = [s.lower() for s in profiles.keys()]
    for n in names:
        if n.lower() in supported_collections:
            excel_reader(args.infile, n, args.update, connection,
                         args.patchall)
        else:
            print("Sheet name '{name}' not part of supported object types!".
                  format(name=n),
                  file=sys.stderr)
コード例 #13
0
def main():
    """Fetch the requested fields for the query via GetFields."""
    cli_args = getArgs()
    auth = encodedcc.ENC_Key(cli_args.keyfile, cli_args.key)
    conn = encodedcc.ENC_Connection(auth)
    fetcher = encodedcc.GetFields(conn, cli_args)
    fetcher.get_fields()
コード例 #14
0
def main():
    """Find and repair references to replaced objects across many types.

    Searches several ENCODE object types; for every non-replaced object,
    runs fix_replaced_references over each link field that can point at a
    replaced object, then patches the object if anything changed.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    keypair = (key.authid, key.authpw)
    server = key.server
    query = args.query

    objects = \
        encoded_get(server + 'search/?type=AntibodyLot' +
                    '&type=Donor&type=Biosample' +
                    '&type=File&type=Library' +
                    '&type=Dataset&type=Pipeline' +
                    '&type=Replicate' +
                    '&type=Treatment&format=json&' +
                    'frame=object&limit=all&' + query, keypair)['@graph']
    print('There are ' + str(len(objects)) +
          ' objects that should be inspected on the portal')

    # Every link property that may reference a replaced object, grouped by
    # the object type it belongs to.  This replaces 27 copy-pasted calls in
    # the original; call order is preserved.
    link_fields = (
        # donor
        'parent_strains', 'identical_twin', 'outcrossed_strain',
        'littermates', 'fraternal_twin', 'parents', 'children', 'siblings',
        # file / experiment / biosample
        'derived_from', 'paired_with', 'controlled_by', 'possible_controls',
        'supersedes', 'dataset', 'related_files', 'related_datasets',
        # biosample
        'host', 'part_of', 'originated_from', 'pooled_from', 'donor',
        # library
        'biosample',
        # treatment
        'biosamples_used', 'antibodies_used',
        # replicate
        'antibody', 'experiment', 'library',
    )

    counter = 0
    for obj in objects:
        counter += 1
        if counter % 1000 == 0:
            print('Script processed ' + str(counter) + ' objects')
        if obj['status'] not in ['replaced']:
            patching_data = {}
            for field in link_fields:
                fix_replaced_references(obj, field, patching_data,
                                        keypair, server)
            if patching_data:
                print('Patching object ' + obj['@type'][0] + '\t' +
                      obj['uuid'])
                print('OLD DATA:')
                for k in patching_data:
                    print('\t' + k + '\t' + str(obj[k]))
                print('---------')
                print('NEW DATA:')
                for k in patching_data:
                    print('\t' + k + '\t' + str(patching_data[k]))
                print('---------')
                encodedcc.patch_ENCODE(obj['uuid'], connection, patching_data)
コード例 #15
0
def main():
    """Dump a tab-separated report of all objects in a collection.

    Builds column headings from the collection's JSON schema, fetches the
    objects (via --query, elasticsearch, or a plain collection GET),
    flattens each one, and prints a TSV to stdout.  Embedded columns
    ("parent.child") trigger a follow-up GET per row.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument('collection', help="The collection to get")
    parser.add_argument('--es',
                        default=False,
                        action='store_true',
                        help="Use elasticsearch")
    parser.add_argument(
        '--query',
        help="A complete query to run rather than GET the whole collection.  \
                        E.g. \"search/?type=biosample&lab.title=Ross Hardison, PennState\".  Implies --es."
    )
    parser.add_argument(
        '--submittable',
        default=False,
        action='store_true',
        help="Show only properties you might want a submitter to submit.")
    parser.add_argument('--key',
                        default='default',
                        help="The keypair identifier from the keyfile.  \
                        Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file.  Default is --\
                        keyfile=%s" % (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages.  Default is False.")

    args = parser.parse_args()

    keys = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(keys)

    global DEBUG
    DEBUG = args.debug

    supplied_name = args.collection

    # Normalize the supplied name into a search name and schema filename.
    if supplied_name.endswith('s'):
        # NOTE(review): rstrip('s') removes ALL trailing 's' characters, so
        # a name ending in "ss" loses both; kept as-is to preserve behavior
        # for the known ENCODE collection names.
        search_name = supplied_name.rstrip('s').replace('-', '_')
        schema_name = search_name + '.json'
    elif supplied_name.endswith('.json'):
        schema_name = supplied_name
        # BUG FIX: the original rstrip('.json') strips any trailing '.',
        # 'j', 's', 'o', 'n' characters (e.g. "annotations.json" ->
        # "annotati"); slice off the exact suffix instead.
        search_name = supplied_name[:-len('.json')]
    else:
        search_name = supplied_name.replace('-', '_')
        schema_name = search_name + '.json'

    schema_uri = '/profiles/' + schema_name
    object_schema = encodedcc.get_ENCODE(schema_uri, connection)
    headings = []
    for schema_property in object_schema["properties"]:
        prop = object_schema["properties"][schema_property]
        property_type = prop["type"]
        if isinstance(property_type, list):
            # multi-typed property: just pick the first declared type
            property_type = property_type[0]
        if property_type == 'string':
            # string properties: the heading is just the property name
            headings.append(schema_property)
        elif property_type == 'array':
            # arrays: heading is name:array (strings) or name:type:array;
            # first find the key holding the element sub-schema
            if 'items' in prop.keys():
                whateveritscalled = "items"
            elif 'reference' in prop.keys():
                whateveritscalled = "reference"
            elif 'url' in prop.keys():
                whateveritscalled = "url"
            else:
                print(prop.keys())
                raise NameError("None of these match anything I know")
            if prop[whateveritscalled]["type"] == 'string':
                headings.append(schema_property + ':array')
            else:
                try:
                    headings.append(
                        schema_property + ':' +
                        prop[whateveritscalled]["type"] + ':array')
                except Exception:
                    # element "type" may not be a concatenable string
                    # (narrowed from the original bare except:)
                    headings.append(schema_property + ':mixed:array')
        else:
            # neither string nor array: heading is name:type
            headings.append(schema_property + ':' + property_type)
    headings.sort()
    # TODO: add a parameter to request extra properties (e.g. award.rfa)
    # for collections other than file/dataset/source/award.
    if 'file' in supplied_name:
        headings.append('replicate.biological_replicate_number')
        headings.append('replicate.technical_replicate_number')
    if 'biosample' in supplied_name:
        headings.append('organ_slims')
    if 'access-key' in supplied_name:
        headings.append('user.title')
    if 'user' in supplied_name:
        headings.append('title')

    # properties hidden when --submittable is requested
    exclude_unsubmittable = [
        'accession', 'uuid', 'schema_version', 'alternate_accessions',
        'submitted_by'
    ]

    global collection
    if args.query:
        uri = args.query
        collection = encodedcc.get_ENCODE(uri, connection)
    elif args.es:
        uri = '/search/?type=' + search_name
        collection = encodedcc.get_ENCODE(uri, connection)
    else:
        collection = get_without_ESearch(search_name, connection)
    collected_items = collection['@graph']

    # build and print the heading row
    headstring = ""
    for heading in headings:
        if args.submittable and heading.split(':')[0] in exclude_unsubmittable:
            pass
        else:
            headstring += heading + '\t'
    headstring = headstring.rstrip()
    print(headstring)

    for item in collected_items:
        obj = encodedcc.flat_ENCODE(item)
        rowstring = ""
        for header in headstring.split('\t'):
            prop_key = header.split(':')[0]
            if prop_key in obj:
                tempstring = json.dumps(obj[prop_key]).lstrip('"').rstrip('"')
                if tempstring == '[]':
                    tempstring = ""
                rowstring += tempstring + '\t'
            elif '.' in prop_key:
                # embedded column like "replicate.biological_replicate_number":
                # fetch the linked object and pull the nested property
                try:
                    embedded_key = obj[prop_key.split('.')[0]]
                    if '/' in embedded_key:
                        embedded_obj = encodedcc.get_ENCODE(
                            embedded_key, connection)
                    else:
                        embedded_obj = encodedcc.get_ENCODE(
                            prop_key.split('.')[0] + '/' +
                            obj[prop_key.split('.')[0]], connection)
                    embedded_value_string = json.dumps(embedded_obj[
                        prop_key.split('.')[1]]).lstrip('"').rstrip('"')
                    if embedded_value_string == '[]':
                        embedded_value_string = ""
                except KeyError:
                    embedded_value_string = ""
                rowstring += embedded_value_string + '\t'
            else:
                rowstring += '\t'
        rowstring = rowstring.rstrip()
        print(rowstring)
コード例 #16
0
ファイル: reporter.py プロジェクト: mmmika/pyencoded-tools
def main():
    """Command-line entry point for the experiment reporter.

    Parses the command-line options, opens a connection to an ENCODE
    server, prunes the module-level column lists (``checkedItems``,
    ``repCheckedItems``, ``libraryCheckedItems``, ``fileCheckedItems``)
    according to the requested ``--datatype`` and flags, then prints one
    tab-separated report row per experiment/library to stdout.

    NOTE(review): the ``*CheckedItems`` lists are mutated in place with
    ``.remove(...)``, so calling this function twice in one process would
    raise ValueError on the second call — confirm if reuse is ever needed.
    """

    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument('--infile',
                        '-i',
                        default='objList',
                        help="File containing a list of ENCSRs.")
    parser.add_argument('--search',
                        default='NULL',
                        help="The search parameters.")
    parser.add_argument(
        '--datatype',
        default='OTHER',
        help="The datatype format to print your report. (CHIP,RNA,REPLI,OTHER)"
    )
    parser.add_argument(
        '--key',
        default='default',
        help=
        "The keypair identifier from the keyfile.  Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file.  Default is --keyfile=%s" %
                        (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages.  Default is False.")
    parser.add_argument('--details',
                        default=False,
                        action='store_true',
                        help="Print detailed report.  Default off")
    parser.add_argument('--status',
                        default=False,
                        action='store_true',
                        help="Print statuses of each object.  Default off")
    parser.add_argument('--mouse',
                        default=False,
                        action='store_true',
                        help="Print mouse specific details.  Default off")
    parser.add_argument('--simple',
                        default=False,
                        action='store_true',
                        help="Very simple output.  Default off")
    parser.add_argument('--library',
                        default=False,
                        action='store_true',
                        help="Print library details.  Default off")
    parser.add_argument('--files',
                        default=False,
                        action='store_true',
                        help="Print a file based report. Default off")
    parser.add_argument(
        '--nhgri',
        default=False,
        action='store_true',
        help="Print a library based report based on standards. Default off")
    parser.add_argument('--encode2',
                        default=False,
                        action='store_true',
                        help="Print dbxrefs for ENCODE2.  Default off")
    args = parser.parse_args()

    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    '''Adjust the checked list by the datatype'''
    # Each option below removes columns that do not apply to the chosen
    # datatype / verbosity; order matters because later removals assume the
    # earlier ones have not already dropped the same entries.
    if args.datatype != 'CHIP':
        checkedItems.remove('theTarget')
        checkedItems.remove('control_exps')
        repCheckedItems.remove('antibody')
        repCheckedItems.remove('antibody_status')
        repCheckedItems.remove('antibody_source')
        repCheckedItems.remove('antibody_product')
        repCheckedItems.remove('antibody_lot')

    if args.datatype != 'REPLI':
        libraryCheckedItems.remove('phase')

    if args.datatype != 'RNA':
        libraryCheckedItems.remove('subcellular_fraction_term_name')
        libraryCheckedItems.remove('library_treatments')
        libraryCheckedItems.remove('depleted_in_term_name')
        libraryCheckedItems.remove('spikeins_used')

    if args.simple:
        if args.datatype == 'CHIP':
            repCheckedItems.remove('antibody_status')
            repCheckedItems.remove('antibody_source')
            repCheckedItems.remove('antibody_product')
            repCheckedItems.remove('antibody_lot')

    if not args.details:
        checkedItems.remove('project')
        checkedItems.remove('submitter')
        checkedItems.remove('grant')
        checkedItems.remove('assay_term_id')
        checkedItems.remove('biosample_term_id')
        libraryCheckedItems.remove('nucleic_acid_term_id')
        libraryCheckedItems.remove('biosample_biosample_term')
        libraryCheckedItems.remove('biosample_biosample_id')
        libraryCheckedItems.remove('biosample_biosample_type')

    if not args.library:
        libraryCheckedItems.remove('lysis_method')
        libraryCheckedItems.remove('fragmentation_method')
        libraryCheckedItems.remove('fragmentation_date')
        libraryCheckedItems.remove('extraction_method')
        libraryCheckedItems.remove('library_size_selection_method')
        libraryCheckedItems.remove('size_range')
        libraryCheckedItems.remove('nucleic_acid_starting_quantity')
        libraryCheckedItems.remove('nucleic_acid_starting_quantity_units')

    if not args.status:
        libraryCheckedItems.remove('library_status')
        libraryCheckedItems.remove('biosample_status')
        libraryCheckedItems.remove('donor_status')
        repCheckedItems.remove('rep_status')
        checkedItems.remove('status')

    if not args.encode2:
        checkedItems.remove('dbxrefs')

    if not args.mouse:
        libraryCheckedItems.remove('strain')
        libraryCheckedItems.remove('strain_background')

    # Header row for the tab-separated report.
    if args.files:
        print('\t'.join(fileCheckedItems))
    else:
        print('\t'.join(checkedItems + repCheckedItems + libraryCheckedItems))

    # Get list of objects we are interested in
    search = args.search
    objList = get_experiment_list(args.infile, search, connection)

    if args.files:
        # File-based report is delegated entirely to reporter_files.
        reporter_files.files(objList, fileCheckedItems, connection)
        return
    else:
        for i in range(0, len(objList)):

            exp = encodedcc.get_ENCODE(objList[i],
                                       connection,
                                       frame='embedded')
            ob = {}

            # NOTE(review): this inner loop reuses the outer loop variable
            # name ``i``; harmless here because ``exp`` is fetched above and
            # the outer ``for`` rebinds ``i`` each pass, but fragile.
            for i in checkedItems:
                if i in exp:
                    ob[i] = exp[i]
                else:
                    ob[i] = ''
            '''Get the counts'''
            if 'replicates' in exp:
                ob['replicate_count'] = len(exp['replicates'])
            else:
                ob['replicate_count'] = 0
            if 'documents' in exp:
                ob['document_count'] = len(exp['documents'])
                ob['experiment_documents'] = get_doc_list(exp['documents'])
            else:
                ob['document_count'] = 0
                ob['experiment_documents'] = []
            if 'files' in exp:
                ob['file_count'] = len(exp['files'])
            else:
                ob['file_count'] = 0
            '''Get the experiment level ownership'''

            ob['lab_name'] = exp['lab']['name']
            ob['project'] = exp['award'].get('rfa')
            ob['grant'] = exp['award']['name']
            ob['submitter'] = exp['submitted_by']['title']
            ob['experiment_documents'] = get_doc_list(exp['documents'])

            # Join all dbxrefs into one " ; "-separated string.
            temp = ''
            for i in range(0, len(exp['dbxrefs'])):
                temp = temp + ' ; ' + exp['dbxrefs'][i]
            ob['dbxrefs'] = temp

            # NOTE(review): control_exps is a space-joined string when
            # possible_controls exists but an empty list otherwise — the
            # two branches produce different types.
            ob['control_exps'] = ''
            if 'possible_controls' in exp:
                for q in exp['possible_controls']:
                    ob['control_exps'] = ob['control_exps'] + \
                        ' ' + q['accession']
            else:
                ob['control_exps'] = []

            if 'target' in exp:
                ob['theTarget'] = exp['target']['label']

            # Bucket file accessions/counts by biological replicate number;
            # files spanning zero or multiple replicates go under 'no rep'.
            files_count = {}
            files_list = {}
            repIds = []
            for item in exp['files']:

                if item.get('biological_replicates') is None:
                    repId = 'no rep'
                elif len(item['biological_replicates']) == 1:
                    repId = item['biological_replicates'][0]
                else:
                    repId = 'no rep'

                if repId in files_list:
                    files_list[repId].append(item['accession'])
                else:
                    files_list[repId] = [item['accession']]

                if repId in files_count:
                    files_count[repId] = files_count[repId] + 1
                else:
                    files_count[repId] = 1

            libs = []

            # One output row per replicate; each row appends replicate- and
            # library-level values to the shared experiment-level columns.
            for q in range(0, ob['replicate_count']):
                rep = exp['replicates'][q]
                '''Inititalize rep object'''
                repOb = {}
                for field in libraryCheckedItems:
                    repOb[field] = ''
                for field in repCheckedItems:
                    if field in rep:
                        repOb[field] = rep[field]
                    else:
                        repOb[field] = ''
                if rep['biological_replicate_number'] in files_count:
                    repOb['files'] = files_list[
                        rep['biological_replicate_number']]
                    repOb['rep_file_count'] = files_count[
                        rep['biological_replicate_number']]
                else:
                    repOb['rep_file_count'] = 0
                    repOb['files'] = []
                repOb['replicate_aliases'] = rep['aliases']
                repOb['replicate_uuid'] = rep['uuid']
                repOb['rep_status'] = rep['status']
                if 'platform' in rep:
                    repOb['platform'] = rep['platform']['term_name']
                if 'antibody' in rep:
                    repOb['antibody'] = rep['antibody']['accession']
                    summary = get_char_summary(rep['antibody']['accession'],
                                               connection)
                    # Antibodies with no lot reviews are skipped entirely —
                    # note this also skips the library handling below.
                    if len(rep['antibody']['lot_reviews']) < 1:
                        continue
                    print('\t'.join([
                        'NHGRI',
                        exp['accession'],
                        rep['antibody']['accession'],
                        rep['antibody']['lot_reviews'][0]['status'],
                        'Characterizations failing:' +
                        repr(summary['number_chars_failing']),
                        'Characterizations passing:' +
                        repr(summary['number_chars_passing']),
                        'Characterizations in progress:' +
                        repr(summary['number_chars_in_progress']),
                    ]))
                    # repOb['antibody_status'] = rep['antibody']['approvals'][0]['status']
                    repOb['antibody_source'] = rep['antibody']['source']
                    repOb['antibody_product'] = rep['antibody']['product_id']
                    repOb['antibody_lot'] = rep['antibody']['lot_id']
                    repOb['antibody_status'] = rep['antibody']['lot_reviews'][
                        0]['status']
                lib = []

                # inititalize the lib with repItems
                for i in repCheckedItems:
                    if i in repOb:
                        lib.append(repr(repOb[i]))

                if 'library' in rep:

                    for field in libraryCheckedItems:
                        if field in rep['library']:
                            repOb[field] = rep['library'][field]
                    repOb['protocols'] = get_doc_list(
                        rep['library']['documents'])
                    repOb['library_treatments'] = get_treatment_list(
                        rep['library']['treatments'])
                    repOb['spikeins_used'] = get_spikeins_list(
                        rep['library'].get('spikeins_used'))
                    repOb['library_status'] = rep['library']['status']
                    if 'biosample' in rep['library']:
                        bs = rep['library']['biosample']
                        repOb['biosample_accession'] = bs['accession']
                        repOb['biosample_status'] = bs['status']
                        # NOTE(review): bare except below — it only guards a
                        # missing biosample_term_name, but would also mask
                        # any other error raised in the try body.
                        try:
                            repOb['biosample_biosample_term'] = bs[
                                'biosample_term_name']
                        except:
                            print(
                                "Skipping missing biosample_term_name in %s" %
                                (bs['accession']),
                                file=sys.stderr)
                            repOb['biosample_biosample_term'] = ""
                        repOb['biosample_biosample_id'] = bs[
                            'biosample_term_id']
                        repOb['biosample_biosample_type'] = bs[
                            'biosample_type']
                        ob['species'] = bs['organism']['name']
                        if 'subcellular_fraction_term_name' in bs:
                            repOb['subcellular_fraction_term_name'] = bs[
                                'subcellular_fraction_term_name']
                        else:
                            repOb[
                                'subcellular_fraction_term_name'] = 'unfractionated'

                        if bs['treatments'] != []:
                            repOb[
                                'biological_treatments'] = get_treatment_list(
                                    bs['treatments'])

                        if 'donor' in bs:
                            repOb['donor'] = bs['donor']['accession']
                            repOb['donor_status'] = bs['donor']['status']
                            repOb['strain'] = bs['donor'].get('strain')
                            repOb['strain_background'] = bs['donor'].get(
                                'strain_background')
                        for term in ('sex', 'phase', 'age', 'age_units',
                                     'life_stage'):
                            repOb[term] = bs.get(term)

                    temp = ' '.join(rep['library']['aliases'])
                    repOb['aliases'] = temp
                    # NOTE(review): list_libraries is reset to '' on every
                    # replicate, so it only ever holds the current library.
                    ob['list_libraries'] = ''
                    ob['list_libraries'] = ob['list_libraries'] + \
                        ' ' + rep['library']['accession']

                    for i in libraryCheckedItems:
                        if i in repOb:
                            lib.append(repr(repOb[i]))
                        else:
                            lib.append('')
                libs.append(lib)

            # Emit the experiment row once if there were no replicate rows,
            # otherwise one combined row per replicate.
            row = []
            for j in checkedItems:
                row.append(str(ob[j]))
            if len(libs) == 0:
                print('\t'.join(row))
            for k in range(0, len(libs)):
                print('\t'.join(row + libs[k]))
コード例 #17
0
def main():
    """Match Roadmap ChIP-seq experiments that are missing possible_controls
    to candidate control experiments via their shared biosamples.

    Side effects only (no return value):
      * results.txt         -- biosample accession, its control experiments,
                               and the missing-control experiments it links
      * missing_control.txt -- fuzzy matches for leftover experiments (same
                               lab, biosample term name, biosample type and
                               organism as a control)
      * extras.txt          -- experiments that could not be matched at all

    Relies on the module-level ``getArgs`` and ``encodedcc`` helpers and
    talks to the ENCODE server configured in the keyfile.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    # Released Roadmap ChIP-seq experiments whose target is a control.
    control = "/search/?type=experiment&award.project=Roadmap&status=released&assay_term_name=ChIP-seq&target.investigated_as=control"
    # Released Roadmap experiments audited as missing possible_controls.
    missing_control = "/search/?type=experiment&award.project=Roadmap&status=released&audit.NOT_COMPLIANT.category=missing+possible_controls"

    control_list = encodedcc.get_ENCODE(control, connection, frame="embedded").get("@graph", [])
    missing_list = encodedcc.get_ENCODE(missing_control, connection, frame="embedded").get("@graph", [])
    # biosample accession -> [control accessions, missing-control accessions]
    links = {}
    missing_accessions = []
    control_accessions = []
    for value in missing_list:
        missing_accessions.append(value["accession"])
    for value in control_list:
        control_accessions.append(value["accession"])
    print("building dictionary")
    for obj in control_list:
        if obj.get("replicates"):
            bio_acc = obj["replicates"][0]["library"]["biosample"]["accession"]
            links[bio_acc] = [[], []]
    print("sorting data")
    for obj in control_list:
        if obj.get("replicates"):
            control_acc = obj["accession"]
            bio_acc = obj["replicates"][0]["library"]["biosample"]["accession"]
            links[bio_acc][0].append(control_acc)
    for obj in missing_list:
        missing_acc = obj["accession"]
        # BUG FIX: the links lookup must only run for objects that actually
        # have replicates.  Previously it sat outside this "if", reusing
        # bio_acc left over from a prior iteration (or raising NameError if
        # the very first object had no replicates).
        if obj.get("replicates"):
            bio_acc = obj["replicates"][0]["library"]["biosample"]["accession"]
            if links.get(bio_acc):
                links[bio_acc][1].append(missing_acc)
    found_controls = []
    found_missings = []
    print("writing data")
    with open("results.txt", "w") as f:
        f.write("biosample" + "\t" + "possible control" + "\t" + "possible experiments" + "\n")
        # Loop variable renamed from "key" to avoid shadowing the ENC_Key
        # object created above.
        for bio_key in links.keys():
            c = ";".join(links[bio_key][0])
            for item in links[bio_key][0]:
                found_controls.append(item)
            e = ";".join(links[bio_key][1])
            for item in links[bio_key][1]:
                found_missings.append(item)
            if len(links[bio_key][1]) > 0:
                s = bio_key + "\t" + c + "\t" + e + "\n"
                f.write(s)
    print("checking for left over items")
    # Experiments the biosample join could not account for; try a fuzzy
    # match against every control instead.
    diff_missing = set(missing_accessions) - set(found_missings)
    header = ["experiment", "biosample", "description", "controls", "control description", "control biosample", "lab", "control lab", "bio term name", "control term name", "bio type", "control bio type", "age", "control age", "organism", "control organism"]
    temp_list = []
    possible = []
    for acc in diff_missing:
        exp = encodedcc.get_ENCODE(acc, connection, frame="embedded")
        temp = {}
        if exp.get("replicates"):
            bio = exp["replicates"][0]["library"]["biosample"]["accession"]
            lab = exp["lab"]["@id"]
            bio_name = exp["replicates"][0]["library"]["biosample"]["biosample_term_name"]
            bio_type = exp["replicates"][0]["library"]["biosample"]["biosample_type"]
            bio_age = exp["replicates"][0]["library"]["biosample"]["age"]
            organism = exp["replicates"][0]["library"]["biosample"]["organism"]["name"]
            des = exp["replicates"][0]["library"]["biosample"].get("description", "NONE")
            for con in control_list:
                if con.get("replicates"):
                    con_id = con["accession"]
                    con_bio = con["replicates"][0]["library"]["biosample"]["accession"]
                    con_lab = con["lab"]["@id"]
                    con_bio_name = con["replicates"][0]["library"]["biosample"]["biosample_term_name"]
                    con_bio_type = con["replicates"][0]["library"]["biosample"]["biosample_type"]
                    con_age = con["replicates"][0]["library"]["biosample"]["age"]
                    con_organism = con["replicates"][0]["library"]["biosample"]["organism"]["name"]
                    con_des = con["replicates"][0]["library"]["biosample"].get("description", "NONE")
                    # Fuzzy match: same term name, lab, biosample type and
                    # organism.  Later matches overwrite earlier ones in
                    # ``temp`` (only the last candidate is reported).
                    if bio_name == con_bio_name and lab == con_lab and bio_type == con_bio_type and organism == con_organism:
                        possible.append(acc)
                        temp["experiment"] = acc
                        temp["biosample"] = bio
                        temp["description"] = des
                        temp["controls"] = con_id
                        temp["control description"] = con_des
                        temp["control biosample"] = con_bio
                        temp["lab"] = lab
                        temp["control lab"] = con_lab
                        temp["bio term name"] = bio_name
                        temp["control term name"] = con_bio_name
                        temp["bio type"] = bio_type
                        temp["control bio type"] = con_bio_type
                        temp["age"] = bio_age
                        temp["control age"] = con_age
                        temp["organism"] = organism
                        temp["control organism"] = con_organism
        temp_list.append(temp)
    with open("missing_control.txt", "w") as tsvfile:
        writer = csv.DictWriter(tsvfile, delimiter='\t', fieldnames=header)
        writer.writeheader()
        for item in temp_list:
            writer.writerow(item)
    really_missing = set(diff_missing) - set(possible)
    with open("extras.txt", "w") as f:
        f.write(str(len(really_missing)) + " experiments unaccounted for\n")
        for line in really_missing:
            f.write(line + "\n")
    # BUG FIX: the summary previously said "missing_controls.txt" although
    # the file written above is "missing_control.txt".
    print("output written to results.txt, missing_control.txt and extras.txt")
    '''if args.update:
コード例 #18
0
def main():
    """Build and print an experiment status-matrix report for one datatype.

    Assembles canned ENCODE search-query fragments, combines them into row
    queries (status / audit / assembly buckets) and column queries
    (award RFA x organism, optionally restricted to one grant's labs), then
    dispatches to the datatype-specific ``make_*_report`` helper which does
    the actual searching and printing.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)

    # Lab filters keyed by the --grant option value.
    labs = {
        'stam': '&lab.title=John+Stamatoyannopoulos%2C+UW&lab.title=Job+Dekker%2C+UMass',
        'bernstein': '&lab.title=Bradley+Bernstein%2C+Broad',
        'gingeras': '&lab.title=Yijun+Ruan%2C+GIS&lab.title=Thomas+Gingeras%2C+CSHL&lab.title=Piero+Carninci%2C+RIKEN',
        'snyder': '&lab.title=Michael+Snyder%2C+Stanford&lab.title=Sherman+Weissman%2C+Yale&lab.title=Kevin+White%2C+UChicago&lab.title=Peggy+Farnham%2C+USC'
    }

    # ----------- QUERIES ----------------------------------------------------
    # NOTE(review): several fragments below are not referenced by any row or
    # column query; they are kept as a menu of ready-made filters.
    unreplicated_query = '&replication_type=unreplicated'
    replicated_query = '&replication_type!=unreplicated'
    not_pipeline_query = '&files.analysis_step_version.analysis_step.pipelines.title%21=Transcription+factor+ChIP-seq'
    no_peaks_query = '&files.file_type!=bigBed+narrowPeak'
    concordance_query = '&searchTerm=IDR%3Afail'  #'&searchTerm=IDR%3Afail'
    unrunnable_query = '&internal_status=unrunnable'
    pipeline_query = '&files.analysis_step_version.analysis_step.pipelines.title=Transcription+factor+ChIP-seq'
    read_depth_query = '&audit.NOT_COMPLIANT.category=insufficient+read+depth'
    read_depth_query_3 = '&audit.WARNING.category=low+read+depth'
    complexity_query = '&audit.NOT_COMPLIANT.category=insufficient+library+complexity'
    read_length_query = '&files.read_length=271272&files.read_length=657265&files.read_length=25&files.read_length=31&files.read_length=30'
    no_concerns_query = '&internal_status%21=requires+lab+review&internal_status%21=unrunnable&internal_status%21=pipeline+error'

    human_query = '&replicates.library.biosample.donor.organism.scientific_name=H**o+sapiens'
    mouse_query = '&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus'
    unknown_org_query = '&replicates.library.biosample.donor.organism.scientific_name%21=H**o+sapiens&replicates.library.biosample.donor.organism.scientific_name%21=Mus+musculus'
    ENCODE2_query = '&award.rfa=ENCODE2&award.rfa=ENCODE2-Mouse'
    ENCODE3_query = '&award.rfa=ENCODE3'
    ROADMAP_query = '&award.rfa=Roadmap'
    total_query = '&status=released&status=submitted&status=started&status=ready+for+review'
    released_query = '&status=released'
    proposed_query = '&status=proposed'
    unreleased_query = '&status=submitted&status=ready+for+review&status=started'
    concerns_query = '&internal_status=requires+lab+review&internal_status=unrunnable&internal_status=pipeline+error&status!=deleted&status!=revoked'
    antibody_query = '&audit.NOT_COMPLIANT.category=not+characterized+antibody'
    orange_audits_query = '&audit.NOT_COMPLIANT.category=missing+controlled_by&audit.NOT_COMPLIANT.category=insufficient+read+depth&audit.NOT_COMPLIANT.category=missing+documents&audit.NOT_COMPLIANT.category=control+insufficient+read+depth&audit.NOT_COMPLIANT.category=unreplicated+experiment&audit.NOT_COMPLIANT.category=poor+library+complexity&audit.NOT_COMPLIANT.category=severe+bottlenecking&audit.NOT_COMPLIANT.category=insufficient+replicate+concordance&audit.NOT_COMPLIANT.category=missing+possible_controls&audit.NOT_COMPLIANT.category=missing+input+control'
    red_audits_query = '&audit.ERROR.category=control+extremely+low+read+depth&audit.ERROR.category=missing+raw+data+in+replicate&audit.ERROR.category=missing+donor&audit.ERROR.category=inconsistent+library+biosample&audit.ERROR.category=inconsistent+replicate&audit.ERROR.category=replicate+with+no+library&audit.ERROR.category=technical+replicates+with+not+identical+biosample&&audit.ERROR.category=missing+paired_with&audit.ERROR.category=missing+possible_controls&audit.ERROR.category=inconsistent+control&audit.ERROR.category=missing+antibody'
    orange_audits_query2 = '&audit.NOT_COMPLIANT.category=insufficient+read+length&audit.NOT_COMPLIANT.category=control+low+read+depth&audit.NOT_COMPLIANT.category=insufficient+read+depth&audit.NOT_COMPLIANT.category=missing+documents&audit.NOT_COMPLIANT.category=unreplicated+experiment&audit.NOT_COMPLIANT.category=missing+possible_controls&audit.NOT_COMPLIANT.category=missing+spikeins&audit.NOT_COMPLIANT.category=missing+RNA+fragment+size'
    peaks_query = '&files.file_type=bigBed+narrowPeak'
    missing_signal_query = '&files.file_type!=bigWig&target.investigated_as!=control'
    grch38_query = '&files.assembly=GRCh38'
    v19_query = '&files.genome_annotation=V19'
    not_v19_query = '&files.genome_annotation!=V19'
    hg19_query = '&files.assembly=hg19'
    mm10_query = '&files.assembly=mm10'
    hg19_vis_query = '&assembly=hg19'
    grch38_vis_query = '&assembly=GRCh38'
    not_grch38_vis_query = '&assembly!=GRCh38'
    mm10_vis_query = '&assembly=mm10'
    not_mm10_vis_query = '&assembly!=mm10'
    not_grch38_query = '&files.assembly!=GRCh38'
    not_hg19_query = '&files.assembly!=hg19'
    not_mm10_query = '&files.assembly!=mm10'
    uniform_query = '&files.lab.name=encode-processing-pipeline'
    requires_query = '&internal_status=requires+lab+review'
    submitted_query = '&files.lab.name!=encode-processing-pipeline'
    audits_query = '&audit.NOT_COMPLIANT.category=missing+controlled_by&audit.NOT_COMPLIANT.category=insufficient+read+depth&audit.NOT_COMPLIANT.category=missing+documents&audit.NOT_COMPLIANT.category=unreplicated+experiment&assay_slims=Transcription&audit.NOT_COMPLIANT.category=missing+possible_controls&audit.NOT_COMPLIANT.category=missing+spikeins&audit.NOT_COMPLIANT.category=missing+RNA+fragment+size'
    processing_query = '&internal_status=pipeline+ready&internal_status=processing'
    mismatched_file_query = '&audit.INTERNAL_ACTION.category=mismatched+file+status'
    dnase_pipeline = "&files.analysis_step_version.analysis_step.pipelines.title=DNase-HS+pipeline+%28paired-end%29&files.analysis_step_version.analysis_step.pipelines.title=DNase-HS+pipeline+%28single-end%29&files.file_type=bigBed+broadPeak"
    # BUG FIX: default to an empty filter so an unrecognized --grant value
    # no longer yields None and breaks the string concatenations below.
    lab_query = labs.get(args.grant, '')

    # Status filter applied to most rows, chosen by --status.
    filters = {
        'released': released_query,
        'unreleased': total_query
    }

    row_queries = {
        'Total': total_query,
        'Released': released_query,
        'Released with issues': released_query+audits_query,
        'Released with antibody issues': released_query + antibody_query,
        'Released with NOT COMPLIANT issues': released_query + orange_audits_query,
        'Released with NOT COMPLIANT': released_query + orange_audits_query2,
        'Released with ERROR issues': released_query + red_audits_query,
        'With ERROR issues':  red_audits_query + filters[args.status],
        'With NOT COMPLIANT issues': orange_audits_query + filters[args.status],
        'Unreleased': unreleased_query,
        'Proposed': proposed_query,
        'Processed on GRCh38': grch38_query + filters[args.status],
        'Processed on Dnase Grch38 or mm10': dnase_pipeline + grch38_vis_query + mm10_vis_query + filters[args.status],
        'Mapped on GRCh38 or mm10': grch38_query + mm10_query + filters[args.status],
        'Unmapped on GRCh38 or mm10': not_grch38_query + not_mm10_query + filters[args.status],
        'Submitted on GRCh38': grch38_query + filters[args.status],
        'Submitted on GRCh38 or mm10': grch38_query + mm10_query + filters[args.status],
        'Uniformly Processed on hg19-v19': v19_query + filters[args.status],
        'Mapped on hg19': hg19_query + uniform_query + filters[args.status],
        'Unmapped on hg19': not_hg19_query + filters[args.status],
        'Processed on Dnase hg19': dnase_pipeline + hg19_vis_query + filters[args.status],
        'Peaks called on hg19': hg19_vis_query + uniform_query + filters[args.status],
        'Peaks called on GRCh38 or mm10': grch38_vis_query + mm10_vis_query + filters[args.status],
        'Submitted on hg19': hg19_query + filters[args.status],
        'Processed on mm10': mm10_query + filters[args.status],
        'Submitted on mm10': mm10_query + filters[args.status],
        'Cannot be currently processed': concerns_query + filters[args.status],
        'In processing queue': processing_query + filters[args.status],
        'Unreleased files in a released experiment': mismatched_file_query,
        'Missing GRCh38 or mm10 peaks': not_grch38_vis_query + not_mm10_vis_query + replicated_query + filters[args.status],
        'Missing hg19': not_hg19_query + not_mm10_query + filters[args.status],
        'Missing hg19-v19': not_v19_query + not_mm10_query + filters[args.status],
        'Missing signal files': total_query + missing_signal_query,
        'missing fastqs': '&files.file_format!=fastq' + total_query,
        'Unreplicated': unreplicated_query + filters[args.status]
    }

    # Columns: award RFA crossed with organism; OrderedDict preserves the
    # presentation order in the printed report.
    columns = collections.OrderedDict([
        ('ENCODE3-human', ENCODE3_query + human_query),
        ('ENCODE3-mouse', ENCODE3_query + mouse_query),
        ('ENCODE2-human', ENCODE2_query + human_query),
        ('ENCODE2-mouse', ENCODE2_query + mouse_query),
        # ('Organism Unknown', ENCODE3_query + unknown_org_query),
        ('ROADMAP', ROADMAP_query),
        ('Total', '&award.rfa=ENCODE3' + ROADMAP_query + ENCODE2_query)
        ])

    # When a grant is requested, restrict every column to that grant's labs.
    if args.grant:
        columns = collections.OrderedDict([
            ('ENCODE3-human', ENCODE3_query + human_query + lab_query),
            ('ENCODE3-mouse', ENCODE3_query + mouse_query + lab_query),
            ('ENCODE2-human', ENCODE2_query + human_query + lab_query),
            ('ENCODE2-mouse', ENCODE2_query + mouse_query + lab_query),
            # ('Organism Unknown', ENCODE3_query + unknown_org_query),
            ('ROADMAP', ROADMAP_query + lab_query),
            ('Total', '&award.rfa=ENCODE3' + ROADMAP_query + ENCODE2_query + lab_query)
        ])

    # Dispatch to the datatype-specific report builder.
    if args.datatype == 'CHIP':
        make_chip_report(connection, columns, row_queries)
    elif args.datatype == 'RNA':
        make_rna_report(connection, columns, row_queries)
    elif args.datatype == 'METHYL':
        make_methyl_report(connection, columns, row_queries)
    elif args.datatype == '3D':
        make_3d_report(connection, columns, row_queries)
    elif args.datatype == 'Accessibility':
        make_dna_report(connection, columns, row_queries)
    elif args.datatype == 'RBP':
        make_rbp_report(connection, row_queries)
    else:
        print('unimplemented')
コード例 #19
0
def main():
    """Validate file rows from an input CSV, POST them to the ENCODE server,
    upload the payloads to S3, and record each result in an output CSV.

    Rows that fail validation, JSON conversion, or the POST are logged and
    skipped; a 409 conflict optionally re-uses the conflicting object's
    upload credentials.
    """
    args = get_args()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    if not test_encode_keys(connection):
        logger.error("Invalid ENCODE server or keys: server=%s auth=%s" %
                     (connection.server, connection.auth))
        sys.exit(1)

    input_csv, output_csv = init_csvs(args.infile, args.outfile)
    # Write the header exactly once, not once per data row as before.
    output_csv.writeheader()

    for n, row in enumerate(input_csv, start=2):  # row 1 is the header
        # if there is no "file_format_spec" then no point in running get_asfile()
        if row.get("file_format_specifications"):
            as_file = get_asfile(row['file_format_specifications'], connection)
            # validateFiles needs a closed file for -as, otherwise it gives a
            # return code of -11
            as_file.close()
            validated = validate_file(
                row, args.encvaldata, row.get('assembly'), as_file.name)
            os.unlink(as_file.name)
        else:
            validated = validate_file(
                row, args.encvaldata, row.get('assembly'))

        if not validated:
            logger.warning('Skipping row %d: file %s failed validation' % (
                n, row['submitted_file_name']))
            continue

        json_payload = process_row(row, connection)
        if not json_payload:
            logger.warning(
                'Skipping row %d: invalid field format for JSON' % (n))
            continue

        file_object = encodedcc.post_file(
            json_payload, connection, args.update)
        if isinstance(file_object, requests.models.Response):
            if file_object.status_code == 409:
                print("POST Conflict", file_object.json())
                i = input("Upload file to S3? y/n: ")
                if i.lower() != "y":
                    logger.warning(
                        'Skipping row %d: POST file object failed' % (n))
                    continue
                detail = file_object.json()["detail"]
                # BUGFIX: str.lstrip strips any leading characters from the
                # given SET, so it could eat the opening of the payload too.
                # Strip the literal message prefix instead, then parse the
                # remaining list literal of (key, value) conflict tuples.
                prefix = "Keys conflict: "
                if detail.startswith(prefix):
                    detail = detail[len(prefix):]
                conflict = ast.literal_eval(detail)
                # first tuple in the list; its second item is the identifier
                obj = conflict[0][1]
                print(
                    "Getting upload credentials from conflicting identifier {}".format(obj))
                if ":" in obj:
                    obj = quote(obj)
                temp_object = encodedcc.get_ENCODE(obj, connection)
                file_object = encodedcc.ENC_Item(
                    connection, temp_object["@id"])
                print("Uploading file to S3")
                aws_return_code = uploader(file_object, args.update)
            else:
                # BUGFIX: any other HTTP error previously fell through with
                # aws_return_code unbound and a Response in file_object
                # (which has no .get), crashing on output; skip instead.
                logger.warning(
                    'Skipping row %d: POST file object failed' % (n))
                continue
        else:
            if not file_object:
                logger.warning(
                    'Skipping row %d: POST file object failed' % (n))
                continue
            aws_return_code = uploader(file_object, args.update)

        output_row = {}
        for fieldname in output_csv.fieldnames:
            output_row.update({fieldname: file_object.get(fieldname)})
        output_row.update({'aws_return': aws_return_code})
        output_csv.writerow(output_row)
コード例 #20
0
def test_connection():
    """Smoke test: the default keypair yields a usable ENC_Connection."""
    conn = encodedcc.ENC_Connection(encodedcc.ENC_Key(keypairs, "default"))
    # The connection itself plus both of its credentials must be truthy.
    for attribute in (conn, conn.auth, conn.server):
        assert attribute
コード例 #21
0
def main():
    """Backfill matched control files for a set of experiments.

    Experiment identifiers come from --infile (a file of IDs or a comma
    separated list) or from --query.  Each experiment and each of its
    possible_controls must have replicates and files; valid experiments are
    handed to a BackFill strategy, either the one named by --method
    (single/multi/biosample) or one inferred from the replicate and control
    counts.  Matched pairs are printed as a tab-separated table.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.update:
        print("This is an UPDATE run data will be PATCHed")
    else:
        print("This is a dryrun, no data will be changed")
    if args.infile:
        if os.path.isfile(args.infile):
            accessions = [line.rstrip('\n') for line in open(args.infile)]
        else:
            accessions = args.infile.split(",")
    elif args.query:
        if "search" in args.query:
            temp = encodedcc.get_ENCODE(args.query,
                                        connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        if any(temp):
            for obj in temp:
                # prefer the most stable identifier available
                if obj.get("accession"):
                    accessions.append(obj["accession"])
                elif obj.get("uuid"):
                    accessions.append(obj["uuid"])
                elif obj.get("@id"):
                    accessions.append(obj["@id"])
                elif obj.get("aliases"):
                    accessions.append(obj["aliases"][0])
    if len(accessions) == 0:
        # if something happens and we end up with no accessions stop
        print("ERROR: object has no identifier", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        obj = encodedcc.get_ENCODE(acc, connection, frame="embedded")
        isValid = True
        check = ["replicates", "files"]
        for c in check:
            if not obj.get(c):
                if args.debug:
                    print("Missing {} for {}".format(c, acc),
                          file=sys.stderr)
                isValid = False
        if obj.get("possible_controls"):
            for p in obj["possible_controls"]:
                for c in check:
                    # BUGFIX: inspect the control object p, not the
                    # experiment obj, for missing replicates/files.
                    if not p.get(c):
                        if args.debug:
                            print("Missing {} for {}".format(
                                c, p["accession"]),
                                  file=sys.stderr)
                        isValid = False
        else:
            isValid = False
            if args.debug:
                print("Missing possible_controls for {}".format(acc),
                      file=sys.stderr)
        if not isValid:
            continue
        backfill = BackFill(connection,
                            debug=args.debug,
                            missing=args.missing,
                            update=args.update,
                            ignore_runtype=args.ignore_runtype)
        if args.method == "single":
            if args.debug:
                print("SINGLE REP {}".format(acc))
            backfill.single_rep(obj)
        elif args.method == "multi":
            if args.debug:
                print("MULTI REP {}".format(acc))
            backfill.multi_rep(obj)
        elif args.method == "biosample":
            if args.debug:
                print("BIOSAMPLE {}".format(acc))
            backfill.multi_control(obj)
        else:
            # No explicit method: infer one from the structure.
            exp_rep = len(obj["replicates"])
            exp_con = len(obj["possible_controls"])
            if exp_con == 1:
                # one possible control
                con_rep = len(
                    obj["possible_controls"][0]["replicates"])
                if con_rep == exp_rep:
                    # same number experiment replicates as control
                    # replicates: method is multi
                    if args.debug:
                        print("MULTI REP {}".format(acc))
                    backfill.multi_rep(obj)
                elif con_rep == 1:
                    # one control replicate and multiple experiment
                    # replicates: method is single
                    if args.debug:
                        print("SINGLE REP {}".format(acc))
                    backfill.single_rep(obj)
                else:
                    if args.debug:
                        print(
                            "Experiment {} contains {} experiment replicates and {} control replicates and so does not fit the current pattern!"
                            .format(acc, exp_rep, con_rep))
            elif exp_con > 1:
                # more than one possible control: count the controls that
                # carry exactly one replicate each
                con_reps = sum(
                    1 for con in obj["possible_controls"]
                    if len(con["replicates"]) == 1)
                if con_reps == exp_rep:
                    # one single-replicate control per experiment
                    # replicate: method is biosample
                    if args.debug:
                        print("BIOSAMPLE {}".format(acc))
                    backfill.multi_control(obj)
                else:
                    if args.debug:
                        # BUGFIX: the original formatted "con_rep", which
                        # is unbound in this branch (NameError); the count
                        # computed here is con_reps.
                        print(
                            "Experiment {} contains {} experiment replicates and {} control replicates between {} total controls and so does not fit the current pattern!"
                            .format(acc, exp_rep, con_reps, exp_con))
            else:
                if args.debug:
                    print(
                        "Experiment {} does not fit any of the current patterns!"
                        .format(acc))

        if len(backfill.dataList) > 0:
            print("Experiment\tMethod\tExperimentFile\tControlFile")
            for data in backfill.dataList:
                print(
                    "{ExpAcc}\t{Method}\t{ExpFile}\t{ConFile}".format(
                        ExpAcc=data["ExpAcc"],
                        Method=data["Method"],
                        ExpFile=data["ExpFile"],
                        ConFile=data["ConFile"]))
コード例 #22
0
def main():
    """Synchronize ENCODE publication objects with PubMed via Entrez.

    Two modes, chosen by --updateonly:
      * default: build consortium/community PMID lists from files, fetch
        Entrez records, reconcile against ENCODE, and report PATCH/POST
        counts;
      * update-only: read a file of publication UUIDs, resolve a PMID for
        each (from its identifiers or by an Entrez title search), then
        compare each against its Entrez record.
    """
    args = getArgs()
    outfile = args.outfile
    CREATE_ONLY = args.createonly
    UPDATE_ONLY = args.updateonly
    # NCBI requires a contact e-mail on every Entrez request
    Entrez.email = args.email
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)

    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    print("Running on ", connection.server)

    publication = PublicationUpdate(args)

    if not UPDATE_ONLY:
        # Full run: gather PMIDs and their metadata from the input files.
        publication.setup_publication()
        pmidList = publication.consortium_ids + publication.community_ids
        # copy() so the consortium dict is not mutated by the merge
        mergeDicts = publication.consortium_dict.copy()
        mergeDicts.update(publication.community_dict)  # holds published_by, categories, and data_used
        if not CREATE_ONLY:
            publication.get_entrez(pmidList)

        community_ENCODE_Only, communityOtherID, consortium_ENCODE_Only, consortiumOtherID = publication.find_ENCODE_extras(publication.community_ids, publication.consortium_ids, connection)
        total_ENCODE_only = len(community_ENCODE_Only) + len(consortium_ENCODE_Only)
        allOtherIDs = communityOtherID + consortiumOtherID
        publication.check_ENCODE(pmidList, connection, allOtherIDs, mergeDicts)
        log = str(total_ENCODE_only) + " items in ENCODE but not in files"
        logger.info('%s' % log)
        log = str(publication.PATCH_COUNT) + " publication files PATCHed"
        logger.info('%s' % log)
        log = str(publication.POST_COUNT) + " publication files POSTed"
        logger.info('%s' % log)
        print("Results printed to", outfile)
    else:
        # UPDATE_ONLY holds the path of a file listing publication UUIDs
        infile = UPDATE_ONLY
        with open(infile, 'r') as readfile:
            uuidList = [x.rstrip('\n') for x in readfile]
        # check each publication to see if it has a PMID, if it does add it to the PMIDlist
        # if it does not have one look it up on Entrez
        pmid_uuid_dict = {}  # maps PMID -> publication UUID
        for uuid in uuidList:
            pub = encodedcc.get_ENCODE(uuid, connection)
            title = pub.get("title", "")
            identifiers = pub.get("identifiers", [])
            found = False
            for i in identifiers:
                if "PMID:" in i:
                    # identifier looks like "PMID:12345678"; keep the number
                    p = i.split(":")[1]
                    found = True
            if found:
                pmid_uuid_dict[p] = uuid
            else:
                # search Entrez for publication by title
                handle = Entrez.esearch(db="pubmed", term=title)
                record = Entrez.read(handle)
                idlist = record["IdList"]
                if len(idlist) > 1:
                    # ambiguous match: report candidates, do not guess
                    log = "More than one possible PMID found for " + uuid
                    logger.error('%s' % log)
                    log = str(idlist) + " are possible PMIDs"
                    logger.error('%s' % log)
                elif len(idlist) == 0:
                    log = "No possible PMID found for " + uuid
                    logger.error('%s' % log)
                else:
                    # exactly one hit: fetch the Medline record, PATCH the
                    # PMID onto the publication's identifiers
                    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
                    records = Medline.parse(handle)
                    # save the records, you can convert them to a list
                    records = list(records)
                    for record in records:
                        pm = record.get("PMID")
                        ti = record.get("TI")
                        log = "Publication " + uuid + " with title \"" + title + "\" matches PMID:" + pm + " with title \"" + ti + "\""
                        logger.info('%s' % log)
                        identifiers.append("PMID:" + pm)
                        encodedcc.patch_ENCODE(uuid, connection, {"identifiers": identifiers})
                        pmid_uuid_dict[pm] = uuid
        pmidList = list(pmid_uuid_dict.keys())
        publication.get_entrez(pmidList)
        # NOTE(review): only the summary line is written to pub_update.txt;
        # compare_entrez_ENCODE presumably does its own reporting — confirm.
        with open("pub_update.txt", "w") as f:
            for pmid in pmid_uuid_dict.keys():
                publication.compare_entrez_ENCODE(pmid_uuid_dict[pmid], pmid, connection)
            f.write(str(len(pmid_uuid_dict.keys())) + " publications checked " + str(publication.PATCH_COUNT) + " publications PATCHed")
コード例 #23
0
def main():
    """Summarize mapping and peak status for hg19 and GRCh38 across a set
    of experiments, then print lists of experiments with problems.

    For each experiment the script records replicate counts, audit counts,
    status mismatches, unarchived external files, and — per assembly —
    whether alignments and peaks exist for each replicate.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    assemblies = ['hg19', 'GRCh38']
    summary = []

    # A single ENCSR accession may be given directly; otherwise build the
    # list from the infile/query.
    if args.infile is not None and 'ENCSR' in args.infile:
        objList = [args.infile]
    else:
        objList = get_experiment_list(args.infile, args.query, connection)

    for obj_id in objList:
        results = {}

        obj = get_ENCODE(obj_id, connection, frame='page')

        # Get basic info
        reps = get_replicate_count(obj)
        results['rep_count'] = len(reps)
        results['status'] = obj['status']
        results['internal_status'] = obj['internal_status']
        results['award'] = obj['award'].get('rfa')
        results['peaks'] = {}
        results['mapping'] = {}
        results['unarchived_files'] = []
        results['status issues'] = []
        results['accession'] = obj['accession']

        # Count audits at each severity level
        for level in ['WARNING', 'ERROR', 'NOT_COMPLIANT', 'INTERNAL_ACTION']:
            if obj['audit'].get(level):
                results[level] = len(obj['audit'].get(level))

        # Get status issues
        actions = obj['audit'].get('INTERNAL_ACTION')
        if actions:
            status_issues = [
                i for i in actions if i['category'] in [
                    'experiment not submitted to GEO',
                    'mismatched file status', 'mismatched status'
                ]
            ]
            results['status issues'] = status_issues

        # Inspect files.  (The original also built an identical, unused
        # "fastqs" list — removed.)
        good_files = [
            f for f in obj['files']
            if f['status'] in ['released', 'in progress']
        ]
        print("There are files in this experiment:", len(obj['files']))
        print("There are good files in this experiment:", len(good_files))
        # look for unarchived processed files from other labs
        processed_files = [
            f for f in obj['files'] if f['file_format'] != 'fastq'
        ]
        external_files = [
            f for f in processed_files
            if (f['lab']['name'] != 'encode-processing-pipeline')
        ]
        unarchived_files = [
            f for f in external_files if (f['status'] != 'archived')
        ]
        results['unarchived_files'] = unarchived_files

        for assembly in assemblies:
            replicates = []
            file_list = [
                f for f in good_files if f.get('assembly') == assembly
            ]
            for rep in reps:
                rep_obj = {'rep': rep}
                file_list_rep = [
                    f for f in file_list
                    if rep in f.get('biological_replicates')
                ]
                aligns = [
                    f for f in file_list_rep
                    if f.get('output_type') == 'alignments'
                ]
                rep_obj['aligns'] = len(aligns)
                raw_aligns = [
                    f for f in file_list_rep
                    if f.get('output_type') == 'unfiltered alignments'
                ]
                rep_obj['raws'] = len(raw_aligns)
                replicates.append(rep_obj)
            failing_replicates = [f for f in replicates if f['aligns'] == 0]
            # mapping[assembly] is True (all replicates have alignments),
            # False (none do), or the list of failing replicate numbers.
            # BUGFIX: "is 0" compared identity, not value; use == .
            if len(failing_replicates) == 0:
                results['mapping'][assembly] = True
            elif len(replicates) == len(failing_replicates):  # They all fail
                results['mapping'][assembly] = False
            else:
                results['mapping'][assembly] = [
                    rep['rep'] for rep in failing_replicates
                ]

            peaks = [f for f in file_list if f.get('output_type') == 'peaks']
            results['peaks'][assembly] = len(peaks) > 0

        summary.append(results)

    unarchived_list = [r for r in summary if len(r['unarchived_files']) > 0]
    print('These experiments have unarchived files', len(unarchived_list))
    for item in unarchived_list:
        print(item['accession'])
    print('')
    print('')

    exps_mismatched_states = [
        r for r in summary if len(r['status issues']) > 0
    ]
    print('These experiments have mismatched states',
          len(exps_mismatched_states))
    for item in exps_mismatched_states:
        print(item['accession'])
    print('')
    print('')

    exps_missing_hg38_mapping = [
        r for r in summary if r['mapping']['GRCh38'] is False
    ]
    print('These experiments are missing GRCh38 mapping for all replicates',
          len(exps_missing_hg38_mapping))
    for item in exps_missing_hg38_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')

    exps_partial_hg38_mapping = [
        r for r in summary if r['mapping']['GRCh38'] is not False
        and r['mapping']['GRCh38'] is not True
    ]
    print('These experiments are missing GRCh38 mapping for some replicates',
          len(exps_partial_hg38_mapping))
    for item in exps_partial_hg38_mapping:
        print(item['accession'], item['status'], item['internal_status'],
              item['mapping']['GRCh38'])
    print('')
    print('')

    exps_missing_hg38_peaks = [
        r for r in summary if r['peaks']['GRCh38'] is False
    ]
    exps_missing_hg38_peaks_but_have_mapping = [
        f for f in exps_missing_hg38_peaks
        if f['peaks']['GRCh38'] is False and f not in exps_missing_hg38_mapping
        and f not in exps_partial_hg38_mapping
    ]
    print('These experiments are missing GRCh38 peaks but having all mappings',
          len(exps_missing_hg38_peaks_but_have_mapping))
    # BUGFIX: list the filtered set the heading and count above refer to
    # (the original iterated the unfiltered exps_missing_hg38_peaks).
    for item in exps_missing_hg38_peaks_but_have_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')

    exps_missing_hg19_mapping = [
        r for r in summary if r['mapping']['hg19'] is False
    ]
    print('These experiments are missing hg19 mapping for all replicates',
          len(exps_missing_hg19_mapping))
    for item in exps_missing_hg19_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')

    exps_partial_hg19_mapping = [
        r for r in summary if r['mapping']['hg19'] is not False
        and r['mapping']['hg19'] is not True
    ]
    print('These experiments are missing hg19 mapping for some replicates',
          len(exps_partial_hg19_mapping))
    for item in exps_partial_hg19_mapping:
        print(item['accession'], item['status'], item['internal_status'],
              item['mapping']['hg19'])
    print('')
    print('')

    exps_missing_hg19_peaks = [
        r for r in summary
        if r['peaks']['hg19'] is False and r not in exps_missing_hg19_mapping
        and r not in exps_partial_hg19_mapping
    ]
    print('These experiments are missing hg19 peaks',
          len(exps_missing_hg19_peaks))
    for item in exps_missing_hg19_peaks:
        print(item['accession'], item['status'], item['internal_status'],
              'warnings:', item.get('WARNING'))
    print('')
    print('')
コード例 #24
0
def main():
    """Apply lane-status reviews from a TSV to antibody characterizations.

    The TSV carries one row per (characterization @id, lanes, lane_status,
    documents, notes).  Rows are grouped by @id; for each characterization
    with a primary characterization method the lane statuses and documents
    are reconciled with ENCODE, an overall compliant/not compliant status
    is derived, and the object is PATCHed when --update is given.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on", connection.server)
    if args.update:
        assert args.user, "A user must be provided to run this script!"
        user = encodedcc.get_ENCODE(args.user, connection).get("@id")
        assert user, "{} was not found in the ENCODE database as a registered user. Please try again".format(
            args.user)

    data = []
    idList = []
    with open(args.infile, "r") as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            data.append(row)
    for item in data:
        # "lanes" is a comma separated list; dedupe it
        lanes = item.get("lanes", "")
        lanes = list(set(lanes.split(",")))
        item["lanes"] = lanes
        if not any(item["notes"]):
            item.pop("notes")
        if item.get("@id") not in idList:
            idList.append(item["@id"])
    # Group the file rows by characterization @id.
    # BUGFIX: the original used objDict.get(..., "").append(item), whose
    # fallback "" has no append; setdefault is safe for unseen keys.
    objDict = {key: [] for key in idList}
    for item in data:
        objDict.setdefault(item.get("@id", ""), []).append(item)

    for idNum in objDict.keys():
        antibody = encodedcc.get_ENCODE(idNum, connection, frame="edit")
        new_antibody = {}
        if antibody.get("primary_characterization_method"):
            reviews = antibody.get("characterization_reviews", [])
            enc_docs = antibody.get("documents", [])
            file_docs = []
            for obj in objDict[idNum]:
                if obj.get("documents"):
                    for doc in obj["documents"].split(","):
                        file_docs.append(doc)
                if obj.get("notes"):
                    new_antibody["notes"] = obj["notes"]
            for doc in file_docs:
                # aliases with a colon must be URL-quoted for lookup
                if ":" in doc:
                    doc = quote(doc)
                link = encodedcc.get_ENCODE(doc, connection).get("@id")
                if link:
                    if link not in enc_docs:
                        enc_docs.append(link)

            #######################
            # begin lanes checking
            #######################
            enc_lanes_check = []
            file_lanes_check = []
            flag = False
            for r in reviews:
                enc_lanes_check.append(r["lane"])
            for item in objDict[idNum]:
                for l in item["lanes"]:
                    file_lanes_check.append(int(l))
            if len(set(enc_lanes_check)) < len(enc_lanes_check):
                # duplicate lanes in ENCODE
                print("Possible duplicate lanes in ENCODE")
                flag = True
            if len(set(file_lanes_check)) < len(file_lanes_check):
                # duplicate lanes in file
                print("Possible duplicate lanes in file")
                flag = True
            if len(set(enc_lanes_check) - set(file_lanes_check)) > 0:
                # more lanes in ENCODE than in file
                print("Found lanes in ENCODE not in the file")
                flag = True
            if len(set(file_lanes_check) - set(enc_lanes_check)) > 0:
                # more lanes in file than in ENCODE
                print("Found lanes in the file not in ENCODE")
                flag = True
            if flag:
                print(
                    "Some problem was found with the number of lanes in the file as compared to ENCODE"
                )
                print(
                    "Do you want to continue running the program or exit and check the data?"
                )
                i = input("Continue? y/n ")
                # BUGFIX: was a bare assert (stripped under -O); exit the
                # script cleanly if the user declines.
                if i.upper() != "Y":
                    sys.exit(1)
            for r in reviews:
                for line in objDict[idNum]:
                    for lane in line["lanes"]:
                        if int(lane) == r["lane"]:
                            if line["lane_status"].lower(
                            ) == "pending dcc review":
                                print(
                                    "can't set to pending review, need manual override"
                                )
                                fin = input(
                                    "Change the status to 'pending dcc review'? y/n "
                                )
                                if fin.upper() == "Y":
                                    r["lane_status"] = line[
                                        "lane_status"].lower()
                                    # BUGFIX: list.pop takes an index, not a
                                    # value, and the original also mutated
                                    # enc_docs while iterating it; collect
                                    # the standards documents first, then
                                    # remove them.
                                    standards = [
                                        link for link in enc_docs
                                        if encodedcc.get_ENCODE(
                                            link, connection
                                        ).get("document_type",
                                              "") == "standards document"
                                    ]
                                    for link in standards:
                                        enc_docs.remove(link)
                                else:
                                    pass
                            else:
                                r["lane_status"] = line["lane_status"].lower()
            # now all lanes in reviews should be updated to document
            enc_comp = 0
            enc_ncomp = 0
            other = 0

            for r in reviews:
                if r.get("lane_status", "") == "compliant":
                    enc_comp = enc_comp + 1
                elif r.get("lane_status", "") == "not compliant":
                    enc_ncomp = enc_ncomp + 1
                else:
                    other = other + 1
            # Derive the overall status: any unexpected lane status or any
            # non-compliant lane (with no compliant ones) -> not compliant.
            if other > 0:
                print(
                    "not all lanes have allowed status, antibody characterization status set to not compliant"
                )
                new_antibody["status"] = "not compliant"
            elif enc_comp > 0:
                new_antibody["status"] = "compliant"
            elif other == 0 and enc_comp == 0 and enc_ncomp > 0:
                new_antibody["status"] = "not compliant"
            ######################
            # end lanes checking
            ######################

            if antibody.get("lab", "") == "/labs/michael-snyder/":
                # make sure special document is added if not in the file
                if "michael-snyder:biorad_protein_standard" not in file_docs:
                    file_docs.append("michael-snyder:biorad_protein_standard")
                if antibody[
                        "primary_characterization_method"] == "immunoprecipitation":
                    if len(reviews) == 1:
                        # fix lane number
                        reviews[0]["lane"] = 3
                new_antibody["characterization_reviews"] = reviews
            new_antibody["characterization_reviews"] = reviews
            new_antibody["documents"] = enc_docs
            if args.update:
                new_antibody["reviewed_by"] = user

        if args.update:
            # Skip empty payloads (no primary characterization method).
            if new_antibody:
                print("PATCHing antibody characterization", idNum)
                encodedcc.patch_ENCODE(idNum, connection, new_antibody)
        else:
            print("PATCH data:", new_antibody)
コード例 #25
0
def main():
    """Write a TSV report ("output.txt") of bam files from released
    Graveley-lab ENCODE experiments that have possible_controls, tracing
    each bam back through its replicate, library and biosample."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    query = "/search/?type=Experiment&lab.title=Brenton+Graveley%2C+UConn&award.project=ENCODE&status=released&files.file_type=bam"
    experiments = encodedcc.get_ENCODE(query, connection).get("@graph", [])
    headers = [
        "File Accession", "Download", "Annotation", "Cell Line", "Assembly",
        "Target", "Experiment Accession", "Experiment Aliases",
        "Control Experiment", "Biosample Accession", "Biosample Aliases",
        "Library Accession", "Library Aliases", "Lab", "Submitted Name"
    ]
    with open("output.txt", "w") as tsvfile:
        writer = csv.DictWriter(tsvfile, fieldnames=headers, delimiter="\t")
        writer.writeheader()
        for exp in experiments:
            # only experiments that declare controls are reported
            if not exp.get("possible_controls"):
                continue
            print("Experiment", exp.get("accession"))
            temp = dict.fromkeys(headers)
            temp["Experiment Accession"] = exp.get("accession")
            temp["Experiment Aliases"] = exp.get("aliases")
            temp["Cell Line"] = exp.get("biosample_term_name")
            temp["Target"] = exp.get("target")
            temp["Control Experiment"] = exp["possible_controls"]
            files = exp["files"] if exp.get("files") else exp["original_files"]
            for f in files:
                file = encodedcc.get_ENCODE(f, connection)
                # only bam files are reported
                if file.get("file_format", "") != "bam":
                    continue
                temp["Lab"] = file.get("lab")
                temp["Annotation"] = file.get("genome_annotation")
                temp["File Accession"] = file.get("accession")
                temp["Submitted Name"] = file.get("submitted_file_name")
                temp["Download"] = (connection.server + "/files/" +
                                    file["accession"] + "/@@download/" +
                                    file["accession"] + ".bam")
                temp["Assembly"] = file.get("assembly")
                print("File", file.get("accession"))
                # walk file -> replicate -> library -> biosample
                if file.get("replicate"):
                    rep = encodedcc.get_ENCODE(file["replicate"], connection)
                    if rep.get("library"):
                        lib = encodedcc.get_ENCODE(rep["library"], connection)
                        temp["Library Accession"] = lib.get("accession")
                        temp["Library Aliases"] = lib.get("aliases")
                        print("Library", lib.get("accession"))
                        if lib.get("biosample"):
                            bio = encodedcc.get_ENCODE(lib["biosample"],
                                                       connection)
                            temp["Biosample Accession"] = bio.get("accession")
                            temp["Biosample Aliases"] = bio.get("aliases")
                            print("Biosample", bio.get("accession"))
                writer.writerow(temp)
コード例 #26
0
def main():
    """Swap replaced BAM files for their re-uploaded versions.

    Accessions come from --object (a file of identifiers or a comma
    separated list) or from --query.  For each experiment the
    original_files are sorted into "old" files (first alias ends in
    "_replaced") and "new" files, and old/new pairs are matched on
    replicate, file_type, run_type and paired_end.  With --update the
    old file is patched to status "replaced" and the new file is
    released with the old accession recorded in alternate_accessions.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on {}".format(connection.server))

    accessions = []
    if args.object:
        if os.path.isfile(args.object):
            # one identifier per line
            accessions = [line.strip() for line in open(args.object)]
        else:
            accessions = args.object.split(",")
    elif args.query:
        if "search" in args.query:
            results = encodedcc.get_ENCODE(args.query,
                                           connection).get("@graph", [])
        else:
            results = [encodedcc.get_ENCODE(args.query, connection)]
        if any(results):
            for result in results:
                # take the first identifier the object carries
                for id_field in ("accession", "uuid", "@id"):
                    if result.get(id_field):
                        accessions.append(result[id_field])
                        break
                else:
                    print("ERROR: object has no identifier", file=sys.stderr)
    if len(accessions) == 0:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)

    for acc in accessions:
        file_ids = encodedcc.get_ENCODE(acc, connection).get("original_files", [])
        new_files = {}
        old_files = {}
        for file_id in file_ids:
            file_obj = encodedcc.get_ENCODE(file_id, connection)
            aliases = file_obj.get("aliases", [])
            if any(aliases):
                if aliases[0].endswith("_replaced"):
                    # superseded upload
                    dict_maker(file_obj, old_files)
                else:
                    # current upload
                    dict_maker(file_obj, new_files)
            else:
                print("file {} has no aliases".format(file_obj["@id"]))

        for new_acc, new_info in new_files.items():
            for old_acc, old_info in old_files.items():
                # a pair must agree on all four properties
                is_match = (new_info["replicate"] == old_info["replicate"]
                            and new_info["file_type"] == old_info["file_type"]
                            and new_info["run_type"] == old_info["run_type"]
                            and new_info["paired_end"] == old_info["paired_end"])
                if not is_match:
                    continue
                print(
                    "New file {} with date {} replacing old file {} with date {}"
                    .format(new_acc, new_info["date"], old_acc,
                            old_info["date"]))
                if args.update:
                    # retire the old file
                    encodedcc.patch_ENCODE(old_acc, connection,
                                           {"status": "replaced"})
                    # release the new file, pointing back at the old accession
                    encodedcc.patch_ENCODE(new_acc, connection, {
                        "status": "released",
                        "alternate_accessions": [old_acc]
                    })
コード例 #27
0
def main():
    """Write a TSV audit matrix of Experiments (biosample x assay).

    Queries the server's /matrix/ endpoint (optionally narrowed by
    --rfa, --species, --status and --lab), splits RNA-seq into Long and
    Short by replicates.library.size_range, and writes one spreadsheet
    row per biosample term.  Each cell is a HYPERLINK formula linking
    to the matching search, labelled with the experiment count and the
    audit tallies from audit_count().  Row and column TOTALS are
    appended.  Output goes to --outfile, intended for Google Sheets.
    """
    # Expected noise: the Long/Short RNA-seq sub-searches can come back
    # empty, producing "No Results Found" on the server side.
    print("This script outputs a 'No Results Found' error.")
    print("This is due to the Long/Short RNA-seq, it does not affect the final results")
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    # Build the query-string filters from the comma separated options.
    search_string = "/matrix/?type=Experiment"
    rfa_string = ""
    species_string = ""
    status_string = ""
    lab_string = ""
    if args.rfa:
        rfa_list = args.rfa.split(",")
        for r in rfa_list:
            rfa_string += "&award.rfa=" + r
    if args.species:
        species_list = args.species.split(",")
        for r in species_list:
            species_string += "&replicates.library.biosample.donor.organism.name=" + r
    if args.status:
        status_list = args.status.split(",")
        for r in status_list:
            status_string += "&status=" + r
    if args.lab:
        lab_list = args.lab.split(",")
        for r in lab_list:
            lab_string += "&lab.name=" + r
    full_string = rfa_string + species_string + status_string + lab_string
    search_string += full_string
    # matrix_url doubles as the first column's fieldname and the
    # top-left header cell (a spreadsheet link to the matrix itself).
    matrix_url = '=HYPERLINK("{}","{}")'.format(connection.server + search_string, connection.server + search_string)

    matrix = encodedcc.get_ENCODE(search_string, connection).get("matrix")
    x_values = matrix.get("x")
    y_values = matrix.get("y")

    # y axis: biosample type -> biosample term buckets; x axis: assays.
    y_buckets = y_values["replicates.library.biosample.biosample_type"].get("buckets")
    x_buckets = x_values.get("buckets")
    if args.all:
        # every assay present in the matrix
        full_list = []
        for x in x_buckets:
            full_list.append(x["key"])
    else:
        full_list = ["RNA-seq", "microRNA profiling by array assay", "microRNA-seq", "DNase-seq", "whole-genome shotgun bisulfite sequencing", "RAMPAGE", "CAGE"]
    # In the header row plain "RNA-seq" is replaced by the Long/Short split.
    temp_list = list(full_list)
    if "RNA-seq" in temp_list:
        temp_list.remove("RNA-seq")
    headers = [matrix_url] + ["Long RNA-seq", "Short RNA-seq"] + temp_list + ["TOTAL"]

    final_assay_search = ""  # this will be used to total rows
    for name in full_list:
        # NOTE(review): assay names are not URL-quoted here (biosample
        # names below are); names containing spaces rely on the HTTP
        # client tolerating raw spaces -- confirm.
        final_assay_search += "&assay_term_name=" + name
    final_bio_search = ""  # this will be used to total columns

    # col_dict accumulates one [total, error, not_compliant, warning,
    # dcc_action] list per cell, keyed by column, for the TOTALS row.
    col_dict = dict.fromkeys(headers)
    for k in col_dict.keys():
        col_dict[k] = []
    with open(args.outfile, "w") as tsvfile:
        dictwriter = csv.DictWriter(tsvfile, delimiter="\t", fieldnames=headers)
        dictwriter.writeheader()
        for y in y_buckets:
            inner_buckets = y["biosample_term_name"].get("buckets")
            # group header row: just the biosample type name
            group_dict = dict.fromkeys(headers)
            group_dict[matrix_url] = y["key"]
            dictwriter.writerow(group_dict)
            for item in inner_buckets:
                bio_name = item["key"]
                final_bio_search += "&biosample_term_name=" + quote(bio_name)
                # per-assay experiment counts, index-aligned with x_buckets
                assay_list = item["assay_term_name"]
                row_dict = dict.fromkeys(headers)
                for k in row_dict.keys():
                    row_dict[k] = 0
                row_dict[matrix_url] = bio_name
                row_count = []  # per-cell tallies feeding the row TOTAL
                for x in range(len(assay_list)):
                    assay_name = x_buckets[x]["key"]
                    if assay_name in full_list:
                        if assay_list[x] > 0:
                            search = "/search/?type=Experiment&biosample_term_name=" + quote(bio_name) + "&assay_term_name=" + assay_name + full_string
                            if assay_name == "RNA-seq":
                                # split RNA-seq: Short is size_range <200,
                                # Long is everything else
                                short_search = search + "&replicates.library.size_range=<200"
                                long_search = search + "&replicates.library.size_range!=<200"

                                short_url = connection.server + short_search
                                long_url = connection.server + long_search

                                short_facets = encodedcc.get_ENCODE(short_search, connection)
                                long_facets = encodedcc.get_ENCODE(long_search, connection)

                                if short_facets.get("total") == 0:
                                    row_dict["Short RNA-seq"] = 0
                                    row_count.append([0, 0, 0, 0, 0])
                                    col_dict["Short RNA-seq"].append([0, 0, 0, 0, 0])
                                else:
                                    # audit_count returns (total, error,
                                    # not_compliant, warning, dcc_action)
                                    total, error, not_compliant, warning, dcc_action = audit_count(short_facets.get("facets", []), short_facets.get("total"), args.allaudits)
                                    if args.allaudits:
                                        string = '=HYPERLINK("{}","{}, {}E, {}NC, {}W, {}DCC")'.format(short_url, total, error, not_compliant, warning, dcc_action)
                                    else:
                                        string = '=HYPERLINK("{}","{}, {}E, {}NC")'.format(short_url, total, error, not_compliant)
                                    row_dict["Short RNA-seq"] = string
                                    row_count.append([total, error, not_compliant, warning, dcc_action])
                                    col_dict["Short RNA-seq"].append([total, error, not_compliant, warning, dcc_action])

                                if long_facets.get("total") == 0:
                                    row_dict["Long RNA-seq"] = 0
                                    row_count.append([0, 0, 0, 0, 0])
                                    col_dict["Long RNA-seq"].append([0, 0, 0, 0, 0])
                                else:
                                    total, error, not_compliant, warning, dcc_action = audit_count(long_facets.get("facets", []), long_facets.get("total"), args.allaudits)
                                    if args.allaudits:
                                        string = '=HYPERLINK("{}","{}, {}E, {}NC, {}W, {}DCC")'.format(long_url, total, error, not_compliant, warning, dcc_action)
                                    else:
                                        string = '=HYPERLINK("{}","{}, {}E, {}NC")'.format(long_url, total, error, not_compliant)
                                    row_dict["Long RNA-seq"] = string
                                    row_count.append([total, error, not_compliant, warning, dcc_action])
                                    col_dict["Long RNA-seq"].append([total, error, not_compliant, warning, dcc_action])
                            else:
                                # any other selected assay: one search/cell
                                url = connection.server + search
                                facets = encodedcc.get_ENCODE(search, connection).get("facets", [])
                                total, error, not_compliant, warning, dcc_action = audit_count(facets, assay_list[x], args.allaudits)
                                if args.allaudits:
                                    string = '=HYPERLINK("{}","{}, {}E, {}NC, {}W, {}DCC")'.format(url, total, error, not_compliant, warning, dcc_action)
                                else:
                                    string = '=HYPERLINK("{}","{}, {}E, {}NC")'.format(url, total, error, not_compliant)
                                row_count.append([total, error, not_compliant, warning, dcc_action])
                                row_dict[assay_name] = string
                                col_dict[assay_name].append([total, error, not_compliant, warning, dcc_action])
                        else:
                            # zero experiments: record empty tallies so the
                            # column/row totals stay aligned
                            if assay_name == "RNA-seq":
                                row_dict["Short RNA-seq"] = 0
                                row_dict["Long RNA-seq"] = 0
                                col_dict["Short RNA-seq"].append([0, 0, 0, 0, 0])
                                col_dict["Long RNA-seq"].append([0, 0, 0, 0, 0])
                            else:
                                row_dict[assay_name] = 0
                                col_dict[assay_name].append([0, 0, 0, 0, 0])
                            row_count.append([0, 0, 0, 0, 0])

                # sum this row's tallies into its TOTAL cell
                row_total = 0
                row_error = 0
                row_not_compliant = 0
                row_warning = 0
                row_dcc_action = 0
                bio_total = "/search/?type=Experiment&biosample_term_name=" + quote(bio_name) + final_assay_search + full_string
                bio_url = connection.server + bio_total
                for col in row_count:
                        row_total += col[0]
                        row_error += col[1]
                        row_not_compliant += col[2]
                        row_warning += col[3]
                        row_dcc_action += col[4]
                if args.allaudits:
                    row_dict["TOTAL"] = '=HYPERLINK("{}", "{}, {}E, {}NC, {}W, {}DCC")'.format(bio_url, row_total, row_error, row_not_compliant, row_warning, row_dcc_action)
                else:
                    row_dict["TOTAL"] = '=HYPERLINK("{}", "{}, {}E, {}NC")'.format(bio_url, row_total, row_error, row_not_compliant)
                dictwriter.writerow(row_dict)
        # final TOTALS row: sum each column, then the grand total
        total = 0
        error = 0
        not_compliant = 0
        warning = 0
        dcc_action = 0
        total_dict = dict.fromkeys(headers)
        total_dict[matrix_url] = "TOTALS"
        for key in col_dict.keys():
            if key in headers and key != matrix_url:
                col_total = 0
                col_error = 0
                col_not_compliant = 0
                col_warning = 0
                col_dcc_action = 0
                for cell in col_dict[key]:
                    col_total += cell[0]
                    col_error += cell[1]
                    col_not_compliant += cell[2]
                    col_warning += cell[3]
                    col_dcc_action += cell[4]
                total += col_total
                error += col_error
                not_compliant += col_not_compliant
                warning += col_warning
                dcc_action += col_dcc_action
                # NOTE(review): `key` is interpolated unquoted here while
                # biosample names use quote(); confirm assay names with
                # spaces survive the round trip.
                if key == "Long RNA-seq":
                    assay_total = "/search/?type=Experiment&assay_term_name=RNA-seq&replicates.library.size_range!=<200" + final_bio_search + full_string
                elif key == "Short RNA-seq":
                    assay_total = "/search/?type=Experiment&assay_term_name=RNA-seq&replicates.library.size_range=<200" + final_bio_search + full_string
                else:
                    assay_total = "/search/?type=Experiment&assay_term_name=" + key + final_bio_search + full_string
                assay_url = connection.server + assay_total
                if args.allaudits:
                    total_dict[key] = '=HYPERLINK("{}", "{}, {}E, {}NC, {}W, {}DCC")'.format(assay_url, col_total, col_error, col_not_compliant, col_warning, col_dcc_action)
                else:
                    total_dict[key] = '=HYPERLINK("{}", "{}, {}E, {}NC")'.format(assay_url, col_total, col_error, col_not_compliant)
        full_search = "/search/?type=Experiment" + final_assay_search + final_bio_search + full_string
        full_url = connection.server + full_search
        if args.allaudits:
            total_dict["TOTAL"] = '=HYPERLINK("{}", "{}, {}E, {}NC, {}W, {}DCC")'.format(full_url, total, error, not_compliant, warning, dcc_action)
        else:
            total_dict["TOTAL"] = '=HYPERLINK("{}", "{}, {}E, {}NC")'.format(full_url, total, error, not_compliant)
        dictwriter.writerow(total_dict)

    print("Output saved to {}, open this file with Google Docs Sheets, don't use Excel because it sucks".format(args.outfile))
コード例 #28
0
def test_get():
    """Smoke test: a GET of /profiles/ returns a JSON object (dict).

    Uses isinstance rather than ``type(x) is dict`` so dict subclasses
    returned by the client are also accepted.
    """
    key = encodedcc.ENC_Key(keypairs, "default")
    connection = encodedcc.ENC_Connection(key)
    result = encodedcc.get_ENCODE("/profiles/", connection)
    assert isinstance(result, dict)
コード例 #29
0
ファイル: one_ENCODE.py プロジェクト: mmmika/pyencoded-tools
def main():
    """GET/PATCH/POST/PUT a single ENCODE object.

    Reads a JSON object from --infile (or stdin), or fetches one by
    --id; reports whether the @id/uuid/accession already exist on the
    server; resolves the object's collection from /profiles/; encodes a
    local attachment file as a base64 data URI; then PATCHes (default),
    PUTs (--force-put) or POSTs (new object) when --update is given.

    Fixes in this revision:
    * data URI built from ``b64encode`` bytes produced ``b'...'`` text
      in Python 3 -- now decoded to ASCII.
    * the attachment debug dump printed the file object to stdout and
      leaked the handle -- now written to the 'tmp' file and closed.
    * bare ``except:`` clauses narrowed.
    * ``identifier`` could be unbound (NameError) when the object had
      no identifier but --force-put was set -- initialized to None.
    """

    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        '--infile',
        '-i',
        help="File containing the JSON object as a JSON string.")
    parser.add_argument('--server', help="Full URL of the server.")
    parser.add_argument('--key',
                        default='default',
                        help="The keypair identifier from the keyfile.  \
                        Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file.  Default is --keyfile\
                        =%s" % (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--authid', help="The HTTP auth ID.")
    parser.add_argument('--authpw', help="The HTTP auth PW.")
    parser.add_argument(
        '--force-put',
        default=False,
        action='store_true',
        help="Force the object to be PUT rather than PATCHed.  \
                        Default is False.")
    parser.add_argument('--get-only',
                        default=False,
                        action='store_true',
                        help="Do nothing but get the object and print it.  \
                        Default is False.")
    parser.add_argument('--id', help="URI for an object")
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages.  Default is False.")
    parser.add_argument(
        '--frame',
        help="define a frame to get back the JSON object, for use with --id. Default is frame=object",
        default="object")
    parser.add_argument('--type', help="the object's type")
    parser.add_argument(
        '--update',
        default=False,
        action='store_true',
        help="Let the script PATCH/POST the data.  Default is False")
    args = parser.parse_args()

    global DEBUG_ON
    DEBUG_ON = args.debug

    GET_ONLY = args.get_only

    key = encodedcc.ENC_Key(args.keyfile, args.key)
    if args.server and args.authpw and args.authid:
        # command-line credentials override the keyfile entry
        key.server = args.server
        key.authid = args.authid
        key.authpw = args.authpw
        print("Creating authorization data from command line inputs")
    connection = encodedcc.ENC_Connection(key)
    print("Running on {}".format(connection.server))
    if args.update:
        print(
            "This is an UPDATE run! Data will be PATCHed or POSTed accordingly"
        )
    else:
        print("This is a dry run, no data will be changed")

    new_object = False
    identifier = None  # filled in once an @id/uuid/accession is resolved
    if args.id:
        GET_ONLY = True
        print("Taking id to get from --id")
        new_json = {}
        uuid_response = {}
        accession_response = {}
        try:
            id_response = encodedcc.get_ENCODE(args.id,
                                               connection,
                                               frame=args.frame)
        except Exception:  # any fetch failure means the object is new
            id_response = {}
            new_object = True
    else:
        if args.infile:
            infile = open(args.infile, 'r')
        else:
            infile = sys.stdin

        new_json_string = infile.read()

        new_json = json.loads(new_json_string)
        if args.debug:
            encodedcc.pprint_ENCODE(new_json)

        # Probe the server for each identifier the object carries; a 404
        # (or a missing identifier) marks the object as new.
        if '@id' in new_json:
            id_response = encodedcc.get_ENCODE(new_json['@id'], connection)
            if id_response.get("code") == 404:
                id_response = {}
                new_object = True
        else:
            id_response = {}
            new_object = True
        if 'uuid' in new_json:
            uuid_response = encodedcc.get_ENCODE(new_json['uuid'], connection)
            if uuid_response.get("code") == 404:
                uuid_response = {}
                new_object = True
        else:
            uuid_response = {}
            new_object = True
        if 'accession' in new_json:
            accession_response = encodedcc.get_ENCODE(new_json['accession'],
                                                      connection)
            if accession_response.get("code") == 404:
                accession_response = {}
                new_object = True
        else:
            accession_response = {}
            new_object = True

        if new_object:
            print(
                "No identifier in new JSON object.  Assuming POST or PUT with auto-accessioning."
            )

    # Report every identifier that already resolves on the server and
    # warn when they point at different objects.
    object_exists = False
    if id_response:
        object_exists = True
        print("Found matching @id:")
        encodedcc.pprint_ENCODE(id_response)
    if uuid_response:
        object_exists = True
        print("Found matching uuid:")
        encodedcc.pprint_ENCODE(uuid_response)
    if accession_response:
        object_exists = True
        print("Found matching accession")
        encodedcc.pprint_ENCODE(accession_response)

    if id_response and uuid_response and (id_response != uuid_response):
        print("Existing id/uuid mismatch")
    if id_response and accession_response and (id_response !=
                                               accession_response):
        print("Existing id/accession mismatch")
    if uuid_response and accession_response and (uuid_response !=
                                                 accession_response):
        print("Existing uuid/accession mismatch")

    if new_object and object_exists:
        print(
            "Conflict:  At least one identifier already exists and at least one does not exist"
        )

    # Resolve the object's collection against the server's schemas.
    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    supported_collections = list(profiles.keys())
    if "Dataset" not in supported_collections:
        supported_collections.append("Dataset")

    type_list = new_json.pop('@type', [])
    if args.type:
        type_list = [args.type]
    if any(type_list):
        # case-insensitive match against the supported collection names
        findit = False
        for x in supported_collections:
            if x.lower() == type_list[0].lower():
                type_list = [x]
                findit = True
        if findit:
            if args.debug:
                print("Object will have type of", type_list[0])
        else:
            print(
                "Error! JSON object does not contain one of the supported types"
            )
            print("Provided type:", type_list[0])
            print(
                "Please either change the JSON file or define the type with the --type feature"
            )
            sys.exit(1)
    else:
        print("No type found for JSON object!")
        sys.exit(1)

    possible_collections = [x for x in type_list if x in supported_collections]
    if possible_collections:
        collection = possible_collections[0]
    else:
        collection = []
    # Build the URI used for PATCH/PUT; identifiers are popped/echoed
    # exactly as before.  Stays None when the object has none.
    if '@id' in new_json:
        identifier = new_json.pop('@id')
    elif 'uuid' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['uuid'] + '/'
        else:
            identifier = '/' + new_json['uuid'] + '/'
    elif 'accession' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['accession'] + '/'
        else:
            identifier = '/' + new_json['accession'] + '/'
    if 'attachment' in new_json:
        if 'href' in new_json['attachment']:
            pass  # data URI already supplied; use as-is
        else:
            try:
                filename = new_json['attachment']['download']
                print("Setting filename to %s" % (filename))
            except KeyError:
                print("Must specify either href or filename for attachment",
                      file=sys.stderr)
            if new_json['attachment'].get('type'):
                mime_type = new_json['attachment'].get('type')
            else:
                try:
                    # guess_type may return (None, None); the split then
                    # raises and we fall into the error branch below
                    mime_type, encoding = mimetypes.guess_type(filename)
                    major, minor = mime_type.split('/')
                    print("Detected mime type %s" % (mime_type))
                except Exception:
                    print("Failed to detect mime type in file %s" % (filename),
                          file=sys.stderr)
            try:
                with open(filename, 'rb') as stream:
                    print("opened")
                    newvalue = {
                        'download':
                        filename,  # Just echoes the given filename as the download name
                        'type':
                        mime_type,
                        # b64encode returns bytes in Python 3; decode so
                        # the data URI does not embed "b'...'" (bug fix)
                        'href':
                        'data:%s;base64,%s' %
                        (mime_type, b64encode(stream.read()).decode('ascii'))
                    }
                # dump the constructed attachment for inspection; the old
                # code printed the file object to stdout and leaked it
                with open('tmp', 'w') as debug_out:
                    print(newvalue, file=debug_out)
                new_json.update({'attachment': newvalue})  # add
            except Exception:
                print("Cannot open file %s" % (filename), file=sys.stderr)
    # Finally act on the object according to what exists and the flags.
    if object_exists:
        if args.force_put:
            if not GET_ONLY:
                print("Replacing existing object")
                if args.update:
                    e = encodedcc.replace_ENCODE(identifier, connection,
                                                 new_json)
                    print(e)
        else:
            if not GET_ONLY:
                print("PATCHing existing object")
                if args.update:
                    e = encodedcc.patch_ENCODE(identifier, connection,
                                               new_json)
                    print(e)
    elif new_object:
        if args.force_put:
            if not GET_ONLY:
                print("PUT'ing new object")
                if args.update:
                    e = encodedcc.replace_ENCODE(identifier, connection,
                                                 new_json)
                    print(e)
        else:
            if not GET_ONLY:
                print("POST'ing new object")
                if not any(collection):
                    print(
                        "ERROR: Unable to POST to non-existing collection {}".
                        format(collection))
                    sys.exit(1)
                if args.update:
                    e = encodedcc.new_ENCODE(connection, collection, new_json)
                    print(e)
コード例 #30
0
def _seq_info_string(obj, connection):
    """Return a per-replicate sequencing summary for an experiment object.

    Walks the experiment's original fastq files (skipping files whose status
    is in FILE_IGNORE_STATUS) and produces one record per
    (biological, technical) replicate pair, e.g. 'REP1.1 100SE' -- each
    record '\r'-terminated so it renders as a line break in a spreadsheet
    cell.
    """
    rep_dict = {}
    for file_id in obj['original_files']:
        file_object = encodedcc.get_ENCODE(
            file_id.split('/')[2], connection, 'embedded')
        if file_object['status'] in FILE_IGNORE_STATUS:
            continue
        if file_object['file_format'] != 'fastq':
            continue
        if 'replicate' not in file_object:
            continue
        rep_key = (file_object['replicate']['biological_replicate_number'],
                   file_object['replicate']['technical_replicate_number'])
        rep_dict.setdefault(rep_key, set())
        if 'read_length' in file_object and 'run_type' in file_object:
            ended = 'SE' if file_object['run_type'] == 'single-ended' else 'PE'
            rep_dict[rep_key].add(str(file_object['read_length']) + ended)
    seq_info_string = ''
    for rep_key in sorted(rep_dict.keys()):
        seq_info_string += ('REP' + str(rep_key[0]) + '.' + str(rep_key[1]) +
                            ' ' + ', '.join(rep_dict[rep_key]) + '\r')
    return seq_info_string


def _qc_statuses(page):
    """Run the shared QC audits on a frame=page object; return the list of
    problem descriptions found (empty list when the object is clean)."""
    problems = []
    if is_sufficient_read_depth(page) is False:
        problems.append('insufficient read depth')
    if is_not_low_read_depth(page) is False:
        problems.append('low read depth')
    if is_compliant_library_complexity(page) is False:
        problems.append('insufficient/poor library complexity')
    if is_not_moderate_library_complexity(page) is False:
        problems.append('moderate library complexity')
    if is_compliant_library_bottlenecking(page) is False:
        problems.append('severe/moderate library bottlenecking')
    if is_not_mild_library_bottlenecking(page) is False:
        problems.append('mild library bottlenecking')
    return problems


def _audit_cell(entries, data_dict):
    """Render one audit-matrix cell for a list of (accession, aliases)
    entries: each accession with its status, aliases and any problems."""
    aliases = {}
    accession_statuses = {}
    for (acc, al) in entries:
        aliases[acc] = al
        accession_statuses[acc] = []
        # .get() guards against entries that is_interesting() filtered out,
        # which never received a 'statuses' key.
        statuses = data_dict[acc].get('statuses', {})
        for category in statuses:
            if len(statuses[category]) > 0:
                statuses_string = ''
                for status in statuses[category]:
                    statuses_string += '-' + status + '\r'
                accession_statuses[acc].append(statuses_string)
    cell_info = ''
    for acc in accession_statuses:
        cell_info += (acc + ' ' + data_dict[acc]['object']['status'] + '\r' +
                      str(aliases[acc]))
        if accession_statuses[acc]:
            cell_info += '\r' + ''.join(accession_statuses[acc])
        cell_info += '\r\n'
    return 'Experiments number : ' + str(len(entries)) + '\r' + cell_info


def _seq_info_cell(entries, data_dict):
    """Render one run-type-matrix cell: each accession with its status,
    aliases and per-replicate sequencing summary."""
    aliases = {acc: al for (acc, al) in entries}
    cell_info = ''
    for acc in aliases:
        # seq_info is absent for entries skipped by is_interesting();
        # default to an empty summary instead of raising KeyError.
        cell_info += (acc + ' ' + data_dict[acc]['object']['status'] + '\r' +
                      str(aliases[acc]) + '\r' +
                      data_dict[acc].get('seq_info', '') + '\r\n')
    return 'Experiments number : ' + str(len(entries)) + '\r' + cell_info


def main():
    """Audit ENCODE3 ChIP-seq experiments against their controls.

    Fetches experiments and controls for the requested lab/organism/target,
    runs replication/antibody/control/file/QC audits on each, and writes two
    CSV matrices (sample rows x mark columns): an audit matrix
    (args.audit_matrix) listing problems per cell, and a run-type matrix
    (args.run_type_matrix) listing read length / run type per replicate.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    keypair = (key.authid, key.authpw)
    server = key.server
    connection = encodedcc.ENC_Connection(key)

    lab = '&lab.name=' + args.lab
    organism = '&replicates.library.biosample.donor.organism.scientific_name=' + \
               args.organism

    # frame=page carries the audit information, frame=embedded the full
    # metadata; both views are fetched for experiments and for controls.
    histone_experiments_pages = encoded_get(
        server + 'search/?type=Experiment' + '&assay_term_name=ChIP-seq'
        '&award.rfa=ENCODE3' + organism + '&target.investigated_as=' +
        args.target + lab + '&format=json&frame=' + 'page&limit=all',
        keypair)['@graph']
    print("retreived " + str(len(histone_experiments_pages)) +
          " experiment pages")

    histone_controls_pages = encoded_get(
        server + 'search/?type=Experiment' + '&assay_term_name=ChIP-seq'
        '&award.rfa=ENCODE3' + organism + '&target.investigated_as=control' +
        lab + '&format=json&frame=' + 'page&limit=all', keypair)['@graph']
    print("retreived " + str(len(histone_controls_pages)) + " control pages")

    histone_experiments_objects = encoded_get(
        server + 'search/?type=Experiment' + '&assay_term_name=ChIP-seq'
        '&award.rfa=ENCODE3' + organism + '&target.investigated_as=' +
        args.target + lab + '&format=json&frame=' + 'embedded&limit=all',
        keypair)['@graph']
    print("retreived " + str(len(histone_experiments_objects)) +
          " experiment objects")

    histone_controls_objects = encoded_get(
        server + 'search/?type=Experiment' + '&assay_term_name=ChIP-seq'
        '&award.rfa=ENCODE3' + organism + '&target.investigated_as=control' +
        lab + '&format=json&frame=' + 'embedded&limit=all', keypair)['@graph']
    print("retreived " + str(len(histone_controls_objects)) +
          " control objects")

    # matrix[mark][sample] / control_matrix['control'][sample] hold
    # (accession, aliases) pairs; the *_dict maps keep page/object/audit
    # data per accession.
    matrix = {}
    control_matrix = {}
    sample_types = set()
    marks = set()

    histone_experiments_dict = {}
    for entry in histone_experiments_pages:
        histone_experiments_dict[entry['accession']] = {'page': entry}
    for entry in histone_experiments_objects:
        histone_experiments_dict[entry['accession']]['object'] = entry
        sample = entry['biosample_term_name']
        mark = entry['target']['label']
        matrix.setdefault(mark, {}).setdefault(sample, []).append(
            (entry['accession'], entry.get('aliases', 'NO ALIASES')))
        sample_types.add(sample)
        marks.add(mark)

    histone_controls_dict = {}
    for entry in histone_controls_pages:
        histone_controls_dict[entry['accession']] = {'page': entry}
    for entry in histone_controls_objects:
        histone_controls_dict[entry['accession']]['object'] = entry
        sample = entry['biosample_term_name']
        control_matrix.setdefault('control', {}).setdefault(sample, []).append(
            (entry['accession'], entry.get('aliases', 'NO ALIASES')))
        sample_types.add(sample)
        marks.add('control')

    mone = 0  # progress counter
    for ac in histone_experiments_dict:
        page = histone_experiments_dict[ac]['page']
        obj = histone_experiments_dict[ac]['object']
        mone += 1
        #  check only experiments that are not DELETED/REVOKED/REPLACED
        if is_interesting(obj):
            if mone % 10 == 0:
                print('processed ' + str(mone) + ' out of ' +
                      str(len(histone_experiments_dict.keys())))

            statuses = {
                'replication': [],
                'antibody': [],
                'control': [],
                'files': [],
                'qc': []
            }
            if is_replicated(obj) is False or is_replicated(page) is False:
                statuses['replication'].append('unreplicated')
            if is_antibody_eligible(page) is False:
                statuses['antibody'].append('not eligible antybody')
            if is_not_missing_antibody(page) is False:
                statuses['antibody'].append('missing antybody')
            if is_not_mismatched_control(page) is False:
                statuses['control'].append('mismatched controled_by')
            if is_not_mismatched_control_run_type(page) is False:
                statuses['control'].append('mismatched controled_by run_type')
            if is_not_mismatched_control_read_length(page) is False:
                statuses['control'].append(
                    'mismatched controled_by read_length')
            if is_not_missing_controls(page) is False:
                statuses['control'].append('missing control')
            if is_not_missing_paired_with(page) is False:
                statuses['files'].append('missing paired_with files')
            statuses['qc'].extend(_qc_statuses(page))

            # When the controls are present and matched, audit each ENCODE3
            # control's QC too; flag controls outside the ENCODE3 set.
            if is_not_missing_controls(page) is True and \
               is_not_mismatched_control(page) is True:
                not_encode_3_flag = False
                for entry in obj['possible_controls']:
                    control_accession = entry['accession']
                    if control_accession in histone_controls_dict:
                        control_page = histone_controls_dict[
                            control_accession]['page']
                        for problem in _qc_statuses(control_page):
                            statuses['control'].append(problem +
                                                       ' in control')
                    else:
                        not_encode_3_flag = True
                if not_encode_3_flag:
                    statuses['control'].append('non ENCODE3 control')

            histone_experiments_dict[ac]['statuses'] = statuses
            histone_experiments_dict[ac]['seq_info'] = _seq_info_string(
                obj, connection)

    mone = 0
    for ac in histone_controls_dict:
        mone += 1
        page = histone_controls_dict[ac]['page']
        obj = histone_controls_dict[ac]['object']
        # BUG FIX: 'statuses' used to be assigned only inside the
        # is_interesting() branch while the dict assignment below sat outside
        # it, so a non-interesting control inherited the previous control's
        # statuses (or raised NameError when the first control was not
        # interesting). Every control now starts from a clean slate.
        statuses = {'replication': [], 'files': [], 'qc': []}
        if is_interesting(obj):
            if mone % 10 == 0:
                print('processed ' + str(mone) + ' out of ' +
                      str(len(histone_controls_dict.keys())))
            if is_replicated(obj) is False or is_replicated(page) is False:
                statuses['replication'].append('unreplicated')
            if is_not_missing_paired_with(page) is False:
                statuses['files'].append('missing paired_with files')
            statuses['qc'].extend(_qc_statuses(page))

        histone_controls_dict[ac]['statuses'] = statuses
        histone_controls_dict[ac]['seq_info'] = _seq_info_string(
            obj, connection)

    # Column order: control first, then (for histone targets) the core
    # marks, then every other mark encountered.
    marks_to_print = ['control']
    if args.target == "histone":
        marks_to_print.extend(CORE_MARKS)
        for m in marks:
            if m not in CORE_MARKS and m != 'control':
                marks_to_print.append(m)
    else:
        for m in marks:
            if m != 'control':
                marks_to_print.append(m)

    with open(args.audit_matrix, 'w') as output:
        fields = ['sample'] + marks_to_print
        writer = csv.DictWriter(output, fieldnames=fields)
        writer.writeheader()
        for sample in sample_types:
            row = {'sample': sample}
            for mark in marks_to_print:
                if mark == 'control':
                    bucket = control_matrix.get(mark, {})
                    data_dict = histone_controls_dict
                else:
                    # .get() guards against core marks with no experiments,
                    # which previously raised KeyError.
                    bucket = matrix.get(mark, {})
                    data_dict = histone_experiments_dict
                if sample in bucket:
                    row[mark] = _audit_cell(bucket[sample], data_dict)
                else:
                    row[mark] = 'NONE'
            writer.writerow(row)

    with open(args.run_type_matrix, 'w') as output:
        fields = ['sample'] + marks_to_print
        writer = csv.DictWriter(output, fieldnames=fields)
        writer.writeheader()
        for sample in sample_types:
            row = {'sample': sample}
            for mark in marks_to_print:
                if mark == 'control':
                    bucket = control_matrix.get(mark, {})
                    data_dict = histone_controls_dict
                else:
                    bucket = matrix.get(mark, {})
                    data_dict = histone_experiments_dict
                if sample in bucket:
                    row[mark] = _seq_info_cell(bucket[sample], data_dict)
                else:
                    row[mark] = 'NONE'
            writer.writerow(row)