Exemple #1
0
def main(cmdline=None):
    """Command line entry point: hand a metadata spreadsheet to process_fastqs.

    Parses the options, turns on INFO level logging, creates an ``ENCODED``
    object for the chosen server and calls ``load_netrc()`` on it, opens the
    metadata spreadsheet with ``ODFReader`` and passes everything on to
    ``process_fastqs``.

    :param cmdline: list of argument strings to parse.  ``None`` lets
        argparse read ``sys.argv[1:]``.
    :raises SystemExit: raised by argparse when a required option is missing
        or ``--server`` is not one of the allowed hosts.
    """
    parser = ArgumentParser()
    parser.add_argument(
        '-s',
        '--server',
        required=True,
        choices=['www.encodeproject.org', 'test.encodedcc.org'],
        help='DCC Server to upload to')
    parser.add_argument('-m',
                        '--metadata',
                        required=True,
                        help='Metadata spreadsheet to use')
    parser.add_argument('-f',
                        '--flowcell-details',
                        required=True,
                        help='Flowcell metadata details')
    parser.add_argument('-n', '--dry-run', action='store_true', default=False)
    args = parser.parse_args(cmdline)

    logging.basicConfig(level=logging.INFO)

    logging.info('Server: %s', args.server)
    logging.info('Sheetname: %s', args.metadata)
    server = ENCODED(args.server)
    server.load_netrc()

    book = ODFReader(args.metadata)
    process_fastqs(server, book, args.flowcell_details, args.dry_run)
def encoded_experiment_loader(query_url, experiments=None, server=None):
    """Download the record of every experiment listed by a search query.

    :param query_url: passed to ``server.get_json``.  The response must hold
        an ``@graph`` list whose entries carry an ``@id`` shaped like
        ``/experiments/<accession>/``.
    :param experiments: optional dict of records keyed by accession.  It is
        updated in place and accessions already present are not fetched again.
    :param server: optional object providing ``get_json``.  When omitted an
        ``ENCODED('www.encodeproject.org')`` object is created and
        ``load_netrc()`` is called on it, as before.
    :returns: the ``experiments`` dict (the one passed in, or a new one).
    """
    if server is None:
        server = ENCODED('www.encodeproject.org')
        server.load_netrc()

    if experiments is None:
        experiments = {}

    query = server.get_json(query_url)
    records = query['@graph']
    total = len(records)
    prefix_length = len('/experiments/')
    # Report roughly every tenth of the list; 0 (fewer than ten records)
    # switches the intermediate reports off.
    progress = total // 10

    tzero = time.monotonic()
    tprev = tzero
    for i, record in enumerate(records, start=1):
        # '/experiments/ENCSR.../' -> 'ENCSR...'
        accession = record['@id'][prefix_length:-1]
        if accession not in experiments:
            experiments[accession] = server.get_json(record['@id'])

        if progress != 0 and i % progress == 0:
            tnow = time.monotonic()
            print("Reading {} of {} records in {} seconds".format(
                i, total, tnow - tprev))
            tprev = tnow

    # Read the clock again here: the in-loop timestamp is only refreshed on
    # progress reports, so reusing it would under-report the total (and give
    # zero for short lists).
    print("Read {} records in {} seconds".format(
        total, time.monotonic() - tzero))

    return experiments
Exemple #3
0
    def setUp(self):
        """Create the ENCODED object, canned user record and validator.

        The user record is assigned straight to ``_user``.  Logging at
        WARNING and below is disabled, and the library and biosample schemas
        are read from JSON files that sit next to this module.
        """
        lab = {
            '@id': '/labs/barbara-wold/',
            '@type': ['Lab', 'Item'],
            'country': 'USA',
            'institute_label': 'Caltech',
            'institute_name': 'California Institute of Technology',
            'pi': '/users/0598c868-0b4a-4c5b-9112-8f85c5de5374/',
            'schema_version': '4',
            'title': 'Barbara Wold, Caltech',
            'uuid': '72d5666a-a361-4f7b-ab66-a88e11280937',
        }
        submits_for = [
            '/labs/barbara-wold/',
            '/labs/richard-myers/',
            '/labs/ali-mortazavi/',
        ]
        user = {
            '@context': '/terms/',
            '@id': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/',
            '@type': ['User', 'Item'],
            'first_name': 'Diane',
            'groups': [],
            'job_title': 'Submitter',
            'lab': lab,
            'last_name': 'Trout',
            'schema_version': '5',
            'submits_for': submits_for,
            'uuid': 'bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a',
        }

        self.encode = ENCODED('www.encodeproject.org')
        self.encode._user = user

        logging.disable(logging.WARNING)
        self.validator = DCCValidator(self.encode)

        # Schema files are named <schema>.json and live beside this file.
        here = os.path.dirname(__file__)
        for schema in ('library', 'biosample'):
            schema_file = os.path.join(here, schema + '.json')
            with open(schema_file, 'rt') as instream:
                self.validator._schemas[schema] = json.load(instream)
Exemple #4
0
def main():
    """Set the datasets of a publication to the collected experiment ids.

    Experiment ids come from two submission spreadsheets plus every
    experiment with status ``released`` found by searching for the library
    numbers in the mouse embryo sample workbook.  The collected ids are
    printed next to the datasets already on the publication and then sent
    with ``patch_json``.
    """
    data_id = 'ENCSR574CRQ'
    # test_id = 'TSTSR910688'

    # Use ENCODED('test.encodedcc.org') instead to target the test server.
    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    # A dict with None values keeps the ids unique and in insertion order.
    experiments = {}
    submission_books = (
        'C1-encode3-limb-tranche1-resubmit.ods',
        'C1-mouse-forlimb-submission-201804.ods',
    )
    for book_name in submission_books:
        for accession in load_experiments_from_ods(book_name):
            experiments['/experiments/{}/'.format(accession)] = None

    # NOTE(review): the documented read_excel keyword is ``sheet_name``;
    # confirm ``sheet`` does what is intended with the pandas in use.
    df = pandas.read_excel(
        'Mouse embryo samples list library numbers Diane August 21 2017.xlsx',
        sheet='Sheet 1',
        usecols=[0, 1, 2, 3],
        dtype={2: str, 3: str},
    ).dropna()
    print(df)

    award = '/awards/UM1HG009443/'
    lab = '/labs/barbara-wold/'

    # Add the released experiments found for each library in the bulk list.
    for library_number in df['Library number']:
        alias = 'barbara-wold:{}'.format(library_number)
        graph = server.search_jsonld(searchTerm=alias)
        for result in graph['@graph']:
            if 'Experiment' not in result['@type']:
                continue
            experiment = server.get_json(result['@id'])
            if experiment['status'] == 'released':
                experiments[result['@id']] = None

    pub_id = '/publications/e0d01543-9965-4edb-933c-778a40575cd9/'
    pub = server.get_json(pub_id)
    dataset = [entry['@id'] for entry in pub.get('datasets', [])]

    # Ids already on the publication are deliberately kept in the new list.
    experiments = list(experiments)
    print('posted', len(dataset))
    print('update', len(experiments))
    pprint(experiments)
    print(type(experiments))
    print(type(dataset))
    print(set(experiments) - set(dataset))
    print(server.patch_json(pub_id, {'datasets': experiments}))
def main():
    """Write a table pairing library ids with the ``@id`` of their record.

    The names from ``generate_to_include()`` have ``_mm10`` removed; all but
    the first are fetched through the ``barbara-wold:`` alias.  Names whose
    lookup raises ``HTTPError`` are reported on stdout, the others are saved
    to ``library_id_to_accession.tsv``.
    """
    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    library_ids = [name.replace('_mm10', '') for name in generate_to_include()]

    rows = []
    # The first entry is skipped, exactly as the slice says.
    for library_id in library_ids[1:]:
        try:
            record = server.get_json('barbara-wold:' + library_id)
        except HTTPError:
            print(library_id, 'not found')
            continue
        rows.append((library_id, record['@id']))

    df = pandas.DataFrame(rows, columns=['library_id', 'id'])
    print(df.head())
    df.to_csv('library_id_to_accession.tsv', sep='\t', index=False)
0
def main():
    """Post a publication-data object listing the files in a TSV table.

    The ``file`` column of ``publication_files.tsv`` (last line skipped via
    ``skipfooter=1``) becomes ``related_files``; the response of
    ``post_json`` is printed.
    """
    server = ENCODED('www.encodeproject.org')
    server.load_netrc()
    # Constructed but not used: the validate call below is switched off.
    validator = DCCValidator(server)

    table = pandas.read_csv('publication_files.tsv', sep='\t', skipfooter=1)
    related_files = list(table['file'])

    award = '/awards/UM1HG009443/'
    lab = '/labs/barbara-wold/'
    publication = '/publications/e0d01543-9965-4edb-933c-778a40575cd9/'

    payload = {}
    payload['award'] = award
    payload['lab'] = lab
    payload['related_files'] = related_files
    payload['references'] = [publication]

    # validator.validate(payload, 'publication_data')
    response = server.post_json('/publication-data/', payload)
    print(response)
Exemple #7
0
def main():
    """Patch ``related_files`` of ENCSR574CRQ with files of listed libraries.

    For every library number in the sample workbook the matching experiments
    are fetched.  A file is kept when it has neither ``assembly`` nor
    ``genome_annotation``, or when they are ``mm10`` and ``M4``; every other
    combination is only printed.
    """
    data_id = 'ENCSR574CRQ'
    # test_id = 'TSTSR910688'

    # Use ENCODED('test.encodedcc.org') instead to target the test server.
    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    # NOTE(review): the documented read_excel keyword is ``sheet_name``;
    # confirm ``sheet`` does what is intended with the pandas in use.
    df = pandas.read_excel(
        'Mouse embryo samples list library numbers Diane August 21 2017.xlsx',
        sheet='Sheet 1',
        usecols=[0, 1, 2, 3],
        dtype={2: str, 3: str},
    ).dropna()
    print(df)

    award = '/awards/UM1HG009443/'
    lab = '/labs/barbara-wold/'

    # file @id -> submitted file name; the dict also removes duplicates.
    files = {}
    for library_number in df['Library number']:
        alias = 'barbara-wold:{}'.format(library_number)
        graph = server.search_jsonld(searchTerm=alias)
        for result in graph['@graph']:
            if 'Experiment' not in result['@type']:
                continue
            experiment = server.get_json(result['@id'])
            print(result['@id'], library_number)
            for f in experiment['files']:
                assembly = f.get('assembly')
                genome_annotation = f.get('genome_annotation')
                unmapped = assembly is None and genome_annotation is None
                mm10_m4 = assembly == 'mm10' and genome_annotation == 'M4'
                if unmapped or mm10_m4:
                    files[f['@id']] = f.get('submitted_file_name')
                    print(f['@id'], f.get('submitted_file_name'))
                else:
                    print('  ', assembly, genome_annotation)

    # Only needed by the disabled "create a new object" path:
    #     result = server.post_json('/publication-data/', payload)
    payload = {
        'award': award,
        'lab': lab,
        'related_files': list(files.keys()),
        'references': ['/publications/e0d01543-9965-4edb-933c-778a40575cd9/'],
    }

    results = server.patch_json(data_id, {'related_files': list(files)})
    print(results)
    print(results.get('@id'))
Exemple #8
0
def main(cmdline=None):
    """Label each library of a spreadsheet with its tranche; save tranche.csv.

    The first column of the chosen sheet supplies the library ids.  An id
    found among the replicates of ``paper_433_experiment_files`` gets tranche
    1, one found only among those of ``ASOF_RUN17_experiment_files`` gets
    tranche 2, and anything else gets ``'C'``.  The row returned by
    ``find_library_info`` for each id is extended with that label and all
    rows are written to ``tranche.csv``.

    :param cmdline: list of argument strings to parse.  ``None`` lets
        argparse read ``sys.argv[1:]``.
    :raises SystemExit: raised by argparse when the filename is missing.
    """
    parser = ArgumentParser()
    parser.add_argument('-s', '--sheet', default=0, help='Sheet to use')
    parser.add_argument('--header', default=None, help="header row")
    parser.add_argument('filename', nargs=1, help='spreadsheet to look at')
    args = parser.parse_args(cmdline)

    header = int(args.header) if args.header is not None else None
    book = ODFReader(args.filename[0])
    data = book.parse(args.sheet, header=header)

    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    first_experiments = models.load_experiments(
        to_files(paper_433_experiment_files))
    all_experiments = models.load_experiments(
        to_files(ASOF_RUN17_experiment_files))

    first_libraries = set(parse_replicates(first_experiments['replicates']))
    all_libraries = set(parse_replicates(all_experiments['replicates']))

    results = []
    for i, library_id in enumerate(data[data.columns[0]], start=1):
        if library_id in first_libraries:
            tranche = 1
        elif library_id in all_libraries:
            tranche = 2
        else:
            tranche = 'C'

        row = find_library_info(server, library_id)
        row['tranche'] = tranche
        results.append(row)

        # One progress dot per ten libraries.  The earlier truthiness test
        # printed on every library except each tenth.
        if i % 10 == 0:
            print('.', end='', flush=True)

    df = pandas.DataFrame(results)
    df.to_csv('tranche.csv', index=False)
def main(cmdline=None):
    """Print how many submitted file accessions are on ENCSR574CRQ.

    Compares the ``accession`` column of the ``File`` sheet in the submission
    workbook with the ``files`` list of the fetched object and prints both
    counts and the size of their intersection.

    :param cmdline: accepted for symmetry with other entry points; unused.
    """
    data_id = 'ENCSR574CRQ'

    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    book = pandas.ExcelFile(
        'C1-mouse-forelimb-submission-201907-uploaded-production.xlsx')
    files = book.parse('File')

    publication = server.get_json(data_id)
    # '/files/<accession>/' -> '<accession>'
    prefix_length = len('/files/')
    publication_accessions = []
    for path in publication['files']:
        publication_accessions.append(path[prefix_length:-1])

    print('submitted', len(files['accession']))
    print('posted', len(publication_accessions))
    submitted = set(files['accession'].values)
    shared = submitted.intersection(publication_accessions)
    print('Intersect', len(shared))
Exemple #10
0
def main(cmdline=None):
    """Ask, then replace ``related_files`` of ENCSR226XLF from a workbook.

    The ``accession`` column of the ``File`` sheet is turned into
    ``/files/<accession>/`` paths.  The patch is sent only when the answer
    typed at the prompt starts with ``y`` (any case).

    :param cmdline: accepted for symmetry with other entry points; unused.
    """
    data_id = 'ENCSR226XLF'
    book_name = 'C1-mouse-forelimb-submission-201907-uploaded-production.xlsx'

    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    file_sheet = pandas.ExcelFile(book_name).parse('File')
    files = []
    for accession in file_sheet['accession']:
        files.append('/files/{}/'.format(accession))
    print('Have {} files to update with'.format(len(files)))

    publication = server.get_json(data_id)
    print(publication['description'])
    print('Publication has {} files'.format(len(publication['files'])))

    prompt = "Replace with {} files from {}? ".format(len(files), book_name)
    answer = input(prompt)
    if not answer.lower().startswith('y'):
        return

    response = server.patch_json(data_id, {'related_files': files})
    print(response)
Exemple #11
0
class TestEncoded(TestCase):
    """Tests for the ENCODED helper object and the DCCValidator built on it.

    Covers URL preparation, schema validation of library and biosample
    objects, alias lookups between validated objects, and construction of
    JSON-LD contexts.
    """

    def setUp(self):
        """Build the ENCODED object, a canned user record and the validator.

        The user record is assigned directly to ``_user`` and the library and
        biosample schemas are loaded from JSON files located next to this
        module.
        """
        self.encode = ENCODED('www.encodeproject.org')
        self.encode._user = {
            '@context': '/terms/',
            '@id': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/',
            '@type': ['User', 'Item'],
            'first_name': 'Diane',
            'groups': [],
            'job_title': 'Submitter',
            'lab': {
                '@id': '/labs/barbara-wold/',
                '@type': ['Lab', 'Item'],
                'country': 'USA',
                'institute_label': 'Caltech',
                'institute_name': 'California Institute of Technology',
                'pi': '/users/0598c868-0b4a-4c5b-9112-8f85c5de5374/',
                'schema_version': '4',
                'title': 'Barbara Wold, Caltech',
                'uuid': '72d5666a-a361-4f7b-ab66-a88e11280937'
            },
            'last_name': 'Trout',
            'schema_version': '5',
            'submits_for': ['/labs/barbara-wold/',
                            '/labs/richard-myers/',
                            '/labs/ali-mortazavi/'],
            'uuid': 'bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a',
            }

        # Silence log records at WARNING and below while the tests run;
        # tearDown switches logging back on.
        logging.disable(logging.WARNING)
        self.validator = DCCValidator(self.encode)
        # Pre-populate the validator's schema table from local files.
        for schema, filename in [('library', 'library.json'),
                                 ('biosample', 'biosample.json')]:
            schema_file = os.path.join(os.path.dirname(__file__), filename)
            with open(schema_file, 'rt') as instream:
                self.validator._schemas[schema] = json.load(instream)

    def tearDown(self):
        """Re-enable the logging that setUp disabled."""
        logging.disable(logging.NOTSET)

    def test_prepare_url(self):
        """prepare_url turns paths and aliases into full https URLs.

        A URL that is already absolute must come back unchanged.
        """
        tests = [
            ('/experiments', 'https://www.encodeproject.org/experiments'),
            ('/experiments/ENCLB045ZZZ',
             'https://www.encodeproject.org/experiments/ENCLB045ZZZ'),
            ('https://www.encodeproject.org/experiments/ENCLB045ZZZ',
             'https://www.encodeproject.org/experiments/ENCLB045ZZZ'),
            ('barbara-wold:11111',
             'https://www.encodeproject.org/barbara-wold:11111')
        ]
        for url, result in tests:
            self.assertEqual(self.encode.prepare_url(url), result)

    def test_validate_library(self):
        """Test validation of a Library object

        The base object must validate.  Adding ``schema_version``, setting
        ``nucleic_acid_term_name`` to ``SO:0000871`` or adding ``uuid`` must
        each raise ``jsonschema.ValidationError``.
        """
        obj = {
            u'@id': u'/libraries/ENCLB045ZZZ/',
            u'@type': [u'Library', u'Item'],
            u'aliases': [],
            u'alternate_accessions': [],
            u'award': u'/awards/U54HG006998/',
            u'biosample': u'/biosamples/ENCBS089RNA/',
            u'date_created': u'2014-01-14T19:44:51.061770+00:00',
            u'documents': [],
            u'extraction_method': u'Ambion mirVana',
            u'fragmentation_method': u'chemical (Nextera tagmentation)',
            u'lab': u'/labs/barbara-wold/',
            u'library_size_selection_method': u'SPRI beads',
            u'lysis_method': u'Ambion mirVana',
            u'nucleic_acid_term_name': u'polyadenylated mRNA',
            u'size_range': u'>200',
            u'status': u'released',
            u'strand_specificity': False,
            u'treatments': [],
        }
        self.validator.validate(obj, 'library')

        # test requestMethod
        obj['schema_version'] = u'2'
        self.assertRaises(jsonschema.ValidationError, self.validator.validate, obj, 'library')
        del obj['schema_version']

        # test calculatedProperty
        obj['nucleic_acid_term_name'] = u'SO:0000871'
        self.assertRaises(jsonschema.ValidationError, self.validator.validate, obj, 'library')
        # Note: this drops the key instead of restoring the original value.
        del obj['nucleic_acid_term_name']

        # test permissionValidator
        obj['uuid'] = u'42c46028-708f-4347-a3df-2c82dfb021c4'
        self.assertRaises(jsonschema.ValidationError, self.validator.validate, obj, 'library')
        del obj['uuid']

    def test_validate_biosample(self):
        """Test validation of a Biosample object and its linked values.

        The object must validate with a new alias, with a ``part_of`` alias
        and with no aliases.  Changing ``lab`` or ``organism`` to the values
        used below must raise ``jsonschema.ValidationError``.
        """
        bio = {
            'aliases': ['barbara-wold:c1_e12.5_mouse_limb_donor'],
            'award': 'U54HG006998',
            'biosample_term_id': 'UBERON:0002101',
            'biosample_term_name': 'limb',
            'biosample_type': 'tissue',
            'date_obtained': '2017-02-01',
            'description': 'C57Bl6 wild-type embryonic mouse',
            'donor': '/mouse-donors/ENCDO956IXV/',
            'lab': '/labs/barbara-wold',
            'model_organism_age': '12.5',
            'model_organism_age_units': 'day',
            'mouse_life_stage': 'embryonic',
            'organism': '3413218c-3d86-498b-a0a2-9a406638e786',
            'source': '/sources/gems-caltech/',
            'starting_amount': 1,
            'starting_amount_units': 'items',
        }

        # test aliases
        bio['aliases'] = ['barbara-wold:c1-donor']
        self.validator.validate(bio, 'biosample')
        del bio['aliases']

        # test part_of
        # refers to the alias that was validated just above
        bio['part_of'] = 'barbara-wold:c1-donor'
        self.validator.validate(bio, 'biosample')
        del bio['part_of']

        # tests linkTo
        self.validator.validate(bio, 'biosample')
        bio['organism'] = '/organisms/mouse/'

        # presumably rejected because this lab is not in the submits_for
        # list of the user created in setUp -- confirm against DCCValidator
        bio['lab'] = '/labs/alkes-price/'
        self.assertRaises(jsonschema.ValidationError, self.validator.validate, bio, 'biosample')
        bio['lab'] = '/labs/barbara-wold'

        bio['organism'] = "7745b647-ff15-4ff3-9ced-b897d4e2983c"
        self.assertRaises(jsonschema.ValidationError, self.validator.validate, bio, 'biosample')
        bio['organism'] = "/organisms/human"
        self.assertRaises(jsonschema.ValidationError, self.validator.validate, bio, 'biosample')
        bio['organism'] = '/organisms/mouse/'


    def test_aliases(self):
        """Test that objects being validated can access previous ones by alias

        Some properties can require that an object exist, since we validate
        before submitting, we need to be able cache object by its alias and use
        that to retrieve it.
        """
        donor = {
            'aliases': ['barbara-wold:c1_e12.5_mouse_limb_donor'],
            'award': 'U54HG006998',
            'biosample_term_id': 'UBERON:0002101',
            'biosample_term_name': 'limb',
            'biosample_type': 'tissue',
            'date_obtained': '2017-02-01',
            'description': 'C57Bl6 wild-type embryonic mouse',
            'donor': '/mouse-donors/ENCDO956IXV/',
            'lab': '/labs/barbara-wold',
            'model_organism_age': '12.5',
            'model_organism_age_units': 'day',
            'mouse_life_stage': 'embryonic',
            'organism': '3413218c-3d86-498b-a0a2-9a406638e786',
            'source': '/sources/gems-caltech/',
            'starting_amount': 1,
            'starting_amount_units': 'items',
        }
        part = donor.copy()
        part['aliases'] = ['barbara-wold:A7']
        part['part_of'] = 'barbara-wold:c1_e12.5_mouse_limb_donor'
        # The donor alias has not been validated yet, so the part must fail.
        self.assertRaises(jsonschema.ValidationError, self.validator.validate, part, 'biosample')

        # Once the donor has been validated the same part must pass.
        self.validator.validate(donor, 'biosample')
        self.validator.validate(part, 'biosample')

    def test_create_context(self):
        """create_jsonld_context sets @vocab and marks linked fields as ids.

        The namespaces must be absent until add_jsonld_namespaces is called.
        """
        linked_id = {'@type': '@id'}
        library = {'@id': '/libraries/1234', '@type': ['Library', 'Item']}

        url = self.encode.prepare_url(library['@id'])
        context = self.encode.create_jsonld_context(library, url)
        self.assertEqual(context['@vocab'], 'https://www.encodeproject.org/profiles/library.json#')
        self.assertEqual(context['award'], linked_id )
        self._verify_context(context, 'Library')
        # namespaces not added yet.
        self.assertRaises(AssertionError, self._verify_namespaces, context)
        self.encode.add_jsonld_namespaces(context)
        self._verify_namespaces(context)

    def test_add_context(self):
        """Checking to make sure nested @base and @vocab urls are set correctly
        """
        obj = {
            "nucleic_acid_term_name": "RNA",
            "accession": "ENCLB044ZZZ",
            "@id": "/libraries/ENCLB044ZZZ/",
            "schema_version": "1",
            "@type": [
                "Library",
                "Item"
            ],
            "lysis_method": "Ambion mirVana",
            "nucleic_acid_term_id": "SO:0000356",
            "biosample": {
                "biosample_term_name": "GM12878",
                "description": "B-lymphocyte, lymphoblastoid, International HapMap Project - CEPH/Utah - European Caucasion, Epstein-Barr Virus",
                "accession": "ENCBS090RNA",
                "date_created": "2013-10-29T21:15:29.144260+00:00",
                "@id": "/biosamples/ENCBS090RNA/",
                "aliases": [
                "brenton-graveley:GM12878-2",
                "thomas-gingeras:191WC"
                ],
                "organism": "/organisms/human/",
                "@type": [
                "Biosample",
                "Item"
                ]
            },
        }

        # Expected @base of the embedded biosample's own context.
        bio_base = self.encode.prepare_url(obj['biosample']['@id'])

        url = self.encode.prepare_url('/libraries/ENCLB044ZZZ/?format=json&embed=False')
        obj_type = self.encode.get_object_type(obj)
        schema_url = self.encode.get_schema_url(obj_type)
        # Adds '@context' entries to obj and to the embedded biosample.
        self.encode.add_jsonld_context(obj, url)

        self.assertEqual(obj['biosample']['@context']['@base'], bio_base)
        self.assertEqual(obj['@context']['@vocab'], schema_url)
        self._verify_context(obj['@context'], 'Library')
        self._verify_namespaces(obj['@context'])
        self._verify_context(obj['biosample']['@context'], 'Biosample')
        self.assertEqual(obj['@context']['rdf'], 'http://www.w3.org/1999/02/22-rdf-syntax-ns#')
        self.assertEqual(obj['@context']['OBO'], 'http://purl.obolibrary.org/obo/')


    def test_convert_search_to_jsonld(self):
        """convert_search_to_jsonld must drop the dotted 'award.rfa' key.

        Only that key is checked on each object of the returned ``@graph``.
        """
        example = {'count': {'biosamples': 2},
                   'portal_title': 'ENCODE',
                   'title': 'Search',
                   'notification': 'Success',
                   'filters': [],
                   '@id': '/search/?searchTerm=wold',
                   '@type': ['search'],
                   'facets': [],
                   '@graph': [{
                       u'@id': u'/biosamples/ENCBS125ENC/',
                       u'@type': [u'Biosample', u'Item'],
                       u'accession': u'ENCBS125ENC',
                       u'award.rfa': u'ENCODE2-Mouse',
                       u'biosample_term_name': u'myocyte',
                       u'biosample_type': u'in vitro differentiated cells',
                       u'characterizations.length': [],
                       u'constructs.length': [],
                       u'lab.title': u'Barbara Wold, Caltech',
                       u'life_stage': u'unknown',
                       u'organism.name': u'mouse',
                       u'source.title': u'Barbara Wold',
                       u'status': u'CURRENT',
                       u'treatments.length': []},
                              {u'@id': u'/biosamples/ENCBS126ENC/',
                               u'@type': [u'Biosample', u'Item'],
                               u'accession': u'ENCBS126ENC',
                               u'award.rfa': u'ENCODE2-Mouse',
                               u'biosample_term_name': u'myocyte',
                               u'biosample_type': u'in vitro differentiated cells',
                               u'characterizations.length': [],
                               u'constructs.length': [],
                               u'lab.title': u'Barbara Wold, Caltech',
                               u'life_stage': u'unknown',
                               u'organism.name': u'mouse',
                               u'source.title': u'Barbara Wold',
                               u'status': u'CURRENT',
                               u'treatments.length': []},
                   ]}

        result = self.encode.convert_search_to_jsonld(example)
        for obj in result['@graph']:
            self.assertNotIn('award.rfa', obj)

    def _verify_context(self, context, obj_type):
        """Assert that context holds every expected entry for obj_type.

        Checks the entries of ``ENCODED_CONTEXT[None]`` and of
        ``ENCODED_CONTEXT[obj_type]``: each key must be present in
        ``context`` with an equal value.
        """
        for context_key in [None, obj_type]:
            for k in ENCODED_CONTEXT[context_key]:
                self.assertIn(k, context)
                self.assertEqual(ENCODED_CONTEXT[context_key][k], context[k])

    def _verify_namespaces(self, context):
        """Assert that every ENCODED_NAMESPACES entry is present in context.

        Raises ``AssertionError`` (through the assert methods) when a
        namespace is missing or has a different value.
        """
        for k in ENCODED_NAMESPACES:
            self.assertIn(k, context)
            self.assertEqual(ENCODED_NAMESPACES[k], context[k])