Exemple #1
0
    def files(self):
        '''A dict of all files in the downloads directory of this composite.

        Keys are the bare names returned by os.listdir(); values are
        TrackFile objects.  Entries that are not regular files are skipped.
        Each TrackFile is given the file's md5sum when the md5sum file lists
        one (otherwise None) and the metadata stanza whose 'fileName' field
        names the file (otherwise None).

        The dict is built on first use and cached in self._files.
        '''
        try:
            return self._files
        except AttributeError:
            pass

        # NOTE(review): other code in this file compares the result of
        # readMd5sums() against None, so treat None as "no checksums known"
        # instead of failing on the membership test below.
        md5sums = encodeUtils.readMd5sums(self._md5path)
        if md5sums is None:
            md5sums = dict()

        # reverse index: file name -> the stanza that lists it
        stanzaByName = dict()
        for stanza in self.alphaMetaDb.itervalues():
            if 'fileName' in stanza:
                for name in stanza['fileName'].split(','):
                    stanzaByName[name] = stanza

        # Build into a local and publish only once complete, so an exception
        # part-way through cannot leave a half-filled dict cached.
        found = dict()
        for name in os.listdir(self.downloadsDirectory):
            # assumes downloadsDirectory ends with a path separator
            path = self.downloadsDirectory + name
            if not os.path.isfile(path):
                continue
            md5sum = md5sums[name] if name in md5sums else None
            found[name] = TrackFile(path, md5sum, stanzaByName.get(name))

        self._files = found
        return self._files
Exemple #2
0
 def files(self):
     '''A dict of all files in the downloads directory of this composite.

     Keys are the bare names returned by os.listdir(); values are
     TrackFile objects.  Entries that are not regular files are skipped.
     Each TrackFile is given the file's md5sum when the md5sum file lists
     one (otherwise None) and the metadata stanza whose 'fileName' field
     names the file (otherwise None).

     The dict is built on first use and cached in self._files.
     '''
     try:
         return self._files
     except AttributeError:
         pass

     # NOTE(review): other code in this file compares the result of
     # readMd5sums() against None, so treat None as "no checksums known"
     # instead of failing on the membership test below.
     md5sums = encodeUtils.readMd5sums(self._md5path)
     if md5sums is None:
         md5sums = dict()

     # reverse index: file name -> the stanza that lists it
     stanzaByName = dict()
     for stanza in self.alphaMetaDb.itervalues():
         if 'fileName' in stanza:
             for name in stanza['fileName'].split(','):
                 stanzaByName[name] = stanza

     # Build into a local and publish only once complete, so an exception
     # part-way through cannot leave a half-filled dict cached.
     found = dict()
     for name in os.listdir(self.downloadsDirectory):
         # assumes downloadsDirectory ends with a path separator
         path = self.downloadsDirectory + name
         if not os.path.isfile(path):
             continue
         md5sum = md5sums[name] if name in md5sums else None
         found[name] = TrackFile(path, md5sum, stanzaByName.get(name))

     self._files = found
     return self._files
Exemple #3
0
 def releases(self):
     '''A list with one dict per release directory of this composite.

     Directories named release1, release2, ... under the downloads
     directory are visited in order until the next number does not exist.
     Each dict maps a file name to a TrackFile.  Regular files carry the
     md5sum listed in that release's md5sum.txt when there is one, else
     None.  Sub-directories are ignored unless their path contains
     "supplemental", in which case their entries are added under the key
     "<dir>/<name>" with no md5sum.

     The list is built on first use and cached in self._releaseFiles.
     '''
     try:
         return self._releaseFiles
     except AttributeError:
         pass

     self._releaseFiles = list()
     number = 1

     while True:
         releaseDir = self.downloadsDirectory + 'release' + str(number)
         if not os.path.exists(releaseDir):
             break

         releasepath = releaseDir + '/'
         checksums = encodeUtils.readMd5sums(releasepath + 'md5sum.txt')
         entries = dict()

         for entry in os.listdir(releasepath):
             fullpath = releasepath + entry

             if os.path.isdir(fullpath):
                 # only "supplemental" directories are expanded, one level deep
                 if not re.match('.*supplemental.*', fullpath):
                     continue
                 for inner in os.listdir(fullpath):
                     relative = entry + "/" + inner
                     entries[relative] = TrackFile(releasepath + relative, None)
                 continue

             # md5sum.txt itself is recorded, but never with a checksum
             listed = (entry != 'md5sum.txt'
                       and checksums is not None
                       and entry in checksums)
             if listed:
                 entries[entry] = TrackFile(fullpath, checksums[entry])
             else:
                 entries[entry] = TrackFile(fullpath, None)

         self._releaseFiles.append(entries)
         number = number + 1

     return self._releaseFiles
Exemple #4
0
def _commonInstrumentModel(expId, audit):
    '''Return the instrument model shared by all stanzas of expId, or None.

    Stanzas without a 'seqPlatform' key are ignored.  None is returned when
    no stanza names a platform, or when two stanzas map to different models;
    in the second case a message is printed if audit is true.
    '''
    instrumentModel = None
    for stanza in expId:
        if 'seqPlatform' not in stanza:
            continue
        model = submission.instrumentModels[stanza['seqPlatform']]
        if instrumentModel is None:
            instrumentModel = model
        elif instrumentModel != model:
            if audit:
                print('expId' + str(expId) + ': inconsistent instrument model')
            return None
    return instrumentModel


def _rawFileType(extension):
    '''Translate a raw file extension into the file type written to the soft file.'''
    known = {
        'txt': 'fastq',
        'csfasta': 'SOLiD_native_csfasta',
        'csqual': 'SOLiD_native_qual',
    }
    return known.get(extension, extension)


def _unpackTarredFastqs(tarball, tarpath):
    '''Unpack a tarred raw file under tarpath and return TrackFiles for its contents.

    The archive is extracted into tarpath + <name up to the first '.'> + '/'
    unless that directory already exists.  Extracted *.fastq and *.txt files
    are gzipped.  Every file found afterwards (except names containing
    'reject' or 'md5sum') becomes a track.TrackFile; its md5sum is taken from
    the md5sum.txt next to it when listed there, otherwise it is computed and
    appended to that md5sum.txt.

    tar and gzip are invoked with argument lists (no shell), so paths with
    spaces or shell metacharacters are passed through intact.  Their exit
    status is not checked; OSError is raised if the program cannot be run.

    Raises IOError if tarpath is None.
    '''
    import subprocess

    if tarpath is None:
        raise IOError('this track contains tarred fastqs. Please specify a path through the -z option')

    dirname = tarpath + tarball.name.split('.')[0] + '/'
    if os.path.exists(dirname):
        print(dirname + ' already exists, so not unzipping')
    else:
        print('creating ' + dirname + '...')
        os.mkdir(dirname)
        subprocess.call(['tar', '-xf', tarball.path + tarball.name, '-C', dirname])

    # first pass: compress the uncompressed reads
    for root, dirnames, filenames in os.walk(dirname):
        for filename in filenames:
            if 'reject' in filename or 'md5sum' in filename:
                continue
            if filename.endswith('.fastq') or filename.endswith('.txt'):
                print('gzipping ' + filename)
                subprocess.call(['gzip', root + '/' + filename])

    # second pass: walk again so the freshly gzipped names are the ones seen
    unpacked = list()
    for root, dirnames, filenames in os.walk(dirname):

        rootmd5s = None
        if os.path.isfile(root + '/md5sum.txt'):
            rootmd5s = encodeUtils.readMd5sums(root + '/md5sum.txt')

        for filename in filenames:
            if 'reject' in filename or 'md5sum' in filename:
                continue

            print(root + '/' + filename)

            if rootmd5s is not None and filename in rootmd5s:
                newmd5 = rootmd5s[filename]
            else:
                newmd5 = encodeUtils.hashFile(root + '/' + filename)
                encodeUtils.writeMd5sums(root + '/md5sum.txt', filename, newmd5)

            unpacked.append(track.TrackFile(root + '/' + filename, newmd5))

    return unpacked


def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit, tarpath, argseries, all=False, rep=False):
    '''Build the soft file for a high-throughput composite track.

    createSeries() is called first to fill in the series section.  Then one
    HighThroughputSampleStanza is added per entry of expIds, listing the
    sample's raw files, its supplementary files and, unless geoMapping
    already holds a usable accession for it, its descriptive metadata.

    Parameters:
      compositeTrack -- track object; its files, database, organism and url
                        attributes are read
      cv             -- controlled vocabulary, indexed by term and then field
      expIds         -- mapping of experiment id to a sequence of stanzas
      expVars        -- list of experiment variable names
      geoMapping     -- mapping of experiment id to a GEO sample accession;
                        the value 'Inconsistent' is treated as no accession
      series         -- mapping; 'geoSeriesAccession' is used when present
      datatype       -- object with molecule, strategy, source and selection
      replace        -- mapping of soft key to a sequence of replacement
                        values; only '!Sample_instrument_model' is read here
      audit          -- when true, print notes about instrument model problems
      tarpath        -- directory prefix for unpacking tarred raw files;
                        may be None if no raw file is a .tgz/.tar.gz
      argseries      -- when true, return right after the series is created
      all            -- when true: raw files are skipped, supplementary file
                        checksums are omitted, per-sample progress is not
                        printed, and a pooled 'geoSampleAccession' is used
      rep            -- passed through to sampleTitle()

    Returns a tuple (softfile, fileList) where fileList holds every raw and
    supplementary file object referenced by the samples.

    Raises IOError when a tarred raw file is met and tarpath is None, and
    KeyError when a cell's protocol entry has no ':' separator.
    '''

    print('Creating HighThroughput soft file')

    softfile = HighThroughputSoftFile()
    fileList = list()

    createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit, argseries, all)

    if argseries:
        return softfile, fileList

    for idNum in expIds.iterkeys():

        expId = expIds[idNum]
        firstStanza = expId[0]
        if not all:
            print('Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')')
        sample = HighThroughputSampleStanza(softfile)

        sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1, rep)
        sample['!Sample_type'] = 'SRA'
        sample['!Sample_title'] = sample['^SAMPLE']

        if 'geoSeriesAccession' in series:
            sample['!Sample_series_id'] = series['geoSeriesAccession']

        # None unless every stanza naming a seqPlatform agrees on the model;
        # when None, the model is recorded per file instead
        instrumentModel = _commonInstrumentModel(expId, audit)

        # raw files, numbered from 1 within the sample
        count = 1
        for stanza in expId:

            for fname in stanza['fileName'].split(','):

                trackFile = compositeTrack.files[fname]

                if trackFile.extension == 'fasta':
                    print('WARNING: fastas detected!!!')

                if not isRawFile(trackFile) or all:
                    continue

                if trackFile.name.endswith('.tgz') or trackFile.name.endswith('.tar.gz'):
                    rawFiles = _unpackTarredFastqs(trackFile, tarpath)
                else:
                    rawFiles = [trackFile]

                for f in rawFiles:
                    suffix = str(count)

                    sample['!Sample_raw_file_' + suffix] = linkName(f, compositeTrack)
                    sample['!Sample_raw_file_type_' + suffix] = _rawFileType(f.extension)
                    sample['!Sample_raw_file_checksum_' + suffix] = f.md5sum

                    if instrumentModel is None and 'seqPlatform' in stanza:
                        sample['!Sample_raw_file_instrument_model_' + suffix] = submission.instrumentModels[stanza['seqPlatform']]

                    fileList.append(f)
                    count = count + 1

        # supplementary files, numbered from 1 again, plus pooled metadata
        count = 1

        pooledStanza = dict()

        for stanza in expId:

            for fname in stanza['fileName'].split(','):
                trackFile = compositeTrack.files[fname]

                if not isSupplementaryFile(trackFile):
                    continue

                suffix = str(count)
                sample['!Sample_supplementary_file_' + suffix] = linkName(trackFile, compositeTrack)

                if not all and trackFile.md5sum is not None:
                    sample['!Sample_supplementary_file_checksum_' + suffix] = trackFile.md5sum

                sample['!Sample_supplementary_file_build_' + suffix] = compositeTrack.database

                if instrumentModel is None and 'seqPlatform' in stanza:
                    sample['!Sample_supplementary_file_instrument_model_' + suffix] = submission.instrumentModels[stanza['seqPlatform']]

                fileList.append(trackFile)
                count = count + 1

            # stanzas carrying an objStatus contribute files but no metadata
            if 'objStatus' in stanza:
                continue
            for k in stanza.iterkeys():
                if k not in pooledStanza:
                    pooledStanza[k] = set()
                pooledStanza[k].add(stanza[k])

        # collapse each set of distinct values into one comma-separated string
        for k in pooledStanza:
            pooledStanza[k] = ','.join(pooledStanza[k])

        if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
            sample['!Sample_geo_accession'] = geoMapping[idNum]
        else:

            if all and 'geoSampleAccession' in pooledStanza:
                sample['!Sample_geo_accession'] = pooledStanza['geoSampleAccession']

            sample['!Sample_source_name'] = pooledStanza['cell']
            sample['!Sample_organism'] = compositeTrack.organism

            sample['!Sample_characteristics'] = list()
            allVars = expVars + mdbWhitelist

            for var in allVars:
                if var not in pooledStanza:
                    continue
                value = pooledStanza[var]
                sample['!Sample_characteristics'].append(var + ': ' + value)

                # cvPretend lets a specific "var value" pair borrow the
                # detail fields of another variable; the last match wins
                detailKey = var
                for pretend in cvPretend.iterkeys():
                    if var + ' ' + value == pretend:
                        detailKey = cvPretend[pretend]

                if detailKey in cvDetails:
                    for cvVar in cvDetails[detailKey]:
                        if cvVar in cvOverride and cvVar in pooledStanza:
                            sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + pooledStanza[cvVar])
                        elif cvVar in cv[value]:
                            sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[value][cvVar])
                else:
                    for cvVar in cvDefaults:
                        if value in cv and cvVar in cv[value]:
                            sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[value][cvVar])

            sample['!Sample_biomaterial_provider'] = cv[pooledStanza['cell']]['vendorName']

            if 'treatment' in pooledStanza:
                sample['!Sample_treatment_protocol'] = pooledStanza['treatment']

            if 'protocol' in cv[pooledStanza['cell']]:
                for protocol in cv[pooledStanza['cell']]['protocol'].split(' '):
                    if protocol == 'missing':
                        continue
                    if ':' not in protocol:
                        raise KeyError(protocol + ' is not valid')
                    key, val = protocol.split(':')
                    if key == 'ENCODE' or key == cv[pooledStanza['lab']]['labPi']:
                        sample['!Sample_growth_protocol'] = val

            if datatype.molecule == 'RNA':
                if 'rnaExtract' not in pooledStanza:
                    sample['!Sample_molecule'] = 'total RNA'
                elif pooledStanza['rnaExtract'] in submission.rnaExtractMapping:
                    sample['!Sample_molecule'] = submission.rnaExtractMapping[pooledStanza['rnaExtract']]
                # guard the lookup: a sample may have no 'localization' at all
                elif 'localization' in pooledStanza and pooledStanza['localization'] in submission.localizationMapping:
                    sample['!Sample_molecule'] = submission.localizationMapping[pooledStanza['localization']]

            else:
                sample['!Sample_molecule'] = datatype.molecule

            if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown':
                sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (submission.instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url)
            else:
                sample['!Sample_extract_protocol'] = compositeTrack.url
            sample['!Sample_library_strategy'] = datatype.strategy
            sample['!Sample_library_source'] = datatype.source
            sample['!Sample_library_selection'] = datatype.selection

            # if the instrumentModel is consistent, just use that
            # otherwise take the first seqPlatform value from metadata
            # if that still fails, check the replacement file
            # finally just make it say [REPLACE]
            if instrumentModel is not None:
                sample['!Sample_instrument_model'] = instrumentModel
            else:
                for stanza in expId:
                    if 'seqPlatform' in stanza:
                        sample['!Sample_instrument_model'] = submission.instrumentModels[stanza['seqPlatform']]
                        break
                if '!Sample_instrument_model' not in sample:
                    if '!Sample_instrument_model' in replace:
                        sample['!Sample_instrument_model'] = submission.instrumentModels[replace['!Sample_instrument_model'][0]]
                if '!Sample_instrument_model' not in sample:
                    sample['!Sample_instrument_model'] = '[REPLACE]'
                    if audit:
                        # stanza is whatever the loop above ended on
                        print(stanza.name + ': no instrument')

            sample['!Sample_data_processing'] = compositeTrack.url

        softfile[sample['^SAMPLE']] = sample

    return softfile, fileList
Exemple #5
0
def _commonInstrumentModel(expId, audit):
    '''Return the instrument model shared by all stanzas of expId, or None.

    Stanzas without a 'seqPlatform' key are ignored.  None is returned when
    no stanza names a platform, or when two stanzas map to different models;
    in the second case a message is printed if audit is true.
    '''
    instrumentModel = None
    for stanza in expId:
        if 'seqPlatform' not in stanza:
            continue
        model = submission.instrumentModels[stanza['seqPlatform']]
        if instrumentModel is None:
            instrumentModel = model
        elif instrumentModel != model:
            if audit:
                print('expId' + str(expId) +
                      ': inconsistent instrument model')
            return None
    return instrumentModel


def _rawFileType(extension):
    '''Translate a raw file extension into the soft file's file type.'''
    known = {
        'txt': 'fastq',
        'csfasta': 'SOLiD_native_csfasta',
        'csqual': 'SOLiD_native_qual',
    }
    return known.get(extension, extension)


def _unpackTarredFastqs(tarball, tarpath):
    '''Unpack a tarred raw file under tarpath; return its contents.

    The archive is extracted into tarpath + <name up to the first '.'> + '/'
    unless that directory already exists.  Extracted *.fastq and *.txt files
    are gzipped.  Every file found afterwards (except names containing
    'reject' or 'md5sum') becomes a track.TrackFile; its md5sum is taken from
    the md5sum.txt next to it when listed there, otherwise it is computed and
    appended to that md5sum.txt.

    tar and gzip are invoked with argument lists (no shell), so paths with
    spaces or shell metacharacters are passed through intact.  Their exit
    status is not checked; OSError is raised if the program cannot be run.

    Raises IOError if tarpath is None.
    '''
    import subprocess

    if tarpath is None:
        raise IOError(
            'this track contains tarred fastqs. Please specify a path through the -z option'
        )

    dirname = tarpath + tarball.name.split('.')[0] + '/'
    if os.path.exists(dirname):
        print(dirname + ' already exists, so not unzipping')
    else:
        print('creating ' + dirname + '...')
        os.mkdir(dirname)
        subprocess.call(
            ['tar', '-xf', tarball.path + tarball.name, '-C', dirname])

    # first pass: compress the uncompressed reads
    for root, dirnames, filenames in os.walk(dirname):
        for filename in filenames:
            if 'reject' in filename or 'md5sum' in filename:
                continue
            if filename.endswith('.fastq') or filename.endswith('.txt'):
                print('gzipping ' + filename)
                subprocess.call(['gzip', root + '/' + filename])

    # second pass: walk again so the freshly gzipped names are the ones seen
    unpacked = list()
    for root, dirnames, filenames in os.walk(dirname):

        rootmd5s = None
        if os.path.isfile(root + '/md5sum.txt'):
            rootmd5s = encodeUtils.readMd5sums(root + '/md5sum.txt')

        for filename in filenames:
            if 'reject' in filename or 'md5sum' in filename:
                continue

            print(root + '/' + filename)

            if rootmd5s is not None and filename in rootmd5s:
                newmd5 = rootmd5s[filename]
            else:
                newmd5 = encodeUtils.hashFile(root + '/' + filename)
                encodeUtils.writeMd5sums(root + '/md5sum.txt', filename,
                                         newmd5)

            unpacked.append(track.TrackFile(root + '/' + filename, newmd5))

    return unpacked


def createHighThroughputSoftFile(compositeTrack,
                                 cv,
                                 expIds,
                                 expVars,
                                 geoMapping,
                                 series,
                                 datatype,
                                 replace,
                                 audit,
                                 tarpath,
                                 argseries,
                                 all=False,
                                 rep=False):
    '''Build the soft file for a high-throughput composite track.

    createSeries() is called first to fill in the series section.  Then one
    HighThroughputSampleStanza is added per entry of expIds, listing the
    sample's raw files, its supplementary files and, unless geoMapping
    already holds a usable accession for it, its descriptive metadata.

    Parameters:
      compositeTrack -- track object; its files, database, organism and url
                        attributes are read
      cv             -- controlled vocabulary, indexed by term and then field
      expIds         -- mapping of experiment id to a sequence of stanzas
      expVars        -- list of experiment variable names
      geoMapping     -- mapping of experiment id to a GEO sample accession;
                        the value 'Inconsistent' is treated as no accession
      series         -- mapping; 'geoSeriesAccession' is used when present
      datatype       -- object with molecule, strategy, source and selection
      replace        -- mapping of soft key to a sequence of replacement
                        values; only '!Sample_instrument_model' is read here
      audit          -- when true, print notes about instrument model problems
      tarpath        -- directory prefix for unpacking tarred raw files;
                        may be None if no raw file is a .tgz/.tar.gz
      argseries      -- when true, return right after the series is created
      all            -- when true: raw files are skipped, supplementary file
                        checksums are omitted, per-sample progress is not
                        printed, and a pooled 'geoSampleAccession' is used
      rep            -- passed through to sampleTitle()

    Returns a tuple (softfile, fileList) where fileList holds every raw and
    supplementary file object referenced by the samples.

    Raises IOError when a tarred raw file is met and tarpath is None, and
    KeyError when a cell's protocol entry has no ':' separator.
    '''

    print('Creating HighThroughput soft file')

    softfile = HighThroughputSoftFile()
    fileList = list()

    createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series,
                 datatype, replace, audit, argseries, all)

    if argseries:
        return softfile, fileList

    for idNum in expIds.iterkeys():

        expId = expIds[idNum]
        firstStanza = expId[0]
        if not all:
            print('Writing sample ' + firstStanza['metaObject'] + ' (' +
                  idNum + ')')
        sample = HighThroughputSampleStanza(softfile)

        sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1, rep)
        sample['!Sample_type'] = 'SRA'
        sample['!Sample_title'] = sample['^SAMPLE']

        if 'geoSeriesAccession' in series:
            sample['!Sample_series_id'] = series['geoSeriesAccession']

        # None unless every stanza naming a seqPlatform agrees on the model;
        # when None, the model is recorded per file instead
        instrumentModel = _commonInstrumentModel(expId, audit)

        # raw files, numbered from 1 within the sample
        count = 1
        for stanza in expId:

            for fname in stanza['fileName'].split(','):

                trackFile = compositeTrack.files[fname]

                if trackFile.extension == 'fasta':
                    print('WARNING: fastas detected!!!')

                if not isRawFile(trackFile) or all:
                    continue

                if (trackFile.name.endswith('.tgz')
                        or trackFile.name.endswith('.tar.gz')):
                    rawFiles = _unpackTarredFastqs(trackFile, tarpath)
                else:
                    rawFiles = [trackFile]

                for f in rawFiles:
                    suffix = str(count)

                    sample['!Sample_raw_file_' + suffix] = linkName(
                        f, compositeTrack)
                    sample['!Sample_raw_file_type_' +
                           suffix] = _rawFileType(f.extension)
                    sample['!Sample_raw_file_checksum_' + suffix] = f.md5sum

                    if instrumentModel is None and 'seqPlatform' in stanza:
                        sample['!Sample_raw_file_instrument_model_' +
                               suffix] = submission.instrumentModels[
                                   stanza['seqPlatform']]

                    fileList.append(f)
                    count = count + 1

        # supplementary files, numbered from 1 again, plus pooled metadata
        count = 1

        pooledStanza = dict()

        for stanza in expId:

            for fname in stanza['fileName'].split(','):
                trackFile = compositeTrack.files[fname]

                if not isSupplementaryFile(trackFile):
                    continue

                suffix = str(count)
                sample['!Sample_supplementary_file_' + suffix] = linkName(
                    trackFile, compositeTrack)

                if not all and trackFile.md5sum is not None:
                    sample['!Sample_supplementary_file_checksum_' +
                           suffix] = trackFile.md5sum

                sample['!Sample_supplementary_file_build_' +
                       suffix] = compositeTrack.database

                if instrumentModel is None and 'seqPlatform' in stanza:
                    sample['!Sample_supplementary_file_instrument_model_' +
                           suffix] = submission.instrumentModels[
                               stanza['seqPlatform']]

                fileList.append(trackFile)
                count = count + 1

            # stanzas carrying an objStatus contribute files but no metadata
            if 'objStatus' in stanza:
                continue
            for k in stanza.iterkeys():
                if k not in pooledStanza:
                    pooledStanza[k] = set()
                pooledStanza[k].add(stanza[k])

        # collapse each set of distinct values into one comma-separated string
        for k in pooledStanza:
            pooledStanza[k] = ','.join(pooledStanza[k])

        if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
            sample['!Sample_geo_accession'] = geoMapping[idNum]
        else:

            if all and 'geoSampleAccession' in pooledStanza:
                sample['!Sample_geo_accession'] = pooledStanza[
                    'geoSampleAccession']

            sample['!Sample_source_name'] = pooledStanza['cell']
            sample['!Sample_organism'] = compositeTrack.organism

            sample['!Sample_characteristics'] = list()
            allVars = expVars + mdbWhitelist

            for var in allVars:
                if var not in pooledStanza:
                    continue
                value = pooledStanza[var]
                sample['!Sample_characteristics'].append(var + ': ' + value)

                # cvPretend lets a specific "var value" pair borrow the
                # detail fields of another variable; the last match wins
                detailKey = var
                for pretend in cvPretend.iterkeys():
                    if var + ' ' + value == pretend:
                        detailKey = cvPretend[pretend]

                if detailKey in cvDetails:
                    for cvVar in cvDetails[detailKey]:
                        if cvVar in cvOverride and cvVar in pooledStanza:
                            sample['!Sample_characteristics'].append(
                                var + ' ' + cvVar + ': ' +
                                pooledStanza[cvVar])
                        elif cvVar in cv[value]:
                            sample['!Sample_characteristics'].append(
                                var + ' ' + cvVar + ': ' + cv[value][cvVar])
                else:
                    for cvVar in cvDefaults:
                        if value in cv and cvVar in cv[value]:
                            sample['!Sample_characteristics'].append(
                                var + ' ' + cvVar + ': ' + cv[value][cvVar])

            sample['!Sample_biomaterial_provider'] = cv[
                pooledStanza['cell']]['vendorName']

            if 'treatment' in pooledStanza:
                sample['!Sample_treatment_protocol'] = pooledStanza[
                    'treatment']

            if 'protocol' in cv[pooledStanza['cell']]:
                for protocol in cv[pooledStanza['cell']]['protocol'].split(
                        ' '):
                    if protocol == 'missing':
                        continue
                    if ':' not in protocol:
                        raise KeyError(protocol + ' is not valid')
                    key, val = protocol.split(':')
                    if key == 'ENCODE' or key == cv[
                            pooledStanza['lab']]['labPi']:
                        sample['!Sample_growth_protocol'] = val

            if datatype.molecule == 'RNA':
                if 'rnaExtract' not in pooledStanza:
                    sample['!Sample_molecule'] = 'total RNA'
                elif pooledStanza[
                        'rnaExtract'] in submission.rnaExtractMapping:
                    sample['!Sample_molecule'] = submission.rnaExtractMapping[
                        pooledStanza['rnaExtract']]
                # guard the lookup: a sample may have no 'localization' at all
                elif ('localization' in pooledStanza
                      and pooledStanza['localization'] in
                      submission.localizationMapping):
                    sample['!Sample_molecule'] = (
                        submission.localizationMapping[
                            pooledStanza['localization']])

            else:
                sample['!Sample_molecule'] = datatype.molecule

            if '!Sample_instrument_model' in replace and replace[
                    '!Sample_instrument_model'][0] == 'Unknown':
                sample[
                    '!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (
                        submission.instrumentModels[
                            replace['!Sample_instrument_model'][0]],
                        compositeTrack.url)
            else:
                sample['!Sample_extract_protocol'] = compositeTrack.url
            sample['!Sample_library_strategy'] = datatype.strategy
            sample['!Sample_library_source'] = datatype.source
            sample['!Sample_library_selection'] = datatype.selection

            # if the instrumentModel is consistent, just use that
            # otherwise take the first seqPlatform value from metadata
            # if that still fails, check the replacement file
            # finally just make it say [REPLACE]
            if instrumentModel is not None:
                sample['!Sample_instrument_model'] = instrumentModel
            else:
                for stanza in expId:
                    if 'seqPlatform' in stanza:
                        sample['!Sample_instrument_model'] = (
                            submission.instrumentModels[
                                stanza['seqPlatform']])
                        break
                if '!Sample_instrument_model' not in sample:
                    if '!Sample_instrument_model' in replace:
                        sample['!Sample_instrument_model'] = (
                            submission.instrumentModels[
                                replace['!Sample_instrument_model'][0]])
                if '!Sample_instrument_model' not in sample:
                    sample['!Sample_instrument_model'] = '[REPLACE]'
                    if audit:
                        # stanza is whatever the loop above ended on
                        print(stanza.name + ': no instrument')

            sample['!Sample_data_processing'] = compositeTrack.url

        softfile[sample['^SAMPLE']] = sample

    return softfile, fileList