예제 #1
0
    def endElement(self, name):
        """SAX callback fired when an element closes.

        On </Result> the accumulated self.info fields are turned into a
        CGHubRecordInfo, de-duplicated by bam filename, folded into the
        min/max live-file-size tracking, and the per-record parse state is
        reset.  Raises LimitReachedException once the configured record
        limit is hit.
        """
        if name == 'Result':
            self.count += 1
            if self.count % 256 == 0:
                self.log('processed %s file references' % (self.count))
            self.totalFileSize += int(self.info[CGHubRecordInfo.bamFilesize_index])
            record = CGHubRecordInfo(self.info)
            bamfile = record.files['bam']
            filename = bamfile.filename
            if self.removedups and filename in self.filename2cghubRecords:
                # duplicate bam filename: keep whichever upload is newest
                self.dupcount += 1
                kept = self.createDateTime(self.filename2cghubRecords[filename].upload_date)
                candidate = self.createDateTime(record.upload_date)
                if kept < candidate:
                    self.filename2cghubRecords[filename] = record
            else:
                self.filename2cghubRecords[filename] = record
            # reset the per-record parsing state for the next <Result>
            self.inResult = False
            self.info = ['' for _ in range(CGHubRecordInfo.infoCount)]
            self.fileIndices = 0
            if record.state == 'live':
                # track the smallest non-zero and the largest bam sizes seen
                if bamfile.filesize and self.minmaxsize['min'].filesize > bamfile.filesize:
                    self.minmaxsize['min'] = bamfile
                if self.minmaxsize['max'].filesize < bamfile.filesize:
                    self.minmaxsize['max'] = bamfile
                if not bamfile.filesize:
                    self.log('no file size: %s--%s' % (record.write(), bamfile.write()))
            if self.limit and self.count >= self.limit_count:
                # stop parsing once enough records have been processed
                self.endDocument()
                raise LimitReachedException()
        elif name in element2index and element2index[name] == -1 and self.hasChars:
            # deliberately ignored: characters appeared in an unmapped element
            pass
        self.inElement = -1
        self.hasChars = False
예제 #2
0
def main(platform,
         type_uri='detail',
         log=None,
         removedups=False,
         limit=-1,
         verbose=False,
         print_response=False):
    """Fetch the CGHub latest_manifest and index its records by bam filename.

    Parameters:
        platform: substring that must appear in a record's study field for
            the record to be kept.
        type_uri, limit, print_response: accepted for interface
            compatibility; not used in this function body.
        log: optional logger handed to the util.log_* helpers.
        removedups: when True, a duplicate bam filename keeps only the most
            recently uploaded record whose state is 'Live'.
        verbose: forwarded to statistics().

    Returns:
        (records, minmaxsize, archives): the kept CGHubRecordInfo values,
        the {'min': ..., 'max': ...} live bam file-size extremes, and the
        raw manifest text.

    Raises:
        Re-raises any exception hit while fetching the manifest.
    """
    util.log_info(log, 'begin reading cghub archive')
    filename2cghubRecords = {}
    # sentinels: 'min' starts huge and 'max' tiny so the first live record
    # replaces them
    minmaxsize = {
        'min': CGHubFileInfo('', 500000000000, ''),
        'max': CGHubFileInfo('', 1, '')
    }
    try:
        # NOTE(review): manifest_uri is presumably a module-level constant —
        # confirm; it is not defined in this function.
        response = urllib.urlopen(manifest_uri)
        archives = response.read()

        lines = archives.split('\n')
        util.log_info(
            log,
            '\tarchive size is %s with %s lines' % (len(archives), len(lines)))
        util.log_info(log, '\n\t' + '\n\t'.join(lines[:10]))
    except Exception as e:
        util.log_exception(log, 'problem fetching latest_manifest: %s')
        raise e

    # map manifest column positions to the header names we care about
    headers = lines[0].split('\t')
    column_index2header = {}
    for index, header in enumerate(headers):
        if header in header2record_index.keys():
            column_index2header[index] = header

    count = 0
    dupcount = 0
    for line in lines[1:]:
        if not line:
            continue
        if 0 == count % 4096:
            util.log_info(log, 'processed %s records' % (count))
        count += 1
        fields = line.split('\t')
        header2record = {}
        try:
            for index in column_index2header.keys():
                header2record[header2record_index[
                    column_index2header[index]]] = fields[index]
        except Exception:
            util.log_info(log,
                          'problem with parsing line(%s): %s' % (count, line))
            # BUG FIX: skip the malformed line instead of falling through to
            # use a partially-populated header2record (which raised KeyError
            # on the study-field lookup below).
            continue
        if platform not in header2record[CGHubRecordInfo.study_index]:
            continue
        header2record.update(index2none)
        record = CGHubRecordInfo(header2record)
        filename = header2record[CGHubRecordInfo.bamFilename_index]
        if removedups and filename in filename2cghubRecords:
            if 'Live' == header2record[CGHubRecordInfo.state_index]:
                dupcount += 1
                # check the dates and keep the latest
                currentdate = createDateTime(
                    filename2cghubRecords[filename].upload_date)
                newdate = createDateTime(record.upload_date)
                if currentdate < newdate:
                    filename2cghubRecords[filename] = record
        else:
            filename2cghubRecords[filename] = record
        if 'Live' == record.state:
            # track the smallest non-zero and the largest live bam sizes
            if minmaxsize['min'].filesize > record.files[
                    'bam'].filesize and record.files['bam'].filesize:
                minmaxsize['min'] = record.files['bam']
            if minmaxsize['max'].filesize < record.files['bam'].filesize:
                minmaxsize['max'] = record.files['bam']
            if not record.files['bam'].filesize:
                util.log_info(
                    log, 'no file size: %s--%s' %
                    (record.write(), record.files['bam'].write()))

    statistics(log, filename2cghubRecords, minmaxsize, verbose)
    util.log_info(
        log,
        'finished reading cghub archive.  %s total records, %s duplicates' %
        (count, dupcount))
    return filename2cghubRecords.values(), minmaxsize, archives
def main(platform, type_uri='detail', log=None, removedups=False, limit=-1, verbose=False, print_response=False):
    """Fetch the CGHub latest_manifest and index its records by bam filename.

    NOTE(review): this is a duplicate definition of main() that shadows the
    earlier one in this file, and it returns only (records, minmaxsize)
    rather than including the raw archive text — confirm which version is
    intended and delete the other.

    Parameters:
        platform: substring that must appear in a record's study field for
            the record to be kept.
        type_uri, limit, print_response: accepted for interface
            compatibility; not used in this function body.
        log: optional logger handed to the util.log_* helpers.
        removedups: when True, a duplicate bam filename keeps only the most
            recently uploaded record whose state is 'Live'.
        verbose: forwarded to statistics().

    Returns:
        (records, minmaxsize): the kept CGHubRecordInfo values and the
        {'min': ..., 'max': ...} live bam file-size extremes.

    Raises:
        Re-raises any exception hit while fetching the manifest.
    """
    util.log_info(log, 'begin reading cghub archive')
    filename2cghubRecords = {}
    # sentinels: 'min' starts huge and 'max' tiny so the first live record
    # replaces them
    minmaxsize = {'min': CGHubFileInfo('', 500000000000, ''), 'max': CGHubFileInfo('', 1, '')}
    try:
        # NOTE(review): manifest_uri is presumably a module-level constant —
        # confirm; it is not defined in this function.
        response = urllib.urlopen(manifest_uri)
        archives = response.read()

        lines = archives.split('\n')
        util.log_info(log, '\tarchive size is %s with %s lines' % (len(archives), len(lines)))
        util.log_info(log, '\n\t' + '\n\t'.join(lines[:10]))
    except Exception as e:
        util.log_exception(log, 'problem fetching latest_manifest: %s')
        raise e

    # map manifest column positions to the header names we care about
    headers = lines[0].split('\t')
    column_index2header = {}
    for index, header in enumerate(headers):
        if header in header2record_index.keys():
            column_index2header[index] = header

    count = 0
    dupcount = 0
    for line in lines[1:]:
        if not line:
            continue
        if 0 == count % 4096:
            util.log_info(log, 'processed %s records' % (count))
        count += 1
        fields = line.split('\t')
        header2record = {}
        try:
            for index in column_index2header.keys():
                header2record[header2record_index[column_index2header[index]]] = fields[index]
        except Exception:
            util.log_info(log, 'problem with parsing line(%s): %s' % (count, line))
            # BUG FIX: skip the malformed line instead of falling through to
            # use a partially-populated header2record (which raised KeyError
            # on the study-field lookup below).
            continue
        if platform not in header2record[CGHubRecordInfo.study_index]:
            continue
        header2record.update(index2none)
        record = CGHubRecordInfo(header2record)
        filename = header2record[CGHubRecordInfo.bamFilename_index]
        if removedups and filename in filename2cghubRecords:
            if 'Live' == header2record[CGHubRecordInfo.state_index]:
                dupcount += 1
                # check the dates and keep the latest
                currentdate = createDateTime(filename2cghubRecords[filename].upload_date)
                newdate = createDateTime(record.upload_date)
                if currentdate < newdate:
                    filename2cghubRecords[filename] = record
        else:
            filename2cghubRecords[filename] = record
        if 'Live' == record.state:
            # track the smallest non-zero and the largest live bam sizes
            if minmaxsize['min'].filesize > record.files['bam'].filesize and record.files['bam'].filesize:
                minmaxsize['min'] = record.files['bam']
            if minmaxsize['max'].filesize < record.files['bam'].filesize:
                minmaxsize['max'] = record.files['bam']
            if not record.files['bam'].filesize:
                util.log_info(log, 'no file size: %s--%s' % (record.write(), record.files['bam'].write()))

    statistics(log, filename2cghubRecords, minmaxsize, verbose)
    util.log_info(log, 'finished reading cghub archive.  %s total records, %s duplicates' % (count, dupcount))
    return filename2cghubRecords.values(), minmaxsize