Beispiel #1
0
def LoadDataSet(vcfInfile, traningSet, qFaLen):
    """Parse a VCF file into a header object and an array of variant datums.

    For every non-header record that carries an ``InbCoeff=`` entry in its
    INFO column and a ``QR`` key in its FORMAT column, one annotation row
    ``[isBiallelic, inbCoeff, leg, n, alt, bot]`` is built per usable sample
    and the per-column median over the samples is stored on a new
    ``vd.VariantDatum``.

    Args:
        vcfInfile: Path of the VCF file. A path ending in ``.gz`` is
            decompressed through an external ``gzip -dc`` process.
        traningSet: Non-empty container of ``"<CHROM>:<POS>"`` keys; a datum
            whose key is in it gets ``atTrainingSite = True``.
        qFaLen: Nested mapping ``qFaLen[sampleName][queryId]`` giving the
            length used to rescale query coordinates to percent.

    Returns:
        A tuple ``(hInfo, data)``: the ``VCF.VCFHeader`` fed with every
        header line, and a numpy array of the ``VariantDatum`` objects in
        file order.

    Raises:
        ValueError: if ``traningSet`` is empty, the ``#CHROM`` line is
            missing or lists fewer samples than a record, a required FORMAT
            key is absent, a QR value is malformed, a sample or query id is
            unknown to ``qFaLen``, or no sample of a record is usable.
    """
    # Local import: only the gzip branch below needs it.
    import subprocess

    if len(traningSet) == 0: raise ValueError('[ERROR] No Training Data found')

    gzipProc = None
    if vcfInfile[-3:] == '.gz':
        # Argument list instead of a shell string, so paths containing spaces
        # or shell metacharacters are passed to gzip verbatim.
        gzipProc = subprocess.Popen(['gzip', '-dc', vcfInfile],
                                    stdout=subprocess.PIPE,
                                    universal_newlines=True)
        I = gzipProc.stdout
    else:
        I = open(vcfInfile)

    inbCoeffPattern = re.compile(r';?InbCoeff=([^;]+)')
    data, hInfo = [], VCF.VCFHeader()
    sampleNames = None  # Sample columns of the #CHROM line, in file order
    try:
        for line in I:  # VCF format

            col = line.strip('\n').split()
            if line.startswith('#CHROM'): sampleNames = col[9:]

            # Record the header information
            if line.startswith('#'):
                hInfo.Record(line.strip('\n'))
                continue

            # Blank lines carry no record
            if not col: continue

            # Get inbreeding coefficient. If fail then continue.
            # It's calculated like: 1.0 - hetCount/Expected_hetCount in VCF
            inbCoeff = inbCoeffPattern.search(col[7])
            if not inbCoeff:
                continue
            # Formatting round-trip keeps the value at two decimals
            inbCoeff = float('%.2f' % float(inbCoeff.group(1)))

            fmat = {k: i for i, k in enumerate(col[8].split(':'))}  # Get Format
            # Records without QR are skipped silently (caused by INTERGAP)
            if 'QR' not in fmat: continue

            for tag in ['AA', 'QR', 'NR']:
                if tag not in fmat: raise ValueError('[ERROR] The "Format" fields did not contian "%s" in VCF: %s\nAT: %s\n' % (tag, vcfInfile, line))

            if sampleNames is None:
                raise ValueError('[ERROR] No "#CHROM" header line before the first record in VCF: %s\nAT: %s\n' % (vcfInfile, line))
            if len(col) - 9 > len(sampleNames):
                raise ValueError('[ERROR] More sample columns than sample names in VCF: %s\nAT: %s\n' % (vcfInfile, line))

            aaIdx, qrIdx, nrIdx = fmat['AA'], fmat['QR'], fmat['NR']
            # A sample column must reach all three keys to be usable
            minFields = max(aaIdx, qrIdx, nrIdx) + 1

            isBiallelic = len(col[4].split(',')) <= 1

            annotations = []
            atleastOne  = False
            for i, sample in enumerate(col[9:]):

                sampleId = sampleNames[i]
                if sample == './.': continue
                field = sample.split(':')
                # Truncated sample columns are skipped instead of crashing
                if len(field) < minFields: continue

                aa = field[aaIdx].split(',')
                if len(aa) != 4: continue

                qr = field[qrIdx].split(',')[-1]
                if qr == '.': continue

                atleastOne = True
                # QR looks like "<queryId>-<start>-<end>"; split from the
                # right because the query id itself may contain dashes.
                qParts = qr.rsplit('-', 2)
                if len(qParts) != 3:
                    raise ValueError('[ERROR] Malformed QR value "%s" of sample %s\nAT: %s\n' % (qr, sampleId, line))
                qId  = qParts[0]
                qSta = int(qParts[1])
                qEnd = int(qParts[2])

                if sampleId not in qFaLen          : raise ValueError('[ERROR] The sample name %s(in vcf) is not in the name of Fa list.' % sampleId)
                if      qId not in qFaLen[sampleId]: raise ValueError('[ERROR] %s is not been found in fa file\n' % qId)

                # Rescale to percent of the query length, rounded to nearest.
                # The 100.0 forces true division so "+ 0.5" really rounds,
                # also under Python 2 with integer lengths.
                faLen = qFaLen[sampleId][qId]
                qSta  = int(qSta * 100.0 / faLen + 0.5)
                qEnd  = int(qEnd * 100.0 / faLen + 0.5)
                # NOTE: the original author flagged this clamp as a workaround
                # ("Bug!!! Should delete"). It is kept so coordinates beyond
                # the recorded length do not abort the load; the overflow
                # check that followed it could never fire and was dropped.
                if qSta > 100: qSta = 100
                if qEnd > 100: qEnd = 100

                leg = min(qSta, 100 - qEnd)
                nn  = float(field[nrIdx])
                n   = int(1000 * nn + 0.5) / 10.0 # n ratio range: [0, 100]
                alt = int(aa[1]) # Alternate perfect
                bot = int(aa[3]) # Both imperfect
                annotations.append([isBiallelic, inbCoeff, leg, n, alt, bot])

            if not atleastOne: raise ValueError('[ERROR] All the samples don\'t contain this variant.', col)
            datum                = vd.VariantDatum()
            # Column-wise median over the usable samples
            datum.annotations    = np.median(annotations, axis = 0)
            pos                  = col[0] + ':' + col[1]
            datum.variantOrder   = pos
            if pos in traningSet: datum.atTrainingSite = True
            data.append(datum)
    finally:
        # Release the handle (and reap gzip) even when parsing fails
        I.close()
        if gzipProc is not None:
            gzipProc.wait()

    return hInfo, np.array(data)