def _SampleAnnotation(sample, sampleId, fmat, qFaLen):
    """Extract the per-sample annotation values of one variant record.

    Args:
        sample: Raw sample column of the VCF line (colon separated).
        sampleId: Sample name taken from the ``#CHROM`` header line.
        fmat: Mapping from FORMAT tag to its index inside ``sample``;
            must contain 'AA', 'QR' and 'NR'.
        qFaLen: Nested mapping ``qFaLen[sampleId][queryId]`` giving the
            length used to turn query coordinates into percentages.

    Returns:
        ``[leg, n, alt, bot]`` for a usable sample, or ``None`` when the
        sample has to be skipped (missing call, truncated fields, 'AA'
        without exactly four values, or 'QR' equal to '.').

    Raises:
        ValueError: If the sample or its query id is unknown to ``qFaLen``.
    """
    if sample == './.':
        return None

    field = sample.split(':')
    # A sample column may be shorter than FORMAT; skip it instead of
    # failing with an IndexError on the lookups below.
    if len(field) <= max(fmat['AA'], fmat['QR'], fmat['NR']):
        return None

    alleleCounts = field[fmat['AA']].split(',')
    if len(alleleCounts) != 4:
        return None

    # Only the last comma separated query region is used.
    qr = field[fmat['QR']].split(',')[-1]
    if qr == '.':
        return None

    # The region is parsed as '<id>-<start>-<end>'; with more than three
    # dash separated parts the id is taken to be the first two parts.
    qregion = qr.split('-')
    if len(qregion) > 3:
        qId = qregion[0] + '-' + qregion[1]
    else:
        qId = qregion[0]
    qSta = int(qregion[-2])
    qEnd = int(qregion[-1])

    if sampleId not in qFaLen:
        raise ValueError('[ERROR] The sample name %s(in vcf) is not in '
                         'the name of Fa list.' % sampleId)
    if qId not in qFaLen[sampleId]:
        raise ValueError('[ERROR] %s is not been found in fa file\n' % qId)

    # Express both ends as a percentage of the query length.
    qLen = qFaLen[sampleId][qId]
    qSta = int(qSta * 100 / qLen + 0.5)
    qEnd = int(qEnd * 100 / qLen + 0.5)

    # NOTE(review): the original author flagged this clamp as a workaround
    # ("Bug!!! Should delete"). While it is in place a "Query size
    # Overflow" check can never fire, so that unreachable check was removed.
    qSta = min(qSta, 100)
    qEnd = min(qEnd, 100)

    leg = min(qSta, 100 - qEnd)
    nn = float(field[fmat['NR']])
    n = int(1000 * nn + 0.5) / 10.0  # n ratio range: [0, 100]
    alt = int(alleleCounts[1])  # Alternate perfect
    bot = int(alleleCounts[3])  # Both imperfect
    return [leg, n, alt, bot]


def LoadDataSet(vcfInfile, traningSet, qFaLen):
    """Load variant annotations from a VCF file.

    Header lines are recorded into a ``VCF.VCFHeader``. Every data line
    that has an ``InbCoeff=`` entry in INFO and a 'QR' tag in FORMAT is
    turned into one ``vd.VariantDatum`` whose ``annotations`` is the
    per-column median over all usable samples of
    ``[isBiallelic, inbCoeff, leg, n, alt, bot]``.

    Args:
        vcfInfile: Path of the VCF file; a path ending in '.gz' is read
            through ``gzip -dc``.
        traningSet: Container of 'CHROM:POS' strings; a datum whose
            position is in it gets ``atTrainingSite = True``.
        qFaLen: Nested mapping ``qFaLen[sampleId][queryId]`` giving the
            query length.

    Returns:
        Tuple ``(hInfo, data)``: the recorded header object and a numpy
        array holding the datum objects in file order.

    Raises:
        ValueError: If ``traningSet`` is empty, a data line precedes the
            '#CHROM' header, a required FORMAT tag is missing, a sample
            or query id is unknown to ``qFaLen``, or no sample of a
            record yields an annotation.
    """
    if len(traningSet) == 0:
        raise ValueError('[ERROR] No Training Data found')

    inbCoeffPattern = re.compile(r';?InbCoeff=([^;]+)')
    data, hInfo = [], VCF.VCFHeader()
    col2sam = None  # column index -> sample name, filled from '#CHROM'

    if vcfInfile.endswith('.gz'):
        # NOTE(review): the path is interpolated into a shell command;
        # only pass trusted file names here.
        I = os.popen('gzip -dc %s' % vcfInfile)
    else:
        I = open(vcfInfile)

    try:
        for line in I:  # VCF format
            if line.startswith('#'):
                if line.startswith('#CHROM'):
                    col = line.strip('\n').split()
                    col2sam = {i + 9: sam for i, sam in enumerate(col[9:])}
                # Record the header information
                hInfo.Record(line.strip('\n'))
                continue

            col = line.strip('\n').split()
            if not col:
                continue  # blank line

            # Get inbreeding coefficient. If fail then continue.
            # It's calculated like: 1.0 - hetCount/Expected_hetCount in VCF
            inbCoeff = inbCoeffPattern.search(col[7])
            if not inbCoeff:
                continue
            # Keep two decimals only.
            inbCoeff = float('%.2f' % float(inbCoeff.group(1)))

            fmat = {k: i for i, k in enumerate(col[8].split(':'))}  # Get Format
            # Records without 'QR' are skipped rather than rejected; the
            # original author attributes them to INTERGAP and notes this
            # skip might better be removed.
            if 'QR' not in fmat:
                continue
            for tag in ['AA', 'QR', 'NR']:
                if tag not in fmat:
                    raise ValueError('[ERROR] The "Format" fields did not contian "%s" in VCF: %s\nAT: %s\n' % (tag, vcfInfile, line))

            if col2sam is None:
                raise ValueError('[ERROR] Data line found before the '
                                 '"#CHROM" header line in VCF: %s'
                                 % vcfInfile)

            isBiallelic = ',' not in col[4]
            annotations = []
            for i, sample in enumerate(col[9:]):
                sampleAnn = _SampleAnnotation(sample, col2sam[9 + i],
                                              fmat, qFaLen)
                if sampleAnn is not None:
                    annotations.append([isBiallelic, inbCoeff] + sampleAnn)

            if not annotations:
                raise ValueError('[ERROR] All the samples don\'t contain '
                                 'this variant: %s' % line.strip('\n'))

            datum = vd.VariantDatum()
            datum.annotations = np.median(annotations, axis=0)
            pos = col[0] + ':' + col[1]
            datum.variantOrder = pos
            if pos in traningSet:
                datum.atTrainingSite = True
            data.append(datum)
    finally:
        # Always release the file / pipe, also when parsing fails.
        I.close()

    return hInfo, np.array(data)