def get_align_parts(data):
    """Write the left/right contig segments and the genome segment as FASTA files.

    The contig sequence is read from ``data['contigSeqFileName']`` and sliced
    as ``[begin - 1:end]`` for the left and the right alignment range, i.e.
    the ``contig*FragAlignBegin`` / ``contig*FragAlignEnd`` values are used as
    1-based, inclusive coordinates.  The genome segment
    ``chromName:genomeFragAlignBegin-genomeFragAlignEnd`` is extracted from
    ``data['refGenomeFasta']`` with ``samtools faidx``.

    Side effects on ``data``: sets ``leftSeqFileName``, ``rightSeqFileName``
    and ``genomeFragAlignFileName`` (all under ``data['alignOutDir']``).
    Returns nothing.
    """
    outDir = data['alignOutDir']
    data['leftSeqFileName'] = outDir + '/' + 'Contig.genomedir.fa.left'
    data['rightSeqFileName'] = outDir + '/' + 'Contig.genomedir.fa.right'
    data['genomeFragAlignFileName'] = outDir + '/' + 'genomeFrag.align.fa'

    contigSeq = genutils.read_fasta_to_string(data['contigSeqFileName'])

    # (FASTA record name, begin key, end key, key holding the output path)
    segments = (
        ('left', 'contigLeftFragAlignBegin', 'contigLeftFragAlignEnd', 'leftSeqFileName'),
        ('right', 'contigRightFragAlignBegin', 'contigRightFragAlignEnd', 'rightSeqFileName'),
    )
    for label, beginKey, endKey, fileKey in segments:
        segment = contigSeq[data[beginKey] - 1:data[endKey]]
        wrapped = genutils.add_breaks_to_line(segment)
        # The context manager closes (and flushes) the handle even when the
        # write raises, so no descriptor is leaked.
        with open(data[fileKey], 'w') as outFile:
            outFile.write('>%s\n%s\n' % (label, wrapped))

    # the genome part
    region = data['chromName'] + ':' + str(data['genomeFragAlignBegin']) + '-' + str(data['genomeFragAlignEnd'])
    cmd = 'samtools faidx ' + data['refGenomeFasta'] + ' ' + region + ' > ' + data['genomeFragAlignFileName']
    genutils.runCMD(cmd)
def run_align(data):
    """Align the left and right contig segments against the genome segment.

    Runs the external ``stretcher`` program twice: once with
    ``data['leftSeqFileName']`` and once with ``data['rightSeqFileName']``,
    each against ``data['genomeFragAlignFileName']``.  Every command is echoed
    to stdout before it is executed.

    Side effects on ``data``: sets ``leftAlignFileName`` and
    ``rightAlignFileName`` (the sequence file name plus ``.align``).
    """
    # Register both output names first so they are available to the caller
    # even if the first alignment fails.
    data['leftAlignFileName'] = data['leftSeqFileName'] + '.align'
    data['rightAlignFileName'] = data['rightSeqFileName'] + '.align'

    for seqKey, alignKey in (('leftSeqFileName', 'leftAlignFileName'),
                             ('rightSeqFileName', 'rightAlignFileName')):
        cmd = 'stretcher ' + data[seqKey] + ' ' + data['genomeFragAlignFileName'] + ' ' + data[alignKey]
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print(cmd)
        genutils.runCMD(cmd)
def get_genome_frag(data):
    """Extract the genome fragment for the site and keep its sequence in ``data``.

    The region ``chromName:chromFragStart-chromFragEnd`` is pulled from
    ``data['refGenomeFasta']`` with ``samtools faidx`` into
    ``<alignOutDir>/genomeFrag.fa``; the command is echoed to stdout first.

    Side effects on ``data``: sets ``genomeFragFileName`` and
    ``genomeFragSeq`` (the fragment sequence, upper-cased).
    """
    region = data['chromName'] + ':' + str(data['chromFragStart']) + '-' + str(data['chromFragEnd'])

    data['genomeFragFileName'] = data['alignOutDir'] + '/' + 'genomeFrag.fa'

    cmd = 'samtools faidx ' + data['refGenomeFasta'] + ' ' + region + ' > ' + data['genomeFragFileName']
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(cmd)
    genutils.runCMD(cmd)

    data['genomeFragSeq'] = genutils.read_fasta_to_string(data['genomeFragFileName']).upper()
def run_rm(data,run=True):
    """Run RepeatMasker on the genome fragment and on the contig sequence.

    For each of ``data['genomeFragFileName']`` and
    ``data['contigSeqFileName']`` the RepeatMasker command is executed only
    when ``run`` is exactly ``True``; the expected ``.out`` file name is
    recorded in ``data`` either way (``genomeFragFileNameRM`` and
    ``contigSeqFileNameRM``).
    """
    # change species here: the command prefix is where a species option goes
    jobs = (
        ('RepeatMasker ', 'genomeFragFileName', 'genomeFragFileNameRM'),
        ('RepeatMasker  ', 'contigSeqFileName', 'contigSeqFileNameRM'),
    )
    for prefix, fastaKey, maskedKey in jobs:
        fastaPath = data[fastaKey]
        if run is True:
            genutils.runCMD(prefix + fastaPath)
        data[maskedKey] = fastaPath + '.out'
def novelCoordinates(coord): # Stores coordinates of novel contigs within chrNovel.fa
    """Shift novel-contig probe coordinates onto chrNovel and merge all probe BEDs.

    Steps:
      1. Read the chrNovel merge-info file; for every record whose first
         field looks like ``>name``, store
         ``[field 3, field 4, name, int(field 3) - 1]`` in ``coordTable``.
      2. Read the novel-contig probe BED file; for every probe whose contig is
         in ``coordTable`` (except ``zoey-scaffold-686``, which is skipped),
         add the contig offset to its start/end, record it in ``novelTable``
         and write it to ``<bed>.corrected`` with chromosome ``chrNovel``.
      3. ``cat`` all probe BED files under ``options.directory`` into
         ``TOTAL_Probes_FINALCoordinates_all.bed``.

    The ``coord`` argument is not used.  The function relies on the
    module-level names ``data``, ``coordTable``, ``novelTable`` and
    ``options``, none of which is defined in this function.
    """
    infofile = '/home/jmkidd/kidd-lab-scratch/feichens-projects/kmer/canFam31/unique_kmers/canFam3.1-withnovel/chromNovel.merge.info'
    with open(infofile, 'r') as infoFile:
        for line in infoFile:
            fields = line.rstrip().split()
            if not fields:
                # A blank line has no fields to index.
                continue

            match = re.match(r">(\S+)", fields[0])
            if match is not None:
                data['contigID'] = match.group(1)
                data['offset'] = int(fields[3]) - 1
                #                               start[0]   end[1]     chrom[2]          offset[3]
                coordTable[data['contigID']] = [fields[3], fields[4], data['contigID'], data['offset']]

    novelfile = '/home/ampend/kidd-lab/ampend-projects/CGH_Array_Design_Dog/CGH_Array_Analysis/inputData/ProbeBEDFiles/NocanFam3Coords/novelContigs.probesel.pass.bed'
    correctednovel = novelfile + '.corrected'

    # The corrected file is consumed by the ``cat`` command below, so it has to
    # be closed (flushed to disk) before that command runs.
    with open(novelfile, 'r') as novelFile, open(correctednovel, 'w') as correctedNovel:
        for line in novelFile:
            fields = line.rstrip().split()
            if not fields:
                continue

            data['novelChromID'] = fields[0]
            data['probeID'] = fields[3]

            if data['novelChromID'] in coordTable:
                if data['novelChromID'] == 'zoey-scaffold-686':
                    continue
                data['novelChrom'] = 'chrNovel'
                offset = int(coordTable[data['novelChromID']][3])
                data['novelStart'] = offset + int(fields[1])
                data['novelEnd'] = offset + int(fields[2])
                data['novelProbeID'] = data['probeID']

                novelTable[data['novelProbeID']] = (data['novelChrom'], data['novelStart'], data['novelEnd'], data['novelProbeID'])
                correctedNovel.write('%s\t%s\t%s\t%s\n' % (data['novelChrom'], data['novelStart'], data['novelEnd'], data['novelProbeID']))

    cmd = 'cat %s../inputData/ProbeBEDFiles/Mappable/* %s../inputData/ProbeBEDFiles/NocanFam3Coords/ChrY.probesel.pass.bed %s../inputData/ProbeBEDFiles/NocanFam3Coords/LINEs_RefInsertions.probesel.bed.pass.bed.sorted %s../inputData/ProbeBEDFiles/NocanFam3Coords/novelContigs.probesel.pass.bed.corrected %s../inputData/ProbeBEDFiles/NocanFam3Coords/SINEs_ReferenceInsertions.probesel.pass.bed.sorted > %s../inputData/ProbeBEDFiles/TOTAL_Probes_FINALCoordinates_all.bed' % ((options.directory,) * 6)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(cmd)
    genutils.runCMD(cmd)
Beispiel #6
0
def run_pear(myData):
    """Merge overlapping read pairs with PEAR and gzip its four FASTQ outputs.

    Reads ``outDir``, ``sampleName``, ``r1fq`` and ``r2fq`` from ``myData``.
    The output prefix is ``outDir + sampleName + '.pear'`` (no path separator
    is inserted, so ``outDir`` is used exactly as given).

    If ``<prefix>.assembled.fastq.gz`` already exists nothing is executed;
    otherwise ``pear`` is run and each of its four outputs is gzipped.  Every
    executed command is echoed to stdout first.

    Side effects on ``myData``: sets ``pearBase`` and the four output names
    ``assembledFQ``, ``discardedFQ``, ``notAssemF`` and ``notAssemR``, each
    ending in ``.gz`` on return.
    """
    # PEAR aligns/merges overlapping read pairs, which is the case that we have here
    myData['pearBase'] = myData['outDir'] + myData['sampleName'] + '.pear'

    cmd = 'pear --nbase -f %s -r %s -o %s' % (myData['r1fq'],myData['r2fq'],myData['pearBase'])

    # (key in myData, suffix of the file PEAR writes), in processing order
    outputs = (
        ('assembledFQ', '.assembled.fastq'),
        ('discardedFQ', '.discarded.fastq'),
        ('notAssemF', '.unassembled.forward.fastq'),
        ('notAssemR', '.unassembled.reverse.fastq'),
    )
    for key, suffix in outputs:
        myData[key] = myData['pearBase'] + suffix

    # check to see if should run
    if os.path.isfile(myData['assembledFQ'] + '.gz'):
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print('found gzip output already, will not rerun')
        for key, _ in outputs:
            myData[key] += '.gz'
    else:
        print(cmd)
        genutils.runCMD(cmd)
        for key, _ in outputs:
            cmd = 'gzip ' + myData[key]
            print(cmd)
            genutils.runCMD(cmd)
            myData[key] += '.gz'
    # NOTE(review): the statements below use ``continue`` and so read like the
    # body of a per-record loop, but no loop header is visible here -- confirm
    # where this fragment belongs.
    # Split the record into whitespace-separated fields; field 0 is the site ID.
    line = line.split()
    siteID = line[0]
    # Skip the header row.
    if siteID == 'siteID':
        continue
    # Skip records whose second field is the NO_CANDIDATE marker.
    if line[1] == 'NO_CANDIDATE':
        continue

    print line
    
    # Per-site working dictionary; created fresh for each record.
    data = {}
    data['tmpDir'] = options.tmpDir
    # Create the temporary directory on first use.
    if os.path.isdir(data['tmpDir']) is False:
        cmd = 'mkdir ' + data['tmpDir']
        print 'making tmp dir'
        print cmd
        genutils.runCMD(cmd)
    
    data['refGenomeFasta'] = refGenomeFasta
    
    data['siteID'] = siteID

    # The site ID is parsed as '<chrom>_<position>'; the chromosome name may
    # itself contain underscores, so only the last field is split off.
    chrom = siteID.split('_')
    chrom = chrom[0:-1]
    chrom = '_'.join(chrom)

    # Fragment window: regDelta bases on either side of the site position.
    p = int(siteID.split('_')[-1])
    startBp = p - regDelta
    endBp = p + regDelta
    data['chromName'] = chrom
    data['chromFragStart'] =startBp
    data['chromFragEnd'] = endBp
def print_pretty_alignment(data):
    """Write the three-way alignment as colour-annotated text, then as PS and PDF.

    A text file ``<genomeFragFileName>.3wayalign.pretty`` is written with a
    short header (site ID, contig name/direction, genome range, breakpoint
    coordinates) followed by the alignment in rows of 70 columns, four lines
    per row: left contig segment, genome, right contig segment and the
    ``3wayParse`` line.  The text is converted with ``enscript`` (invoked with
    ``-e~`` so the ``~color{...}`` markers are processed) and ``ps2pdf``.

    Colouring: the column holding the end of the left match is wrapped in the
    blue marker and the column holding the start of the right match in the
    red marker, both in the contig lines / parse line and in the genome line.
    Gap columns (``'-'``) are never coloured.

    Side effects on ``data``: sets ``3wayAlignFilePrettyName``,
    ``3wayAlignFilePrettyNamePS`` and ``3wayAlignFilePrettyNamePDF``, and
    modifies the lists ``left3way``, ``right3way``, ``genome3way`` and
    ``3wayParse`` in place by inserting the colour markers.  Progress
    messages are printed to stdout.
    """
    #left end is blue, right start is red
    blue = '~color{0 0 1}'
    red = '~color{1 0 0}'
    reset = '~color{default}'

    data['3wayAlignFilePrettyName'] = data['genomeFragFileName'] + '.3wayalign.pretty'
    data['3wayAlignFilePrettyNamePS'] = data['alignOutDir'] + '/' + data['siteID'] + '.3wayalign.pretty.ps'
    data['3wayAlignFilePrettyNamePDF'] = data['alignOutDir'] + '/' + data['siteID'] + '.3wayalign.pretty.pdf'

    # The context manager closes the output even if one of the lookups or
    # index operations below raises.
    with open(data['3wayAlignFilePrettyName'],'w') as outFile:
        outFile.write('Site ID: %s\n' % (data['siteID']))
        outFile.write('%s\t%s\n' % (data['contigName'],data['contigDir']))
        outFile.write('%s:%i-%i\n' % (data['chromName'],data['genomeFragAlignBegin'],data['genomeFragAlignEnd']))
        # left BP in chromFrag and Contig
        outFile.write('~color{0 0 1}end left match~color{default} chromFrag %i Contig %i\n' % (data['leftBpGenomeFragCoords'],data['leftBpContigCoord']))
        # right BP in chromFrag and Contig
        outFile.write('~color{1 0 0}start right match~color{default} chromFrag %i Contig %i\n' % (data['rightBpGenomeFragCoords'],data['rightBpContigCoord']))

        # Formatting with '%s' reproduces the space-separated output of the
        # Python 2 print statement and is also valid on Python 3.
        print('ready to start')
        print('%s %s' % (data['leftBpContigCoord'],data['leftBpGenomeFragCoords']))
        print('%s %s' % (data['rightBpContigCoord'],data['rightBpGenomeFragCoords']))

        left3way = data['left3way']
        right3way = data['right3way']
        genome3way = data['genome3way']
        parse3way = data['3wayParse']
        left3wayPos = data['left3wayPos']
        right3wayPos = data['right3wayPos']
        genome3wayPos = data['genome3wayPos']

        # Breakpoints expressed relative to the start of each aligned contig
        # segment (1-based), which is what the *3wayPos lists are compared to.
        leftTarget = data['leftBpContigCoord'] - data['contigLeftFragAlignBegin'] + 1
        rightTarget = data['rightBpContigCoord'] - data['contigRightFragAlignBegin'] + 1
        leftGenomeBp = data['leftBpGenomeFragCoords']
        rightGenomeBp = data['rightBpGenomeFragCoords']

        # go through and add in the colors, one column at a time
        for i in range(len(genome3way)):
            if left3wayPos[i] == leftTarget:
                if left3way[i] == '-':
                    print('left is -')
                else:
                    left3way[i] = blue + left3way[i] + reset
                    parse3way[i] = blue + parse3way[i] + reset
                    print('LEFT contig %s %s' % (i,left3wayPos[i]))

            if genome3wayPos[i] == leftGenomeBp and genome3way[i] != '-':
                genome3way[i] = blue + genome3way[i] + reset
                print('LEFT GENOME %s %s' % (i,genome3wayPos[i]))

            if right3wayPos[i] == rightTarget:
                if right3way[i] == '-':
                    print('%s right is -' % (i,))
                else:
                    right3way[i] = red + right3way[i] + reset
                    parse3way[i] = red + parse3way[i] + reset
                    print('%s right contig %s %s' % (i,i,right3wayPos[i]))

            if genome3wayPos[i] == rightGenomeBp and genome3way[i] != '-':
                genome3way[i] = red + genome3way[i] + reset
                print('%s RIGHT GENOME %s %s' % (i,i,genome3wayPos[i]))

        leftName =  'left   '
        rightName = 'right  '
        chromName = 'chrom  '
        passeName = '       '
        outFile.write('\n\n')

        # emit the alignment in rows of `width` columns
        width = 70
        total = len(genome3way)
        for sliceS in range(0, total, width):
            sliceE = min(sliceS + width, total)
            l = leftName + ''.join(left3way[sliceS:sliceE])
            g = chromName + ''.join(genome3way[sliceS:sliceE])
            r = rightName + ''.join(right3way[sliceS:sliceE])
            p = passeName + ''.join(parse3way[sliceS:sliceE])
            outFile.write('%s\n%s\n%s\n%s\n\n' % (l,g,r,p))

    # Remove stale PS/PDF output from an earlier run; failure (e.g. the file
    # does not exist yet) is tolerated, hence runCMDNoFail.
    print('Clean up PS and PDF')
    cmd = 'rm ' + data['3wayAlignFilePrettyNamePS']
    print(cmd)
    genutils.runCMDNoFail(cmd)
    cmd = 'rm ' + data['3wayAlignFilePrettyNamePDF']
    print(cmd)
    genutils.runCMDNoFail(cmd)

    cmd = 'enscript %s -o %s -e~ -B -2r' % (data['3wayAlignFilePrettyName'],data['3wayAlignFilePrettyNamePS'])
    print(cmd)
    genutils.runCMD(cmd)

    cmd = 'ps2pdf ' + data['3wayAlignFilePrettyNamePS'] + ' ' + data['3wayAlignFilePrettyNamePDF']
    print(cmd)
    genutils.runCMD(cmd)
def run_miropeats(data):
    """Run miropeats on the genome fragment versus the contig and collect its output.

    The two FASTA files are first copied to short names inside a temporary
    directory (``data['tmpDir']`` when present, otherwise a hard-coded
    default), miropeats is run on the copies, its PostScript and text outputs
    are copied to ``data['alignOutDir']`` and the temporary files are removed.
    Every command is echoed to stdout before it is executed.

    The value passed to ``miropeats -s`` is ``data['miropeatSValue']`` when
    set; otherwise 40 is used and stored back into ``data``.

    Side effects on ``data``: sets ``miropeatSValue`` (if absent),
    ``miroOutInfo`` and ``miroOutPS``.  ``miroOutPS`` is set to the string
    ``'FAILURE'`` when miropeats produced no PostScript file.
    """
    # Default -s value is 40 ("for the dogs"); an earlier default of 80 was
    # assigned and immediately overwritten, so it has been dropped.
    s = data.setdefault('miropeatSValue', 40)

    data['miroOutPS'] = data['alignOutDir'] + '/' + 'miropeats.' + str(s) + '.ps'
    data['miroOutInfo'] = data['alignOutDir'] + '/' + 'miropeats.' + str(s) + '.out'

    if 'tmpDir' in data:
        tmpDir = data['tmpDir']
    else:
        tmpDir = '/home/jmkidd/kidd-lab-scratch/jmkidd-projects/tmp/'

    # os.path.join yields the same paths as plain concatenation when tmpDir
    # ends with '/', and still places the files inside tmpDir when it does not.
    tmpGenome = os.path.join(tmpDir, 'genome.fa')
    tmpContig = os.path.join(tmpDir, 'contig.fa')
    tmpMRPS = os.path.join(tmpDir, 'tmp.MRPS')
    tmpMROUT = os.path.join(tmpDir, 'tmp.MROUT')

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    cmd = 'cp %s %s' % (data['genomeFragFileName'],tmpGenome)
    print(cmd)
    genutils.runCMD(cmd)

    cmd = 'cp %s %s' % (data['contigSeqFileName'],tmpContig)
    print(cmd)
    genutils.runCMD(cmd)

    cmd = 'miropeats -s %i -onlyinter -o %s  -seq %s  -seq %s > %s'  % (s,tmpMRPS,tmpGenome,tmpContig,tmpMROUT)
    print(cmd)
    genutils.runCMD(cmd)

    # The PostScript file is only copied when it exists; otherwise the
    # failure is recorded in data instead of raising.
    if os.path.isfile(tmpMRPS):
        cmd = 'cp %s %s' % (tmpMRPS,data['miroOutPS'])
        print(cmd)
        genutils.runCMD(cmd)
    else:
        data['miroOutPS'] = 'FAILURE'

    cmd = 'cp %s %s' % (tmpMROUT,data['miroOutInfo'])
    print(cmd)
    genutils.runCMD(cmd)

    # clean up
    cmd = 'rm %s %s' % (tmpGenome,tmpContig)
    print(cmd)
    genutils.runCMD(cmd)

    for leftover in (tmpMRPS, tmpMROUT):
        if os.path.isfile(leftover):
            cmd = 'rm ' + leftover
            print(cmd)
            genutils.runCMD(cmd)
def get_genome_gaps(data,run=True):
    """Record, and optionally produce, the gaps file for the genome fragment.

    Sets ``data['genomeFragGapsFileName']`` to
    ``data['genomeFragFileName'] + '.gaps'``.  ``get_gaps.pl`` is executed
    with its output redirected to that file only when ``run`` is exactly
    ``True``.
    """
    fastaPath = data['genomeFragFileName']
    gapsPath = fastaPath + '.gaps'
    data['genomeFragGapsFileName'] = gapsPath
    if run is True:
        genutils.runCMD(' '.join(['get_gaps.pl', fastaPath, '>', gapsPath]))
def get_contig_gaps(data,run=True):
    """Record, and optionally produce, the gaps files for the contig and the genome.

    Sets ``data['contigSeqGapsFileName']`` to
    ``data['contigSeqFileName'] + '.gaps'`` and executes ``get_gaps.pl`` for
    the contig only when ``run`` is exactly ``True``.  The genome fragment is
    then handled by ``get_genome_gaps`` with the same ``run`` flag.
    """
    contigFasta = data['contigSeqFileName']
    contigGaps = contigFasta + '.gaps'
    data['contigSeqGapsFileName'] = contigGaps
    if run is True:
        genutils.runCMD(' '.join(['get_gaps.pl', contigFasta, '>', contigGaps]))
    get_genome_gaps(data,run)
def bwa_index_alleles(data):
    """Index the allele FASTA named by ``data['alleleFa']`` with ``bwa-0.5.9 index``."""
    indexCommand = 'bwa-0.5.9 index %s' % data['alleleFa']
    genutils.runCMD(indexCommand)
def make_alternative_seqs(data,bpOutTable,allelesBaseDir,fragmentExtension):
    """Build the genome and insertion allele sequences for one site and index them.

    A per-site directory ``allelesBaseDir + data['siteID']`` is created if
    needed.  Three reference regions are extracted with ``samtools faidx``
    (left flank, right flank, and the whole span), and ``alleles.fa`` is
    written with two records:

    * ``<siteID>_genome``    -- the whole reference span;
    * ``<siteID>_insertion`` -- left flank + contig slice + right flank.

    How the flanks and the contig slice are chosen depends on
    ``gTSDl = leftBpChromCoords - rightBpChromCoords + 1``:

    * ``gTSDl <= -1`` (called a deletion in the chromosome) and
      ``gTSDl == 0`` (no TSD): the left flank ends at the left breakpoint,
      the right flank starts at the right breakpoint, and the contig slice
      runs between the two contig breakpoints.
    * ``gTSDl >= 1`` (has a TSD; the breakpoints cross): the left flank ends
      at the *right* breakpoint, the right flank starts at the *left* one,
      and the contig slice is widened by ``gTSDl - 1`` on each side.

    Parameters:
        data: per-site dictionary; must provide ``siteID``, ``chromName``,
            ``refGenomeFasta``, ``leftBpChromCoords``, ``rightBpChromCoords``,
            ``leftBpContigCoord``, ``rightBpContigCoord`` and
            ``contigSeqGenomeDir``.
        bpOutTable: writable file-like object; one tab-separated line
            (siteID, chromName, insSite, gTSDl, mapFragStart, mapFragEnd) is
            appended.
        allelesBaseDir: prefix for the per-site directory; used as given, no
            path separator is inserted.
        fragmentExtension: length of each flank, breakpoint base included.

    Side effects on ``data``: sets ``alleleFa``, ``alleleDir``, ``gTSDl``,
    ``insSite``, ``mapFragStart`` and ``mapFragEnd``.  Finishes by calling
    ``bwa_index_alleles(data)``.  Calls ``sys.exit()`` if ``gTSDl`` matches
    none of the three cases (not reachable for integer coordinates).
    """
    def fetch_upper(region, outFa):
        # Extract `region` from the reference into `outFa`; return it upper-cased.
        cmd = 'samtools faidx ' + data['refGenomeFasta'] + ' ' + region + ' > ' + outFa
        genutils.runCMD(cmd)
        return genutils.read_fasta_to_string(outFa).upper()

    alleleDir = allelesBaseDir + data['siteID']
    if os.path.isdir(alleleDir) is False:
        cmd = 'mkdir ' + alleleDir
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print(cmd)
        genutils.runCMD(cmd)

    alleleDir += '/'
    genomeLeftFa = alleleDir + 'genomeLeft.fa'
    genomeRightFa = alleleDir + 'genomeRight.fa'
    genomeWholeFa = alleleDir + 'genomeWhole.fa'
    alleleFa = alleleDir + 'alleles.fa'
    data['alleleFa'] = alleleFa
    data['alleleDir'] = alleleDir

    gTSDl = data['leftBpChromCoords'] - data['rightBpChromCoords'] + 1
    data['gTSDl'] = gTSDl

    # The three cases differ only in which breakpoint bounds which flank and
    # in the contig slice limits; everything after this selection is shared.
    if gTSDl <= -1:  # deletion in chromosome
        print('deletion of %i in genome' % gTSDl)
        leftChromBp = data['leftBpChromCoords']
        rightChromBp = data['rightBpChromCoords']
        contigStart = data['leftBpContigCoord'] # already last bp
        contigEnd = data['rightBpContigCoord']
    elif gTSDl >= 1:  # has TSD
        # note that they cross
        leftChromBp = data['rightBpChromCoords']
        rightChromBp = data['leftBpChromCoords']
        # to get over to the right size, include the TSD on both sides
        contigStart = data['leftBpContigCoord'] - gTSDl + 1
        contigEnd = data['rightBpContigCoord'] + gTSDl - 1
    elif gTSDl == 0:  # has no TSD
        leftChromBp = data['leftBpChromCoords']
        rightChromBp = data['rightBpChromCoords']
        contigStart = data['leftBpContigCoord']
        contigEnd = data['rightBpContigCoord']
    else:
        print('What TSD size?')
        print(gTSDl)
        sys.exit()

    leftChromStart = leftChromBp - fragmentExtension + 1
    rightChromEnd = rightChromBp + fragmentExtension - 1
    data['insSite'] = leftChromBp

    chrom = data['chromName']
    genomeLeftSeq = fetch_upper(chrom + ':' + str(leftChromStart) + '-' + str(leftChromBp), genomeLeftFa)
    genomeRightSeq = fetch_upper(chrom + ':' + str(rightChromBp) + '-' + str(rightChromEnd), genomeRightFa)
    # get the chrom sequence
    genomeWholeSeq = fetch_upper(chrom + ':' + str(leftChromStart) + '-' + str(rightChromEnd), genomeWholeFa)

    data['mapFragStart'] = leftChromStart
    data['mapFragEnd'] = rightChromEnd

    # since that BP is in contig: the slice starts just after contigStart and
    # stops just before contigEnd when those are read as 1-based positions
    contigSeq = data['contigSeqGenomeDir'][contigStart:contigEnd-1]

    # print out genome and insertion alleles; the context manager closes the
    # file even if a write fails, and before bwa indexes it below
    with open(alleleFa,'w') as outFile:
        outFile.write('>%s\n' % (data['siteID']+'_genome'))
        outFile.write('%s\n' % genutils.add_breaks_to_line(genomeWholeSeq))
        outFile.write('>%s\n' % (data['siteID']+'_insertion'))
        iSeq = genomeLeftSeq + contigSeq + genomeRightSeq
        outFile.write('%s\n' % genutils.add_breaks_to_line(iSeq))

    # make out file
    nl = [data['siteID'],data['chromName'],data['insSite'],gTSDl,data['mapFragStart'],data['mapFragEnd'] ]
    bpOutTable.write('\t'.join([str(field) for field in nl]) + '\n')
    bwa_index_alleles(data)
Beispiel #14
0
######## NOVEL CONTIGS ##########
#################################
# Aligns the novel-contig FASTA against the masked sequence with BLAT, then
# starts reading the hits.
# NOTE(review): this section relies on options, gap_pos and genutils, which are
# defined outside the visible part of the file.

print 'Now annotating novel contigs in fosmid...\n'

#novelContigFasta = '~/kidd-lab/ampend-projects/Novel_Sequence_Analysis/NovelSequence/novel.v2.fa.masked'
# New non-redundant FASTA
novelContigFasta = '~/kidd-lab/ampend-projects/Novel_Sequence_Analysis/RedundantNovelContigs/Final_chrNovel_Fasta/novelContigs_NonRedundant.fa.masked'
bottomRM = options.bottomRM
# Derive the masked FASTA name from the RepeatMasker .out file name.
masked_bottomRM = bottomRM.replace(".out",".masked")
contigfile = 'BLAT_novelContigs_vs_fosmid.blat'
 
#cmd = 'blat -fine -minMatch=1 -minScore=10 -out=blast9 %s %s %s' % (novelContigFasta,masked_bottomRM,contigfile)
cmd = 'blat -noHead %s %s %s' % (novelContigFasta,masked_bottomRM,contigfile)
print cmd
genutils.runCMD(cmd) 
n = 0
#ypos = bottomLine - 0.25
# Vertical position for the contig annotations: 0.03 below gap_pos.
exon_pos = gap_pos - 0.03
ypos = exon_pos

# Re-open the BLAT output written above (same file name as contigfile).
contigFile = open('BLAT_novelContigs_vs_fosmid.blat','r')

contigList = []

print 'NOVEL CONTIG start ypos', ypos 

# One whitespace-separated BLAT record per line.
for b in contigFile:
	b = b.rstrip()
	b = b.split()
	# NOTE(review): the body of this check is not visible here; the test is true
	# for lines whose first field is not purely digits.
	if b[0].isdigit() is False:
def run_repeatmasker(fastaFile):
    """Run RepeatMasker (``--species human``) on ``fastaFile`` if that file exists.

    Nothing is executed, and no error is raised, when ``fastaFile`` is not an
    existing regular file.
    """
    # change the --species value here to use other species or libraries,
    # e.g. 'RepeatMasker --species dog %s '
    maskCommand = 'RepeatMasker --species human %s ' % (fastaFile)
    if os.path.isfile(fastaFile):
        genutils.runCMD(maskCommand)