def set_1bp_list():
    """Allocate one empty-string slot per base position for every chromosome.

    Reads chromosome lengths from the module-level ``chromLenFile`` (.fai
    index) via genutils.read_chrom_len.

    Returns:
        dict: chromosome name -> list of (length + 1) empty strings.
              Presumably positions are 1-based so index 0 is padding —
              TODO confirm against callers.
    """
    chromLens = genutils.read_chrom_len(chromLenFile)
    counts = {}
    for chromName in chromLens:
        # List multiplication replaces the original per-element append loop.
        counts[chromName] = [""] * (chromLens[chromName] + 1)
    return counts
def set_1bp_list():
    """Build {chromosome: per-base list} with every slot initialised to ''.

    Chromosome lengths come from the module-level ``chromLenFile`` via
    genutils.read_chrom_len; each list has length + 1 entries.
    """
    lengths = genutils.read_chrom_len(chromLenFile)
    return {name: ["" for _ in range(lengths[name] + 1)] for name in lengths}
# Used for NA19240 import argparse import os import signal import glob import pandas as pd import numpy as np import math from NGS_utils import * import pickle import subprocess import genutils chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg19/hg19-EBV-fos-ecoli/hg19.fa.fai' chromLens = genutils.read_chrom_len(chromLenFile) chromOrder = get_chromOrder("human") def create_snp_list(all_var_file): f = open(all_var_file, "r") snp_pos = [] snp_geno = [] snp_phase = [] for line in f: line = line.strip().split("\t") chr = line[0] # if chr != "chr1" and chr!="chr2": # break if chr != CHROM and chromOrder[chr] < chromOrder[CHROM]: continue
# NOTE(review): the next three statements are the tail of a function whose
# `def` lies outside this chunk (presumably get_chromOrder building a
# {chrom: rank} dict from a .fai file) — indentation reconstructed; confirm
# against the full file.
        c = l.split()[0]
        chromOrder[c] = i
    return chromOrder


def get_chromList():
    """Return the chromosome names (first column) from the hg18 .fai index.

    Hard-coded to the hg18 index path; the hg19 path is kept commented out.
    """
    # chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg19/hg19-EBV-fos-ecoli/hg19.fa.fai'
    chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg18/genome-index/hg18.fa.fai'
    chr_list = []
    inFile = open(chromLenFile, 'r')
    for i, l in enumerate(inFile):
        l = l.rstrip()
        # First whitespace-delimited field of a .fai line is the sequence name.
        c = l.split()[0]
        chr_list.append(c)
    return chr_list


# Module-level chromosome lengths, loaded once from the hg19 .fai index.
chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg19/hg19-EBV-fos-ecoli/hg19.fa.fai'
chromLens = genutils.read_chrom_len(chromLenFile)


class BedIterator:
    """Iterate a (possibly gzip-compressed) BED-like file, yielding one
    tuple of whitespace-split string fields per line.

    NOTE(review): `gzip` is not among the visible imports — presumably it is
    pulled in by `from NGS_utils import *`; verify.
    """

    def __init__(self, filename):
        # Crude compression sniff: treat any name ending in 'gz' as gzipped.
        if filename[-2:] == 'gz':
            self.file = gzip.open(filename, "r")
        else:
            self.file = open(filename, "r")

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 protocol method delegates to the Python 2 style next().
        return self.next()

    def next(self):
        # Propagates StopIteration from the underlying file at EOF.
        line = next(self.file)
        fields = line.rstrip().split()
        return tuple(fields)
def find_clone_cov(pool_name, sample_name):
    """Tally fosmid-clone counts and summed clone coverage per genome window.

    pool_name   -- dict whose keys are pool directory names (values unused
                   here; only the keys are iterated)
    sample_name -- sample directory name under the pools results directory

    Returns (counts, coverage): two dicts keyed by chromosome name, each a
    list with one entry per window read from new_genome_window.txt.
    """
    windowFile = '/home/jmkidd/kidd-lab-scratch/shiya-projects/indian_Jacob/script/new_genome_window.txt'
    windowSize = 1000.0  # assume each window is 1000 unmasked bp
    chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg19/hg19-EBV-fos-ecoli/hg19.fa.fai'
    chromLens = genutils.read_chrom_len(chromLenFile)
    coverage = {}  # per-chrom list: summed clone coverage per window
    counts = {}    # per-chrom list: number of clones overlapping each window
    pos1 = {}      # per-chrom list: window start coordinates
    pos2 = {}      # per-chrom list: window end coordinates
    print 'Initializing chrom lists'
    for chromName in chromLens:
        counts[chromName] = []
        coverage[chromName] = []
        pos1[chromName] = []
        pos2[chromName] = []
    # Load window boundaries; skip the non-human helper contigs.
    f = open(windowFile, 'r')
    for line in f:
        line = line.rstrip()
        line = line.split("\t")
        c = line[0]
        if c == "EBV" or c == "eColiK12" or c == "pCC1FOS":
            continue
        b = int(line[1])
        e = int(line[2])
        counts[c].append(0)
        coverage[c].append(0)
        pos1[c].append(b)
        pos2[c].append(e)
    outDirBase = '/home/jmkidd/kidd-lab/jmkidd-projects/additional-fosmid-pools/results/pools/%s/' % (sample_name)
    poolNames = pool_name.keys()
    poolNames.sort()
    # Selection thresholds baked into the input file names (.sel.<len>.<dp>).
    minLen = 10000
    minDP = 0.25
    for pool in poolNames:
        baseDir = outDirBase + pool + '/'
        cloneFileName = baseDir + pool + '.markdup.clone'
        inFileName = cloneFileName + '.sel.%i.%.2f' % (minLen, minDP)
        f = open(inFileName, 'r')
        for line in f:
            line = line.rstrip()
            line = line.split()
            if line[0] == 'track':
                continue
            chr = line[0]
            start = int(line[1])
            end = int(line[2])
            cov = float(line[3])
            size = int(line[4]) / 1000
            # Locate first/last window overlapped by this clone.
            # NOTE(review): find_pos comes from `from NGS_utils import *`;
            # assumes chr exists in the window dicts, else KeyError.
            find1 = find_pos(start - 1, pos1[chr])
            find2 = find_pos(end, pos2[chr])
            if find1 >= 0:
                for j in range(find1, find2 + 1):
                    if j < len(counts[chr]):
                        counts[chr][j] += 1
                        coverage[chr][j] += cov
                    else:
                        print line, find1, find2, len(counts[chr])
            else:
                print 'not found', chr, start
    '''
    f=open(windowFile,'r')
    i = 0
    outFile = open("wgs_clone_coverage.txt","w")
    old_c="chr1"
    for line in f:
        line = line.rstrip()
        line = line.split("\t")
        c = line[0]
        if c!=old_c:
            i = 0
            old_c = c
        if c=="EBV" or c=="eColiK12" or c=="pCC1FOS":
            continue
        b = int(line[1]) + 1 #make them all 1 based
        e = int(line[2])
        outFile.write('%s\t%i\t%i\t%f\t%i\n' % (c,b,e,coverage[c][i],counts[c][i]))
        i +=1
    dbfile=open('NA19240_wgs_clone_coverage_pickle','wb')
    pickle.dump(counts,dbfile)
    pickle.dump(coverage,dbfile)
    '''
    return counts, coverage
def find_clone_cov(pool_name,sample_name): windowFile = '/home/jmkidd/kidd-lab-scratch/shiya-projects/indian_Jacob/script/new_genome_window.txt' windowSize = 1000.0 # assume each window is 1000 unmasked bp chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg19/hg19-EBV-fos-ecoli/hg19.fa.fai' chromLens = genutils.read_chrom_len(chromLenFile) coverage = {} counts = {} pos1 = {} pos2 = {} print 'Initializing chrom lists' for chromName in chromLens: counts[chromName] = [] coverage[chromName] = [] pos1[chromName]=[] pos2[chromName]=[] f=open(windowFile,'r') for line in f: line = line.rstrip() line = line.split("\t") c = line[0] if c=="EBV" or c=="eColiK12" or c=="pCC1FOS": continue b = int(line[1]) e = int(line[2]) counts[c].append(0) coverage[c].append(0) pos1[c].append(b) pos2[c].append(e) outDirBase= '/home/jmkidd/kidd-lab/jmkidd-projects/additional-fosmid-pools/results/pools/%s/' %(sample_name) poolNames = pool_name.keys() poolNames.sort() minLen = 10000 minDP = 0.25 for pool in poolNames: baseDir = outDirBase+pool + '/' cloneFileName = baseDir + pool + '.markdup.clone' inFileName = cloneFileName + '.sel.%i.%.2f' % (minLen,minDP) f = open(inFileName,'r') for line in f: line = line.rstrip() line = line.split() if line[0] == 'track': continue chr = line[0] start = int(line[1]) end = int(line[2]) cov = float(line[3]) size = int(line[4])/1000 find1 = find_pos(start-1,pos1[chr]) find2 = find_pos(end,pos2[chr]) if find1>=0: for j in range(find1,find2+1): if j < len(counts[chr]): counts[chr][j]+=1 coverage[chr][j]+=cov else: print line,find1,find2,len(counts[chr]) else: print 'not found',chr,start ''' f=open(windowFile,'r') i = 0 outFile = open("wgs_clone_coverage.txt","w") old_c="chr1" for line in f: line = line.rstrip() line = line.split("\t") c = line[0] if c!=old_c: i = 0 old_c = c if c=="EBV" or c=="eColiK12" or c=="pCC1FOS": continue b = int(line[1]) + 1 #make them all 1 based e = int(line[2]) outFile.write('%s\t%i\t%i\t%f\t%i\n' % (c,b,e,coverage[c][i],counts[c][i])) i 
+=1 dbfile=open('NA19240_wgs_clone_coverage_pickle','wb') pickle.dump(counts,dbfile) pickle.dump(coverage,dbfile) ''' return counts,coverage