def read_rep():
    """Return the repeat positions of every scaffold, caching the result.

    If the cache file ``rep.dict.pkl`` is reported present by
    ``cmn.filexist``, it is loaded with ``cmn.pickle_read`` and returned
    unchanged.  Otherwise every file listed by
    ``ls annotation_repeats/*.gff3`` is parsed: column 1 is used as the
    scaffold name and columns 4 and 5 as the integer start/end of a repeat.
    The resulting dictionary is written to the cache file before it is
    returned.

    Returns:
        dict: scaffold name -> set of integer positions ``range(start, end)``
        accumulated over all records of that scaffold.
    """
    dn = 'rep.dict.pkl'
    if cmn.filexist(dn):
        print('loading repeats using precomputed data...')
        return cmn.pickle_read(dn)

    freps = cmn.cmd2lines('ls annotation_repeats/*.gff3')
    repdict = {}
    for frep in freps:
        # Read the file named by the loop variable; the previous code read
        # an unrelated name (``fn``) and so never parsed the listed files.
        for line in cmn.file2lines(frep):
            items = line.strip().split()
            # Blank lines and '#' directive/comment lines carry no record.
            if not items or items[0].startswith('#'):
                continue
            scaf = items[0]
            i, j = map(int, items[3:5])
            # NOTE(review): GFF3 end coordinates are inclusive, so
            # range(i, j) leaves position j out.  Kept as-is because
            # existing caches and downstream code may rely on it - confirm.
            # In-place update avoids rebuilding the whole set per record.
            repdict.setdefault(scaf, set()).update(range(i, j))
    cmn.pickle_write(repdict, dn)
    return repdict
return [char1, char1] else: char1, char2 = flist[-2:] print(count_dict, char1, char2, count_dict, cutoff, 'twochars') return [char1, char2] #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ try: fblast, fread = sys.argv[1:3] except: print('*.py blastExon.dict.pkl readSp.dict.pkl', file=sys.stderr) sys.exit() blast_dict = cmn.pickle_read(fblast) #reads[sp][name] = seq read_dict = cmn.pickle_read(fread) splist = set(read_dict.keys()) #read in ranges for exons #{'COX1': readID: {2:(1,2,3)}} #TODO: fill in missing stack_dict = {} exon_lengths = {} for exon in blast_dict: info = blast_dict[exon] stack_dict[exon] = {}
sys.exit() cwd = os.getcwd() #subsetIDs = set(cmn.getid(fsubset)) subsetJobs = set([ cmn.lastName(line.replace('sbatch', '').strip())[4:-4] for line in cmn.file2lines(fsubset) ]) #1. read in info fsams = cmn.cmd2lines('ls %s/*/*/*.sam' % mapdir) #print fsams samdirs = set(['/'.join(fsam.split('/')[:-2]) for fsam in fsams]) #print samdirs require_refs = cmn.pickle_read(freq) fq_dict = {} refdict = {} #1. tell by reftable #make the requirement by the reftable for line in cmn.file2lines(freftable): items = line.strip().split() sp = items[0] fastqs = items[1].split(',') fq_dict[sp] = fastqs # check for reference #2. tell by best mapping for samdir in samdirs: sp = cmn.lastName(samdir)
sys.path.append(python_lib)
import cmn
import ete3

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    # Rename the nodes of a tree using a pickled name mapping.
    #
    # argv[1]: tree input handed to ete3.Tree
    # argv[2]: pickle holding a mapping from current node name to new name
    #
    # The renamed tree is written to '<cmn.lastName(argv[1])>.mapnamed'.
    #options=parse_options()
    try:
        fn = sys.argv[1]
        fname = sys.argv[2]
    except IndexError:
        # Only a missing argument is expected here; anything else should
        # surface instead of being reported as a usage problem.
        print("Usage: *.py tree_file name_dict.pkl", file=sys.stderr)
        # Non-zero status so calling shells can detect the usage error.
        sys.exit(1)

    nameDict = cmn.pickle_read(fname)

    t = ete3.Tree(fn)
    # Iterating an ete3 tree visits its leaves; a name missing from
    # nameDict raises KeyError rather than being silently kept.
    for node in t:
        node.name = nameDict[node.name]

    dn = cmn.lastName(fn) + '.mapnamed'
    cmn.write_file(t.write(), dn)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py 1708_mapped.pileup", file=sys.stderr) sys.exit() new = [] try: coding = cmn.pickle_read('coding.indexes.pkl') except: print('do not find index file for coding region') print('would not label coding positions') coding = set([]) with open(fn) as fp: for line in fp: try: scaffold, index, ref_base, count, read_stack, qual_stack = line.strip( ).split() except: #coverage is 0 items = line.strip().split() index, ref_base, count = items[1:4] print(index, ref_base, count, '0 lowCoverage(0)')
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fns = sys.argv[1:] except: print("Usage: *.py read_filelist", file=sys.stderr) sys.exit() #read in data fdict = 'blastBySp.dict.pkl' sp_dict = cmn.pickle_read(fdict) #get the read ID, and the exon of it good_IDs = {} for sp in sp_dict: lines = sp_dict[sp] for line in lines: readID = line.split()[2] print('readID', readID) good_IDs[readID] = sp #get the reads and split them into exons #fns = cmn.getid(fn) rdict = {} for fn in fns: