def seqblock2alignment(aligned, name, seq):
    """Lay the query sequence out in subject coordinates.

    Parameters:
        aligned: iterable of ``(i, j)`` pairs, where ``i`` indexes into
            ``seq`` and ``j`` is the matching subject position. Pairs whose
            ``j`` is None are ignored.
        name: identifier appended to the file 'hasDeletion' when the span of
            query indices does not equal the number of mapped positions.
        seq: the query sequence.

    Returns:
        A string with one character for every subject position in
        ``range(right)``, where ``right`` is the largest mapped position
        after extension: the query character when the position is mapped,
        'N' when it is mapped to None, and '-' when it is not mapped.
        Query characters outside the aligned region are lower-cased.
    """
    # subject position -> query index (a later pair wins on duplicates)
    subj2query = {s_pos: q_idx for q_idx, s_pos in aligned if s_pos is not None}

    s_hi = max(subj2query)
    s_lo = min(subj2query)

    # A contiguous block has as many mapped positions as its query span.
    q_indices = subj2query.values()
    q_first = min(q_indices)
    q_last = max(q_indices)
    if q_last - q_first + 1 != len(subj2query):
        print('detect deletion!')
        cmn.append_file(name + '\n', 'hasDeletion')

    bases = list(seq)

    # Extend the mapping leftwards, one position per unaligned query
    # character, lower-casing those characters on the way.
    q_idx = subj2query[s_lo]
    while q_idx != 0:
        q_idx -= 1
        s_lo -= 1
        bases[q_idx] = bases[q_idx].lower()
        subj2query[s_lo] = q_idx

    # Same on the right, up to the last character of the query.
    q_idx = subj2query[s_hi]
    q_end = len(bases) - 1
    while q_idx < q_end:
        q_idx += 1
        s_hi += 1
        bases[q_idx] = bases[q_idx].lower()
        subj2query[s_hi] = q_idx

    # NOTE(review): range(s_hi) stops before s_hi itself, so the last mapped
    # position is never emitted, and positions below 0 created by the left
    # extension are dropped too -- confirm this is intended.
    columns = []
    for s_pos in range(s_hi):
        if s_pos not in subj2query:
            columns.append('-')
            continue
        q_idx = subj2query[s_pos]
        columns.append('N' if q_idx is None else bases[q_idx])
    return ''.join(columns)
def parse_inserted_gap(ID, seq, label):
    """Try to repair ``seq`` with BLAST when an insertion was recorded for ``ID``.

    Nothing happens unless the file 'sampleRun_<ID>/bait_insertion' exists.
    Otherwise the sequence (with '-' turned into 'N' and the flanking 'N's
    stripped) is written to 'tmpInput.fa' and searched with blastn; the
    tabular hits are read back from 'tmpBr_<label>.txt'. The first hit that
    matches one of the accepted coordinate patterns, and whose query string
    has the expected length once the columns facing a subject gap are
    removed, replaces ``seq[qstart - 1:qend]``.

    Parameters:
        ID: sample identifier, used in the path that is checked and in
            messages.
        seq: the sequence to repair.
        label: used in the BLAST output file name and in the failure log.

    Returns:
        The repaired sequence, or ``seq`` unchanged when there is no
        insertion file or no acceptable hit. In the latter case
        '<ID>\\t<label>' is appended to 'cannot_fixed_indel.txt'.
    """
    if not cmn.filexist('sampleRun_%s/bait_insertion' % ID):
        return seq

    print('runing blast to fix %s' % ID)
    trimmed = seq.replace('-', 'N').strip('N')
    query_path = 'tmpInput.fa'
    cmn.write_file('>input\n%s\n' % trimmed, query_path)

    hits_path = 'tmpBr_%s.txt' % label
    blast_cmd = 'blastn -query %s -db /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa ' % query_path
    blast_cmd += '-task blastn-short -dust no -outfmt \'6 qseqid sseqid qstart qend sstart send evalue pident qseq sseq\' -out %s' % hits_path
    cmn.run(blast_cmd)

    # Accepted hits as (sstart, send, qstart, length after dropping the
    # columns that face a subject gap).
    accepted = (
        (1, 658, 21, 658),
        (2, 655, 22, 654),
    )

    repaired = False
    for hit in cmn.file2lines(hits_path):
        fields = hit.strip().split()
        qstart, qend, sstart, send = [int(value) for value in fields[2:6]]
        for want_sstart, want_send, want_qstart, want_len in accepted:
            if (sstart, send, qstart) != (want_sstart, want_send, want_qstart):
                continue
            qseq, sseq = fields[-2:]
            # Drop query characters aligned against a gap in the subject.
            kept = [q_char for q_char, s_char in zip(qseq, sseq) if s_char != '-']
            if len(kept) == want_len:
                seq = seq[:qstart - 1] + ''.join(kept) + seq[qend:]
                print('solution found for %s' % ID)
                repaired = True
                break
        if repaired:
            break

    if not repaired:
        cmn.append_file('%s\t%s\n' % (ID, label), 'cannot_fixed_indel.txt')
    return seq
def get_query_sequence(seqDict, genus, sp):
    """Choose a reference sequence for ``genus`` / ``sp`` from ``seqDict``.

    Selection order:
      1. A key equal to '<genus>_<sp>' (case-insensitive) is returned at once.
      2. Otherwise keys whose first '_'-separated token equals ``genus``
         (case-insensitive) are kept; when there are none, every key is kept
         and 'noGenus' is written to 'pickingLog.txt'.
      3. If more than one candidate remains, those with ``sp`` among their
         '_'-separated tokens are preferred; when there are none, 'noSpecies'
         is appended to 'pickingLog.txt'.
      4. If 'pickingLog.txt' exists at that point, the external
         auto_rebait.py script is run and the candidates are replaced by the
         lines of 'picked_bait.txt'; the log is rewritten to 'pickClosed'.
      5. Names starting with '*' are preferred, then names containing '*'.
      6. Among the remaining names the one with the longest sequence is taken.

    Parameters:
        seqDict: mapping of sequence name -> sequence string.
        genus: genus keyword.
        sp: species keyword.

    Returns:
        ``(fasta, qlen)``: a two-line FASTA record for the chosen entry and
        the length of its sequence with every 'N' removed.
    """
    all_names = list(seqDict.keys())

    # 1. exact, case-insensitive '<genus>_<sp>' match
    wanted = ('%s_%s' % (genus, sp)).upper()
    exact = [n for n in all_names if n.upper() == wanted]
    if exact:
        name = exact[0]
        print('found exact match %s' % name)
        seq = seqDict[name]
        return '>%s\n%s\n' % (name, seq), len(seq.replace('N', ''))

    # 2. same genus (first token of the name)
    genus_key = genus.upper()
    candidates = [n for n in all_names if n.upper().split('_')[0] == genus_key]
    useGenus = len(candidates) > 0

    # Start from a clean log; its presence later triggers automatic picking.
    cmn.run('rm pickingLog.txt 2> /dev/null')
    if not candidates:
        print('can not find barcode for genus keyword "%s"' % genus)
        candidates = all_names
        cmn.write_file('noGenus\n', 'pickingLog.txt')

    # 3. narrow down by species token
    if len(candidates) > 1:
        sp_key = sp.upper()
        by_species = [n for n in candidates if sp_key in n.upper().split('_')]
        if by_species:
            candidates = by_species
        else:
            cmn.append_file('noSpecies\n', 'pickingLog.txt')

    # 4. genus or species was missing: let the external script pick the bait
    if cmn.filexist('pickingLog.txt'):
        print('automatically pick bait by fastq similarity')
        fsp = 'restricted_genus.info'
        if useGenus and (not cmn.filexist(fsp)):
            cmd = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist %s' % genus
        else:
            cmd = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist '
        cmn.run(cmd)
        candidates = cmn.file2lines('picked_bait.txt')
        cmn.write_file('pickClosed\n', 'pickingLog.txt')

    # 5. prefer '*'-marked names: leading '*' first, then '*' anywhere
    starred = [n for n in candidates if n[0] == '*']
    if not starred:
        starred = [n for n in candidates if '*' in n]
    if starred:
        candidates = starred

    # 6. longest sequence wins.
    # NOTE(review): replacing 'N' with '-' does not change the length, so
    # 'N's still count here although qlen below excludes them -- confirm.
    name = max(candidates, key=lambda n: len(seqDict[n].replace('N', '-')))
    seq = seqDict[name]
    fasta = '>%s\n%s\n' % (name, seq)
    qlen = len(seq.replace('N', ''))
    print('pick %s for %s %s' % (name, genus, sp))
    return fasta, qlen
# For every map file, collect the two whitespace-separated fields of each
# line whose index has an entry in codingI, grouped by that entry, and append
# one FASTA record per field column to '<outdir>/<prot>.fa'.
for fmap in maplist:
    sp = cmn.lastName(fmap).split('_')[0]

    # prot -> list of (first field, second field), in file order
    prot_dict = {}
    with open(fmap) as fp:
        for i, line in enumerate(fp):
            try:
                prot = codingI[i]
            except KeyError:
                # this line index has no entry in codingI
                continue
            a, b = line.strip().split()
            prot_dict.setdefault(prot, []).append((a, b))

    for prot, alist in prot_dict.items():
        dn = '%s/%s.fa' % (outdir, prot)
        # Not used in this block; kept because later code is not visible.
        new = []
        for i in range(2):
            # Column i of every pair, concatenated; records are named
            # '<sp>_cp1' and '<sp>_cp2'.
            seq = ''.join(each[i] for each in alist)
            name = '%s_cp%s' % (sp, i + 1)
            fasta = '>%s\n%s\n' % (name, seq)
            cmn.append_file(fasta, dn)

    print('finish processing %s' % fmap)
# NOTE(review): this fragment lost its line breaks and starts inside a loop
# whose header is not visible (`record`, `i`, `j`, `rdict`, `Nbad`, `Ntotal`
# and `outlabel` come from outside), so the code is left verbatim below.
#
# What the visible statements do, in order:
#   * record2name(record) yields (name, iii, subName); record.qual and
#     record.seq are sliced with [i:j].
#   * When iii == 1 the sequence slice goes through reverse_strand() and the
#     quality slice is reversed.
#   * A four-line FASTQ record ('@subName', sequence, '+', quality) is built
#     and stored in rdict[name][iii]; rdict[name] starts as [None, None].
#     Presumably iii is 0 or 1, one slot per mate -- TODO confirm.
#   * The Nbad / Ntotal counters are printed.
#   * '<outlabel>_R1.fq', '<outlabel>_R2.fq' and '<outlabel>_singleton.fq'
#     are removed with `rm` through cmn.run.
#   * For each name in rdict: when neither slot is None, slot 0 is appended
#     to the R1 file and slot 1 to the R2 file; otherwise every non-None
#     slot is appended to the singleton file.
name, iii, subName = record2name(record) q = record.qual[i:j] s = record.seq[i:j] if iii == 1: s = reverse_strand(s) q = q[::-1] fq = '@%s\n%s\n+\n%s\n' % (subName, s, q) print(name, subName, i, j, record.seq, s) if name not in rdict: rdict[name] = [None, None] rdict[name][iii] = fq print('Nbad: %s; Ntotal %s;' % (Nbad, Ntotal)) fq1 = '%s_R1.fq' % outlabel fq2 = '%s_R2.fq' % outlabel fsingle = '%s_singleton.fq' % outlabel for fn in [fq1, fq2, fsingle]: cmn.run('rm %s' % fn) for name in rdict: alist = rdict[name] if alist.count(None) == 0: cmn.append_file(alist[0], fq1) cmn.append_file(alist[1], fq2) else: for each in alist: if each != None: cmn.append_file(each, fsingle)
# NOTE(review): this fragment lost its line breaks; its enclosing scope is
# not visible and it may continue past the last statement shown, so the code
# is left verbatim below. `indel_dict`, `pCoverage`, `bait_names`,
# `bait_dict`, `Counter`, `update_baits` and `group_fq` come from outside.
#
# What the visible statements do, in order:
#   * Any existing 'bait_insertion' file is removed.
#   * Each key of indel_dict unpacks to (leftI, rightI). `cov` is the mean of
#     pCoverage at those two indices and `indel_depth` is the number of
#     entries stored under the key.
#   * When indel_depth exceeds half of cov, the most frequent entry is taken
#     with Counter as `maxChar`, a tab-joined line (key, depth, coverage,
#     maxChar) is appended to 'bait_insertion', and maxChar is appended to
#     bait_dict[name][leftI] for every name in bait_names. (`maxChar` is
#     only assigned in that branch, so the per-bait loop presumably sits
#     inside it -- TODO confirm against the original layout.)
#   * Afterwards, if nothing was flagged a message is printed; otherwise
#     cmn.cpu_check(), update_baits(bait_dict) and
#     group_fq(cmn.file2lines('fqlist')) are called to prepare a re-run.
cmn.run('rm bait_insertion 2> /dev/null') hasInsertion = False for key in indel_dict: print(key, len(indel_dict[key])) leftI, rightI = key cov = (pCoverage[leftI] + pCoverage[rightI]) / 2.0 indel_depth = len(indel_dict[key]) insertion_info = [key, indel_depth, cov] if indel_depth > 0.5 * cov: hasInsertion = True count_dict = Counter(indel_dict[key]) maxChar = max(count_dict, key=lambda x: count_dict[x]) insertion_info.append(maxChar) insertion_info = '\t'.join(map(str, insertion_info)) cmn.append_file(insertion_info + '\n', 'bait_insertion') print('insert between ', insertion_info) for name in bait_names: bait_dict[name][leftI] += maxChar #just undergo one round of adding gap if not hasInsertion: print('No need to re-run bwa because no insertion in query') else: N = cmn.cpu_check() print('re-run bwa due to insertion') frefs = update_baits(bait_dict) fq_groups = group_fq(cmn.file2lines('fqlist'))
def get_query_sequence(seqDict, genus, sp):
    """Choose a reference sequence for ``genus`` / ``sp`` from ``seqDict``.

    A key equal to '<genus>_<sp>' (case-insensitive) is returned at once.
    Otherwise candidates are narrowed step by step: keys whose first
    '_'-separated token equals ``genus``, then keys that have ``sp`` among
    their tokens, then keys marked with '*' (a leading '*' is preferred over
    a '*' elsewhere). The candidate with the longest sequence is returned.

    Side effects: 'pickingLog.txt' is removed first, then receives 'noGenus'
    when no key matches the genus (all keys become candidates) and
    'noSpecies' when several candidates remain but none matches ``sp``.

    Parameters:
        seqDict: mapping of sequence name -> sequence string.
        genus: genus keyword.
        sp: species keyword.

    Returns:
        ``(fasta, qlen)``: a two-line FASTA record for the chosen entry and
        the length of its sequence with every 'N' removed.
    """
    names = list(seqDict.keys())

    # Try to look up the exact match first.
    expected_name = ('%s_%s' % (genus, sp)).upper()
    tmp = [name for name in names if name.upper() == expected_name]
    if tmp:
        name = tmp[0]
        print('found exact match %s' % name)
        seq = seqDict[name]
        fasta = '>%s\n%s\n' % (name, seq)
        qlen = len(seq.replace('N', ''))
        return fasta, qlen

    # Fall back to every entry of the same genus (first token of the name).
    genus_key = genus.upper()
    good_names = [
        name for name in names
        if name.upper().split('_')[0] == genus_key
    ]

    # The log normally does not exist yet; silence rm's complaint about
    # that, as the sibling definition of this function already does.
    cmn.run('rm pickingLog.txt 2> /dev/null')
    if not good_names:
        print('can not find barcode for genus keyword "%s"' % genus)
        good_names = names
        cmn.write_file('noGenus\n', 'pickingLog.txt')

    if len(good_names) > 1:
        # Try to refine by species token.
        sp_key = sp.upper()
        tmp = [
            name for name in good_names
            if sp_key in name.upper().split('_')
        ]
        if tmp:
            good_names = tmp
        else:
            cmn.append_file('noSpecies\n', 'pickingLog.txt')

    # Prefer '*'-marked names. startswith() is used instead of name[0] so an
    # empty key cannot raise IndexError.
    tmp = [name for name in good_names if name.startswith('*')]
    if not tmp:
        tmp = [name for name in good_names if '*' in name]
    if tmp:
        good_names = tmp

    # Take the longest sequence.
    # NOTE(review): replacing 'N' with '-' does not change the length, so
    # 'N's still count here although qlen below excludes them -- confirm.
    name = max(good_names, key=lambda x: len(seqDict[x].replace('N', '-')))
    seq = seqDict[name]
    fasta = '>%s\n%s\n' % (name, seq)
    qlen = len(seq.replace('N', ''))
    print('pick %s for %s %s' % (name, genus, sp))
    return fasta, qlen