def _tally_fasta_record(hsh, _id, seq):
    """Add the read count of one FASTA record to ``hsh`` under its sequence.

    The count comes from ``find_cnt(_id)`` and the key is the sequence after
    ``tr(seq, '[acgtun.]', '[ACGTTNN]')``. A missing key is created with the
    record's count; an existing key is incremented.
    """
    cnt = find_cnt(_id)
    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
    try:
        hsh[seq] = hsh[seq] + cnt
    except KeyError:
        hsh[seq] = cnt


def parse_file_fasta_seqkey(file_fasta, hsh, options):
    """Read a FASTA file and accumulate read counts keyed by sequence.

    Args:
        file_fasta: Path handed to ``open_or_die2``.
        hsh: Mapping that is updated in place; keys are normalised sequences,
            values are summed counts.
        options: Mapping of command-line switches. When ``options.get('-a')``
            is the empty string, progress is written via ``print_stderr``.

    Notes:
        * Only end-of-file terminates reading. Blank lines are skipped, so a
          stray empty line no longer truncates the input or drops the record
          that precedes it.
        * The record id is the text captured after ``>`` (without the ``>``).
        * As before, one final record is always tallied after the loop, even
          when the file contained no header line at all.
    """
    verbose = options.get('-a') == ''
    if verbose:
        print_stderr('reading file into hash\n')

    _id = ''
    seq = ''
    in_record = False  # True once the first header line has been seen
    running_1 = 0

    FASTA = open_or_die2(file_fasta, 'rb')
    try:
        while True:
            raw = FASTA.readline()
            if not raw:
                # readline() returns an empty value only at end-of-file.
                break
            line = raw.strip()
            header = re.match(r'^>(\S+)', line)
            if header:
                if in_record:
                    # A new header closes the record collected so far.
                    _tally_fasta_record(hsh, _id, seq)
                    running_1 += 1
                    if verbose:
                        print_stderr('{}\r'.format(running_1))
                _id = header.group(1)
                seq = ''
                in_record = True
            elif in_record:
                # Sequence data may span several lines; lines that precede
                # the first header are ignored.
                seq += line
    finally:
        FASTA.close()

    # Tally the last record, which has no following header to close it.
    _tally_fasta_record(hsh, _id, seq)
    running_1 += 1
    if verbose:
        print_stderr('{}\r'.format(running_1))
def remove_adapter(_id, seq, prefix):
    """Clip the adapter ``prefix`` off ``seq`` and print the result as FASTA.

    The sequence is first normalised with ``tr``. The clipped sequence is
    chosen as follows:

    1. If one or more word characters are followed by the full ``prefix``
       anywhere in the sequence, the captured word characters are used.
    2. Otherwise, if the first six characters of the sequence equal
       ``prefix``, ``prefix`` itself is used.
    3. Otherwise ``prefix`` is shortened one character at a time from the
       right until the shortened prefix matches at the very end of the
       sequence; the captured word characters in front of it are used.

    If none of these yields a non-empty result, the normalised sequence is
    printed unchanged.
    """
    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
    clipped = None

    full_hit = re.search(r'(\w+)' + prefix, seq)
    if full_hit:
        clipped = full_hit.group(1)
    elif substr(seq, 0, 6) == prefix:
        clipped = prefix
    else:
        # Try ever shorter prefixes anchored at the end of the sequence.
        while prefix:
            prefix = prefix[:-1]
            partial_hit = re.search(r'(\w+){}$'.format(prefix), seq)
            if partial_hit:
                clipped = partial_hit.group(1)
                break

    # Fall back to the whole sequence when nothing was clipped.
    print('>{}\n{}'.format(_id, clipped or seq))
def com(sequence):
    """Return ``sequence`` with its nucleotides complemented.

    Delegates to ``tr`` with the character sets ``'acgtuACGTU'`` and
    ``'TGCAATGCAA'``. Assuming ``tr`` maps characters positionally in the
    manner of Perl's ``tr///``, a/A becomes T, c/C becomes G, g/G becomes C
    and t/T/u/U become A, so the result is upper case.
    """
    source_bases = 'acgtuACGTU'
    complement_bases = 'TGCAATGCAA'
    return tr(sequence, source_bases, complement_bases)
continue if re.match(r'^\s*$', l): continue if novel or known: l = l.strip() line = re.split('\t', l) coord = 'na' if len(line) > 16 and line[16]: coord = line[16] if known: if float(line[1]) >= thres and float(line[1]) < maxs: if options.get('-d') == '': line[seqcol] = tr(line[seqcol], 'uU', 'tT') if options.get('-p') == '': m = re.search(r'\|([a-zA-Z0-9_-]*)$', line[0]) if m: line[0] = m.groups()[0] OUT.write(">{}\n{}\n".format(line[0], line[seqcol].upper())) else: OUT.write(">{}_{}_x{}_coord:{}_score:{}\n{}\n".format( line[0], line[9], line[5], coord, line[1], line[seqcol].upper())) coord = coord.strip()
# Open the file named by the first command-line argument; report the failure
# and exit with status -1 if it cannot be opened.
try:
    IN = open(sys.argv[1], 'rb')
except IOError:
    print('cannot open file {}'.format(sys.argv[1]))
    sys.exit(-1)
# Process the input one line at a time until readline() signals end-of-file.
while True:
    l = IN.readline()
    if not l:
        break
    # Each line is a tab-separated record.
    # NOTE(review): the column layout (strand in field 1, sequence in field 4,
    # mismatch list in field 7) looks like aligner output -- confirm the format.
    line = re.split(r'\t', l)
    if line[1] == '-':
        # Field 1 is '-': reverse field 4 and complement it base by base.
        line[4] = str_reverse(line[4])
        line[4] = tr(line[4], 'ACGTN', 'TGCAN')
    # gseq starts as the lower-cased field 4 and edit as the same number of
    # 'm' characters, both passed through ssplit.
    # NOTE(review): ssplit presumably yields a list of single characters,
    # since both results are indexed and assigned below -- confirm.
    gseq = ssplit(line[4].lower())
    edit = ssplit('m' * len(line[4]))
    # Number of change entries that matched the pattern below.
    mm = 0
    if line[7]:
        # Field 7 is a comma-separated list of change descriptions.
        changes = re.split(r',', line[7])
        for change in changes:
            # Expected shape: "<position>:<word>><word>".
            match = re.search(r'(\d+):(\w+)\>\w+', change)
            if match:
                match = match.groups()
                mm += 1
                # Put the lower-cased captured word at that position of gseq
                # and flag the same position in edit with 'M'.
                gseq[int(match[0])] = match[1].lower()
                edit[int(match[0])] = 'M'
def _store_struct_record(_id, desc, seq, struct, mfe):
    """Save one parsed record into the module-level ``hash_*`` mappings."""
    hash_desc[_id] = desc
    hash_seq[_id] = seq
    hash_struct[_id] = struct
    hash_mfe[_id] = mfe


def parse_file_struct(file_struct):
    """Parse a FASTA-like structure file into the module-level hashes.

    Every record starts with a header line ``>id description``. For the lines
    that follow, up to the next header:

    * a line starting with a word character has u/U replaced by t/T via
      ``tr`` and is appended to the record's sequence;
    * the first run of ``.``, ``(`` and ``)`` characters on a line is appended
      to the record's structure string;
    * a parenthesised negative decimal number, e.g. ``(-12.30)``, is stored
      as the record's mfe value (the text between the parentheses).

    Results are written to ``hash_desc``, ``hash_seq``, ``hash_struct`` and
    ``hash_mfe`` keyed by record id.

    Args:
        file_struct: Path handed to ``open_or_die``.

    Notes:
        A file without any header line now leaves the hashes untouched;
        previously the final store raised because no record variables had
        been bound. The file handle is closed even if parsing raises.
    """
    FILE_STRUCT = open_or_die(file_struct, 'rb',
                              'can not open file {}\n'.format(file_struct))

    _id = None  # None until the first header line has been seen
    desc = seq = struct = mfe = ""

    try:
        while True:
            line = FILE_STRUCT.readline()
            if not line:
                # readline() returns an empty value only at end-of-file.
                break
            line = line.strip()

            header = re.match(r'^>(\S+)\s*(.*)', line)
            if header:
                if _id is not None:
                    # A new header closes the record collected so far.
                    _store_struct_record(_id, desc, seq, struct, mfe)
                _id, desc = header.groups()
                seq = struct = mfe = ""
                continue

            if _id is None:
                # Content before the first header belongs to no record.
                continue

            if re.match(r'^\w', line):
                line = tr(line, 'uU', 'tT')
                seq += line

            found = re.search(r'((\.|\(|\))+)', line)
            if found:
                struct += found.group(1)

            found = re.search(r'\((\s*-\d+\.\d+)\)', line)
            if found:
                mfe = found.group(1)
    finally:
        FILE_STRUCT.close()

    # The last record has no following header to close it.
    if _id is not None:
        _store_struct_record(_id, desc, seq, struct, mfe)
if mm: remove_adapter(_id, seq, prefix) _id = mm.groups()[0] seq = '' continue seq += ll remove_adapter(_id, seq, prefix) FASTA.close() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('file_fasta') parser.add_argument('seq_adapter') if len(sys.argv) != 3: print(usage) sys.exit(-1) args = parser.parse_args(sys.argv[1:3]) file_fasta = args.file_fasta seq_adapter = args.seq_adapter seq_test = "TCGTATGCCGTCTTCTGCTTGT" prefix = substr(seq_adapter, 0, 6) prefix = tr(prefix, '[acgtun.]', '[ACGTTNN]') remove_adapters(file_fasta, prefix) sys.exit(0)