def parse_file_fasta_seqkey(file_fasta, hsh, options):
    """Read a FASTA file and accumulate read counts keyed by normalized sequence.

    For each FASTA record, the read count is extracted from the record id via
    find_cnt(), the sequence is normalized via tr() (lower-case/u/. mapped to
    upper-case/T/N), and the count is added to hsh[sequence].

    file_fasta -- path of the FASTA file to read
    hsh        -- dict mapping normalized sequence -> summed count; mutated in place
    options    -- option dict; options['-a'] == '' enables progress output to stderr

    NOTE(review): every line is strip()ped before the EOF test, so a blank line
    anywhere in the file ends parsing early — assumes inputs contain no blank
    lines; confirm against the callers.
    """
    if options.get('-a') == '':
        print_stderr('reading file into hash\n')
    _id = ''
    seq = ''
    running_1 = 0  # number of records finalized so far (progress counter)
    FASTA = open_or_die2(file_fasta, 'rb')
    while True:
        l = FASTA.readline().strip()
        if not l:
            break
        m = re.match(r'^>(\S+)', l)
        if m:
            # Header line found: remember it, then consume sequence lines
            # until the next header (records are finalized on lookahead).
            _id = m.group()
            seq = ''
            while True:
                ll = FASTA.readline().strip()
                if not ll:
                    break
                mm = re.match(r'^>(\S+)', ll)
                if mm:
                    # Next header reached: finalize the record read so far.
                    cnt = find_cnt(_id)
                    # Normalize sequence alphabet: acgtun. -> ACGTTNN
                    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
                    # ATTR: Performance issue below:
                    # create_hash_key_chain(hsh, 0, seq)
                    try:
                        hsh[seq] = (hsh[seq]) + cnt
                    except KeyError:
                        hsh[seq] = cnt
                    running_1 += 1
                    if options.get('-a') == '':
                        print_stderr('{}\r'.format(running_1))
                    _id = mm.group()
                    seq = ''
                    continue
                seq += ll
    # Finalize the last record (EOF reached before another header appeared).
    cnt = find_cnt(_id)
    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
    create_hash_key_chain(hsh, 0, seq)
    hsh[seq] += cnt
    running_1 += 1
    if options.get('-a') == '':
        print_stderr('{}\r'.format(running_1))
    FASTA.close()
def read_handler(handle):
    """Validate a FASTA stream line by line.

    Identifier lines (starting with '>') must contain no whitespace; all other
    lines must consist solely of [ACGTUNacgtun]. Each seen identifier is
    counted into the global hash_num. On the first malformed line the function
    die()s with a message that includes the (global) line counter.

    handle -- open, readable file object positioned at the start of the data

    NOTE(review): lines are strip()ped before the EOF test, so a blank line
    terminates reading early — assumes input has no blank lines.
    """
    global counter
    while True:
        rin = handle.readline().strip()
        if not rin:
            break
        counter += 1
        m = re.match(r'^\>(.+)$', rin)
        if m:
            m = m.groups()
            _id = m[0]
            if re.search(r'\s+', _id):
                die('Error in line {}: The identifier\n {}\n\ncontains white spaces\n\n{}\n\nYou could run remove_white_space_in_id.py inputfile > newfile\nThis will remove everything from the id line after the first whitespace\n'
                    .format(Nicenumber(counter), _id, hint))
            else:
                create_hash_key_chain(hash_num, 0, _id)
                hash_num[_id] += 1
        # BUGFIX: the character class was written [A|C|G|T|U|N|a|c|g|t|u|n],
        # which makes '|' itself an accepted character, so sequence lines
        # containing '|' silently passed validation. Inside [...] alternation
        # bars are literals, not separators.
        elif not re.match(r'^([ACGTUNacgtun]+)$', rin):
            die('Error in line {}: The sequence\n{}\n\ncontains characters others than [acgtunACGTUN]\n\n{}'
                .format(Nicenumber(counter), rin, hint))
def resolve_entry_file_mrd_permuted(score, permutation, refs, _hash, options):
    """Tally one permuted-.mrd entry into score histograms and empty refs.

    Increments _hash['total'][permutation][floor(score)], and additionally the
    'known' bucket when refs is non-empty, else the 'novel' bucket. The shared
    refs list is cleared in place so the caller's list object is reset for the
    next entry.

    score       -- numeric score of the entry; binned by int(floor(score))
    permutation -- permutation index parsed from the file; None means the
                   input file is malformed and the program exits
    refs        -- list of reference ids supporting the entry; emptied in place
    _hash       -- nested histogram dict; mutated in place
    options     -- option dict; '-a' names the input file for the error message
    """
    if permutation is None:
        print_stderr('The {} file is not properly formatted.\nMaybe it does not contain the lines with \"permutation int\"?\n'.format(
            options.get('-a')
        ))
        sys.exit(0)
    floor = int(math.floor(score))
    create_hash_key_chain(_hash, 0, 'total', permutation, floor)
    _hash['total'][permutation][floor] += 1
    if refs:
        create_hash_key_chain(_hash, 0, 'known', permutation, floor)
        _hash['known'][permutation][floor] += 1
    else:
        create_hash_key_chain(_hash, 0, 'novel', permutation, floor)
        _hash['novel'][permutation][floor] += 1
    # IMPROVED: was `for i in range(len(refs)): refs.pop()` — same in-place
    # clear of the shared list, without the O(n) Python-level loop.
    del refs[:]
def resolve_entry_file_mrd(score, refs, _hash):
    """Tally one .mrd entry into score histograms and empty refs.

    Increments _hash['total'][floor(score)]; when refs is non-empty the
    'known' bucket is also incremented and each referenced id's best score in
    the global hash_sig is raised to this floor if larger, otherwise the
    'novel' bucket is incremented. The shared refs list is cleared in place.

    score -- numeric score of the entry; binned by int(floor(score))
    refs  -- list of reference ids supporting the entry; emptied in place
    _hash -- nested histogram dict; mutated in place

    NOTE(review): hash_sig[ref] is read without a default, so every ref is
    assumed to be pre-registered in hash_sig — confirm against the caller.
    """
    global hash_sig
    floor = int(math.floor(score))
    create_hash_key_chain(_hash, 0, 'total', floor)
    _hash['total'][floor] += 1
    if refs:
        create_hash_key_chain(_hash, 0, 'known', floor)
        _hash['known'][floor] += 1
        for ref in refs:
            # keep the best (highest) floored score seen per reference
            if hash_sig[ref] < floor:
                hash_sig[ref] = floor
    else:
        create_hash_key_chain(_hash, 0, 'novel', floor)
        _hash['novel'][floor] += 1
    # IMPROVED: was `for i in range(len(refs)): refs.pop()  # clear refs array`
    # — identical in-place clear without the O(n) loop.
    del refs[:]
def insertfeature(db, strand, db_beg, db_end, freq):
    """Accumulate freq into the global nested position counter.

    Ensures hash_pos[db][strand][db_beg][db_end] exists (default 0) via
    create_hash_key_chain, then adds freq to it in place.
    """
    global hash_pos
    create_hash_key_chain(hash_pos, 0, db, strand, db_beg, db_end)
    bucket = hash_pos[db][strand][db_beg]
    bucket[db_end] = bucket[db_end] + freq
def parse_file_arf(file_arf, options):
    """Stream an .arf mapping file, apply the option-driven filters, and
    either print passing lines or tally per-query edit counts.

    For each parsed line the filters are, in order: -j trims trailing
    nucleotides; -a max edits; -b min / -c max query mapping length; -d/-e
    query include/exclude lists; -f/-g db include/exclude lists. Unless -h/-i
    request evaluation mode, passing lines are printed directly. In gscan
    mode edit counts are accumulated into the global hash_edits; otherwise
    evaluate_query decides whether the line is printed.

    file_arf -- path of the .arf file to read
    options  -- command-line option dict (see filter description above)
    """
    global running, gscan, hash_edits
    FILE_ARF = open_or_die(file_arf, 'rb', 'can not open {}\n'.format(file_arf))
    # FIX: the file handle was never closed (resource leak); the sibling
    # FASTA parser closes its handle, so do the same here, exception-safely.
    try:
        while True:
            line = FILE_ARF.readline()
            if not line:
                break
            m = re.match(
                r'^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)',
                line)
            if m:
                m = m.groups()
                query = m[0]
                query_map_lng = int(m[1])
                query_beg = m[2]
                query_end = int(m[3])
                query_seq = m[4]
                db = m[5]
                db_map_lng = int(m[6])
                db_beg = m[7]
                db_end = int(m[8])
                db_seq = m[9]
                strand = m[10]
                edits = int(m[11])
                edit_string = m[12]
                running += 1
                if options.get('-j') == '':
                    (query_map_lng, query_end, query_seq, db_map_lng, db_end,
                     db_seq, edits, edit_string) = remove_trailing_nts(
                        query_map_lng, query_end, query_seq, db_map_lng,
                        db_end, db_seq, edits, edit_string)
                # filter chain: any hit skips the line
                if '-a' in options and int(options.get('-a')) < edits:
                    continue
                if options.get('-b') and query_map_lng < int(options.get('-b')):
                    continue
                if options.get('-c') and int(options.get('-c')) < query_map_lng:
                    continue
                if options.get('-d') and query not in hash_queries_incl:
                    continue
                if options.get('-e') and query in hash_queries_excl:
                    continue
                if options.get('-f') and db not in hash_dbs_incl:
                    continue
                if options.get('-g') and db in hash_dbs_excl:
                    continue
                # plain filter mode: print the (possibly trimmed) line
                if not (options.get('-h') == '' or options.get('-i')):
                    pprint('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.
                           format(query, query_map_lng, query_beg, query_end,
                                  query_seq, db, db_map_lng, db_beg, db_end,
                                  db_seq, strand, edits, edit_string))
                    continue
                if gscan:
                    # first pass: count mappings per (query, edit distance)
                    create_hash_key_chain(hash_edits, 0, query, edits)
                    hash_edits[query][edits] += 1
                else:
                    evaluation = evaluate_query(query, edits, options)
                    if evaluation:
                        pprint(
                            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".
                            format(query, query_map_lng, query_beg, query_end,
                                   query_seq, db, db_map_lng, db_beg, db_end,
                                   db_seq, strand, edits, edit_string))
    finally:
        FILE_ARF.close()
def parse_file_arf(file_arf):
    '''
    read through the signature blastparsed file, fills up a hash with
    information on queries (deep sequences) mapping to the current db
    (potential precursor) and resolve each potential precursor in turn

    file_arf -- path of the signature file; lines are 13 whitespace-separated
                fields (query, lengths/coords/seqs for query and db, strand,
                edit count, edit string)
    '''
    global db_old, hash_query
    FILENAME = open_or_die(file_arf, 'rb', 'could not open file {}\n'.format(file_arf))
    # FIX: the file handle was never closed (resource leak); close it
    # exception-safely without changing the parsing logic.
    try:
        while True:
            line = FILENAME.readline()
            if not line:
                break
            m = re.match(
                r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)',
                line)
            if m:
                m = m.groups()
                query = m[0]
                query_lng = int(m[1])
                query_beg = int(m[2])
                query_end = int(m[3])
                query_seq = m[4]
                db = m[5]
                db_lng = int(m[6])
                db_beg = int(m[7])
                db_end = int(m[8])
                db_seq = m[9]
                strand = m[10]
                edits = int(m[11])
                edit_string = m[12]

                # only reads that map sense to the potential precursor are
                # considered
                if strand == "-":
                    continue

                # if the new line concerns a new db (potential precursor) then
                # the old db must be resolved
                if db_old and db_old != db:
                    # print(db, db_old)
                    resolve_potential_precursor()

                # resolve the number of reads that the deep sequence represents
                freq = find_freq(query)

                # read information of the query (deep sequence) into hash
                create_hash_key_chain(hash_query, db_beg, query, 'db_beg')
                create_hash_key_chain(hash_query, db_end, query, 'db_end')
                create_hash_key_chain(hash_query, strand, query, 'strand')
                create_hash_key_chain(hash_query, freq, query, 'freq')
                hash_query[query]["db_beg"] = db_beg
                hash_query[query]["db_end"] = db_end
                hash_query[query]["strand"] = strand
                hash_query[query]["freq"] = freq
                db_old = db
    finally:
        FILENAME.close()
    # flush the last precursor seen in the file
    resolve_potential_precursor()