# fixname - Fix hit names in the blastlist.
import argparse
import re
import sys

import name  # local project module providing genid()


def main():
    parser = argparse.ArgumentParser(description='fixname - Fix hit name in the blastlist')
    parser.add_argument('input_file')
    parser.add_argument('-o', '--output', dest='output_file',
                        help='output file name. If this option is not specified, the script will '
                             'generate one with a unique identifier in the current directory.')
    args = parser.parse_args()
    if args.output_file is None:
        args.output_file = args.input_file + '_out_' + name.genid() + '.fix'

    hitname = re.compile(r'.*?(gi\|\d*?\|.*?\|.*?\|)(.*)')
    with open(args.input_file, 'r') as fin, open(args.output_file, 'w') as fw:
        for linum, line in enumerate(fin, start=1):
            # Pass blank, comment ('#') and header ('a...') lines through unchanged.
            if line.lstrip() == '' or line.lstrip()[0] in ('#', 'a'):
                fw.write(line)
                fw.flush()
            else:
                data = line.split('\t')
                match = hitname.match(data[26])
                if match is None:
                    print('No matched name in line ' + str(linum) + '.')
                    print('Please have a check.')
                    sys.exit()
                else:
                    # Rewrite the hit-name column and the description column.
                    data[4] = match.group(1)
                    data[26] = match.group(1) + match.group(2) + '\n'
                    fw.write('\t'.join(data))
                    fw.flush()
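# All of the scripts in this file build default output paths with
# name.genid(), which is not shown in this excerpt. Below is a minimal,
# hypothetical sketch of that helper, assuming genid() only needs to return
# a short, reasonably unique identifier string; the real implementation may
# differ.
import uuid


def genid():
    # A short hex identifier, e.g. '9f1c2ab3'.
    return uuid.uuid4().hex[:8]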
# fa2lens - Extract length data from a fasta file.
import argparse

from Bio import SeqIO

import name  # local project module providing genid()


def main():
    parser = argparse.ArgumentParser(description='fa2lens - Extract length data from a fasta file')
    parser.add_argument('input_file')
    parser.add_argument('-s', '--sep', dest='sep', default='\n',
                        help='separator (default: newline)')
    parser.add_argument('-o', '--output', dest='output_file',
                        help='output file name. If this option is not specified, the script will '
                             'generate one with a unique identifier in the current directory.')
    args = parser.parse_args()
    if args.output_file is None:
        args.output_file = args.input_file + '_out_' + name.genid() + '.leng.txt'

    with open(args.input_file, 'r') as fin, open(args.output_file, 'w') as fw:
        # One length per record, joined with the chosen separator.
        records = (str(len(record)) for record in SeqIO.parse(fin, 'fasta'))
        fw.write(args.sep.join(records))
        fw.flush()
# blastnol - Find the non-overlapping hits in the blast result.
import argparse
import os
from subprocess import Popen, PIPE

from Bio import SeqIO

import calculate  # local project module providing get_non_overlap()
import header     # local project module providing the blastlist header
import name       # local project module providing genid()


def main():
    parser = argparse.ArgumentParser(description='blastnol - Find the non-overlapping hits in the blast result')
    parser.add_argument('input_file')
    parser.add_argument('-o', '--output-directory', dest='output_dir',
                        help='output directory name. If this option is not specified, the script will '
                             'generate one with a unique identifier in the current directory.')
    parser.add_argument('-q', '--query-sequence', dest='query_fa',
                        help='fasta file of query sequences. If this option is specified, the script will '
                             'generate a new fasta file that contains truncated long sequences.')
    args = parser.parse_args()
    if args.output_dir is None:
        args.output_dir = args.input_file + '_out_' + name.genid()
    else:
        args.output_dir = args.output_dir.rstrip('/')
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if args.query_fa is not None:
        query_fa = dict(SeqIO.index(args.query_fa, 'fasta'))
        fw_fa = open(args.output_dir + '/truncated.fa', 'w')

    # Keep the data rows (numeric first column) and sort them by the rank criteria.
    with open(args.output_dir + '/sort.temp', 'w') as fwsort:
        awk_cmd = "awk -F'\t' 'int($1) { print $0 }' " + args.input_file
        sort_cmd = "sort -t$'\t' -k9g,9 -k4d,4 -k18g,18 -k22gr,22 -k19gr,19 -k26gr,26 -k6gr"
        awk_proc = Popen(awk_cmd, stdout=PIPE, executable='/bin/bash', shell=True)
        sort_proc = Popen(sort_cmd, stdin=awk_proc.stdout, stdout=fwsort,
                          executable='/bin/bash', shell=True)
        sort_proc.communicate()

    # Group HSPs by query, then by hit.
    seq = {}
    with open(args.output_dir + '/sort.temp', 'r') as fi:
        for line in fi:
            data = line.split('\t')
            query_name = data[3]
            hit_name = data[4]
            query_strand = int(data[8])
            if query_strand < 0:
                query_name = '-' + query_name
                query_hsp_start = int(data[7])
                query_hsp_end = int(data[6])
            else:
                query_hsp_start = int(data[6])
                query_hsp_end = int(data[7])
            if query_name in seq:
                for i in range(len(seq[query_name])):
                    if hit_name in seq[query_name][i][0]:
                        seq[query_name][i][1].append((query_hsp_start, query_hsp_end, [line]))
                        break
                else:
                    seq[query_name].append((hit_name, [(query_hsp_start, query_hsp_end, [line])]))
            else:
                seq.update({query_name: [(hit_name, [(query_hsp_start, query_hsp_end, [line])])]})

    # Combine hsps
    for query_name in seq:
        hit_rank = 1
        for i in range(len(seq[query_name])):
            hit = seq[query_name][i]
            if len(hit[1]) > 1:
                # Combine hsps
                pos_start, pos_end, lines = combine_hsps(hit[1])
                seq[query_name][i] = ([hit[0]], [(pos_start, pos_end)], [lines], [hit_rank])
            else:
                seq[query_name][i] = ([hit[0]], [(hit[1][0][0], hit[1][0][1])], [hit[1][0][2]], [hit_rank])
            hit_rank += 1

    # Sort hits by start position
    for query_name in seq:
        seq[query_name] = sorted(seq[query_name], key=get_start_pos)

    # Check overlap
    for query_name, hits in seq.items():
        while len(hits) > 1:
            position = calculate.get_non_overlap((hits[0][1][0][0], hits[0][1][-1][1]),
                                                 (hits[1][1][0][0], hits[1][1][0][1]))
            if position is not None:
                # The two sequences are non-overlapping; combine them.
                seq[query_name][0] = (hits[0][0] + hits[1][0], hits[0][1] + hits[1][1],
                                      hits[0][2] + hits[1][2], hits[0][3] + hits[1][3])
                seq[query_name].pop(1)
            else:
                # Compare the hit ranks to determine which one is retained.
                if seq[query_name][0][3][-1] < seq[query_name][1][3][0]:
                    # Discard the next hit.
                    seq[query_name].pop(1)
                else:
                    # Discard the last hit, and join the next hit.
                    seq[query_name][0][0].pop(-1)
                    seq[query_name][0][1].pop(-1)
                    seq[query_name][0][2].pop(-1)
                    seq[query_name][0][3].pop(-1)
                    seq[query_name][0] = (hits[0][0] + hits[1][0], hits[0][1] + hits[1][1],
                                          hits[0][2] + hits[1][2], hits[0][3] + hits[1][3])
                    seq[query_name].pop(1)

    # Write data
    with open(args.output_dir + '/hit_cover.tsv', 'w') as fw:
        query_num = 0
        query_num_cover_eq_two = 0
        query_num_cover_eq_three = 0
        query_num_cover_ge_four = 0
        hit_set = set()
        hr = header.blastlist()
        fw.write(hr.get_all_tab() + '\n')
        fw.flush()
        for query, hits in seq.items():
            query = query.split(' ')[0]
            if len(hits[0][0]) > 1:
                query_num += 1
                if len(hits[0][0]) == 2:
                    query_num_cover_eq_two += 1
                elif len(hits[0][0]) == 3:
                    query_num_cover_eq_three += 1
                else:
                    query_num_cover_ge_four += 1
                for lines in hits[0][2]:
                    for line in lines:
                        hit_set.add(line.split('\t')[4])
                        fw.write(line)
                        fw.flush()
                if args.query_fa is not None:
                    # Truncated queries
                    segment_num = 0
                    for pos_start, pos_end in hits[0][1]:
                        fw_fa.write('>' + query + '_s' + str(segment_num) + '\n')
                        fw_fa.write(str(query_fa[query].seq)[pos_start - 1:pos_end] + '\n')
                        fw_fa.flush()
                        segment_num += 1
                    query_fa.pop(query)
            else:
                if args.query_fa is not None:
                    # Full-sequence queries
                    fw_fa.write('>' + query + '\n')
                    fw_fa.write(str(query_fa[query].seq) + '\n')
                    query_fa.pop(query)
        if args.query_fa is not None:
            # No-hit queries
            for query in query_fa:
                fw_fa.write('>' + query + '\n')
                fw_fa.write(str(query_fa[query].seq) + '\n')
                fw_fa.flush()
            fw_fa.close()
        fw.write('\n')
        fw.write('# Number of queries that cover >= 2 hits: ' + str(query_num) + '\n')
        fw.write('# Cover 2 hits: ' + str(query_num_cover_eq_two) + '\n')
        fw.write('# Cover 3 hits: ' + str(query_num_cover_eq_three) + '\n')
        fw.write('# Cover >= 4 hits: ' + str(query_num_cover_ge_four) + '\n')
        fw.write('# Number of covered hits: ' + str(len(hit_set)))
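# blastnol calls combine_hsps(), get_start_pos() and calculate.get_non_overlap(),
# none of which appear in this excerpt. The sketches below are hypothetical,
# assuming: combine_hsps() collapses one hit's HSPs into a single spanning
# range and pools their source lines; get_start_pos() is the sort key for a
# hit entry of the form ([names], [(start, end), ...], [lines], [ranks]); and
# get_non_overlap() returns a truthy value only when two ranges do not overlap.


def combine_hsps(hsps):
    # hsps: list of (start, end, [line]) tuples belonging to one hit.
    pos_start = min(h[0] for h in hsps)
    pos_end = max(h[1] for h in hsps)
    lines = [line for h in hsps for line in h[2]]
    return pos_start, pos_end, lines


def get_start_pos(hit_entry):
    # Sort hits by the start coordinate of their first segment.
    return hit_entry[1][0][0]


def get_non_overlap(range_a, range_b):
    # Return the two ranges when they do not overlap, otherwise None,
    # mirroring how blastnol tests the return value.
    if range_a[1] < range_b[0] or range_b[1] < range_a[0]:
        return range_a, range_b
    return None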
# fetchfa - Fetch fasta files from Entrez.
import argparse
import os
import sys

from Bio import Entrez

import logmsg  # local project module providing the log message helper
import name    # local project module providing genid()


def main():
    proglog = logmsg.message(prog='fetchfa', cmd=' '.join(sys.argv))
    parser = argparse.ArgumentParser(description='fetchfa - Fetch fasta files from Entrez')
    parser.add_argument('input_file', nargs='?')
    parser.add_argument('-d', '--db', dest='database', default='protein',
                        help='database (default: protein)')
    parser.add_argument('-q', '--query', dest='query_id',
                        help='accessions to be fetched. If this option is specified, the script will use '
                             'the given values to fetch data, and no input file needs to be handled.')
    parser.add_argument('-o', '--output', dest='output', default='fetchfa_out_' + name.genid(),
                        help='output directory or file name. If this option is not specified, the script '
                             'will generate one with a unique identifier in the current directory.')
    parser.add_argument('-l', '--log', dest='log_file', help='log file name')
    args = parser.parse_args()

    if args.log_file is None:
        fwlog = open(args.output + '.log', 'w')
    else:
        fwlog = open(args.log_file, 'w')
    for i in proglog.start_message():
        fwlog.write(i)
    fwlog.flush()

    Entrez.email = name.genid() + '@example.com'
    if args.query_id is not None:
        # Fetch the accessions given on the command line in a single request.
        # (Reuse the already-open log file instead of reopening and truncating it.)
        with open(args.output + '.fa', 'w') as fw:
            handle = Entrez.efetch(db=args.database, id=args.query_id,
                                   rettype='fasta', retmode='text')
            fw.write(handle.read())
            fw.flush()
            handle.close()
        fwlog.write('# Fetched sequences: ' + str(len(args.query_id.split(','))) + '\n')
        fwlog.write('#\n')
        for i in proglog.end_message():
            fwlog.write(i)
        fwlog.flush()
    else:
        # Fetch one fasta file per input line, batching large accession lists.
        if not os.path.exists(args.output):
            os.makedirs(args.output)
        with open(args.input_file, 'r') as fin:
            query_num = 0
            for line in fin:
                if line.lstrip() == '' or line.lstrip()[0] in ('#', 'a'):
                    continue
                query_num += 1
                with open(os.path.abspath(args.output) + '/' + line.split('\t')[0] + '.fa', 'w') as fw:
                    alist = line.rstrip().split('\t')[1].split(',')
                    # Fetch at most 30 accessions per request.
                    while len(alist) > 30:
                        alist_part = alist[0:30]
                        alist = alist[30:]
                        handle = Entrez.efetch(db=args.database, id=','.join(alist_part),
                                               rettype='fasta', retmode='text')
                        fw.write(handle.read())
                        fw.flush()
                        handle.close()
                    handle = Entrez.efetch(db=args.database, id=','.join(alist),
                                           rettype='fasta', retmode='text')
                    fw.write(handle.read())
                    fw.flush()
                    handle.close()
        fwlog.write('# Fetched queries: ' + str(query_num) + '\n')
        fwlog.write('#\n')
        for i in proglog.end_message():
            fwlog.write(i)
        fwlog.flush()
    fwlog.close()
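# fetchfa and blast2accmap write their logs through logmsg.message, a local
# helper not shown in this excerpt. A minimal sketch under the assumption that
# start_message() and end_message() return iterables of '#'-prefixed lines
# recording the program name, command line and timestamps.
import time


class message(object):
    def __init__(self, prog, cmd):
        self.prog = prog
        self.cmd = cmd

    def start_message(self):
        return ['# Program: ' + self.prog + '\n',
                '# Command: ' + self.cmd + '\n',
                '# Started: ' + time.strftime('%Y-%m-%d %H:%M:%S') + '\n',
                '#\n']

    def end_message(self):
        return ['# Finished: ' + time.strftime('%Y-%m-%d %H:%M:%S') + '\n']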
# commutate - Find the common mutation profile.
import argparse
import os

import name  # local project module providing genid()


def main():
    parser = argparse.ArgumentParser(description='commutate - Find the common mutation profile')
    parser.add_argument('input', nargs='*')
    parser.add_argument('-o', '--output-directory', dest='output', default='commutate_out_' + name.genid(),
                        help='output directory. If this option is not specified, the script will '
                             'generate one with a unique identifier in the current directory.')
    args = parser.parse_args()
    args.output = args.output.rstrip('/')
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    source = []
    source_res_eq_susp = {}
    source_rec_eq_susp = {}
    source_rec_eq_res = {}
    for afile in args.input:
        source.append(afile)
        source_res_eq_susp[afile] = {}
        source_rec_eq_susp[afile] = {}
        source_rec_eq_res[afile] = {}
        with open(afile, 'r') as fin:
            for line in fin:
                if line.lstrip() == '' or line.lstrip()[0] in ('#', 'm'):
                    continue
                data = line.rstrip().split('\t')
                # Collect the mutation sets for each comparison class.
                if int(data[3]) > 0:
                    source_res_eq_susp[afile].update({data[1]: set(data[9].split(','))})
                if int(data[4]) > 0:
                    source_rec_eq_susp[afile].update({data[1]: set(data[10].split(','))})
                if int(data[5]) > 0:
                    source_rec_eq_res[afile].update({data[1]: set(data[11].split(','))})

    common_hitname_res_eq_susp = get_common_hitname(source_res_eq_susp)
    common_mutation_profile_res_eq_susp = get_common_mutate(source_res_eq_susp, common_hitname_res_eq_susp)
    common_hitname_rec_eq_susp = get_common_hitname(source_rec_eq_susp)
    common_mutation_profile_rec_eq_susp = get_common_mutate(source_rec_eq_susp, common_hitname_rec_eq_susp)
    common_hitname_rec_eq_res = get_common_hitname(source_rec_eq_res)
    common_mutation_profile_rec_eq_res = get_common_mutate(source_rec_eq_res, common_hitname_rec_eq_res)

    writefile(args.output + '/common_mutation_profile_res_eq_susp.txt', common_mutation_profile_res_eq_susp)
    writefile(args.output + '/common_mutation_profile_rec_eq_susp.txt', common_mutation_profile_rec_eq_susp)
    writefile(args.output + '/common_mutation_profile_rec_eq_res.txt', common_mutation_profile_rec_eq_res)
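# commutate relies on get_common_hitname(), get_common_mutate() and
# writefile(), which this excerpt does not define. Hypothetical sketches,
# assuming a "common" hit name is one present in every source file and the
# common profile is the intersection of the per-file mutation sets.


def get_common_hitname(source):
    # source: {filename: {hit_name: set(mutations)}}
    names = [set(hits) for hits in source.values()]
    return set.intersection(*names) if names else set()


def get_common_mutate(source, common_hitnames):
    common = {}
    for hit_name in common_hitnames:
        profiles = [source[afile][hit_name] for afile in source]
        common[hit_name] = set.intersection(*profiles)
    return common


def writefile(path, common_profiles):
    with open(path, 'w') as fw:
        for hit_name in sorted(common_profiles):
            fw.write(hit_name + '\t' + ','.join(sorted(common_profiles[hit_name])) + '\n')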
# blast2accmap - Extract names of query and hit sequences.
import argparse
import re
import sys

from Bio.Blast import NCBIXML

import logmsg  # local project module providing the log message helper
import name    # local project module providing genid()


def main():
    proglog = logmsg.message(prog='blast2accmap', cmd=' '.join(sys.argv))
    parser = argparse.ArgumentParser(description='blast2accmap - Extract names of query and hit sequences')
    parser.add_argument('input_file')
    parser.add_argument('-e', '--evalue', dest='ev_thresh', type=float, default=0.01,
                        help='e-value threshold (default: 0.01)')
    parser.add_argument('-t', '--min_hit_num', dest='min_hit_num', type=int, default=1,
                        help='minimum number of hit sequences (default: 1)')
    parser.add_argument('-o', '--output', dest='output_file',
                        help='output file name. If this option is not specified, the script will '
                             'generate one with a unique identifier in the current directory.')
    args = parser.parse_args()
    if args.output_file is None:
        args.output_file = args.input_file + '_out_' + name.genid() + '.blastaccmap'

    total_query_num = 0
    parsed_query_num = 0
    with open(args.input_file, 'r') as result_handle, open(args.output_file, 'w') as fw:
        blast_records = NCBIXML.parse(result_handle)
        for i in proglog.start_message():
            fw.write(i)
        fw.write('#\n')
        fw.write('# E-value threshold: ' + str(args.ev_thresh) + '\n')
        fw.write('# min hit number: ' + str(args.min_hit_num) + '\n')
        fw.write('#\n')
        fw.write('# filename query_accession,hit_accession_1,hit_accession_2, ...\n\n')
        fw.flush()

        gi = re.compile(r'gi\|(\d+)\|')
        for blast_record in blast_records:
            total_query_num += 1
            if len(blast_record.alignments) < args.min_hit_num:
                continue
            hit_accs = []
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if alignment.accession in blast_record.query:
                        # If the query hit itself, ignore it.
                        continue
                    if hsp.expect <= args.ev_thresh:
                        # Test the match object before extracting the GI number;
                        # fall back to the accession when there is no GI.
                        match = gi.match(alignment.hit_id)
                        if match is None:
                            print(alignment.accession + ' does not have gi.')
                            hit_accs.append(alignment.accession)
                        else:
                            hit_accs.append(match.group(1))
                        break
            if len(hit_accs) >= args.min_hit_num:
                parsed_query_num += 1
                fw.write(blast_record.query + '\t' + blast_record.query + ',')
                fw.write(','.join(hit_accs) + '\n')
                fw.flush()

        fw.write('\n')
        fw.write('# Total queries: ' + str(total_query_num) + '\n')
        fw.write('# Parsed queries: ' + str(parsed_query_num) + '\n')
        fw.write('#\n')
        for i in proglog.end_message():
            fw.write(i)
        fw.flush()
# commonfa - Generate fasta files of sequences with a common hit.
import argparse
import os
import re
import sys
import ConfigParser
from multiprocessing import Pool
from subprocess import Popen, PIPE

from Bio import SeqIO

import logmsg  # local project module providing the log message helper
import name    # local project module providing genid()


def main():
    proglog = logmsg.message(prog='commonfa', cmd=' '.join(sys.argv))
    parser = argparse.ArgumentParser(description='commonfa - Generate fasta files of sequences with common hit')
    parser.add_argument('-b', '--blastlist', dest='input_files_blastlist', nargs='*', required=True,
                        help='blastlist files (required)')
    parser.add_argument('-f', '--fasta', nargs='*', dest='input_files_fasta', required=True,
                        help='fasta files (required)')
    parser.add_argument('-o', '--output-directory', dest='output', default='commonfa_out_' + name.genid(),
                        help='output directory. If this option is not specified, the script will '
                             'generate one with a unique identifier in the current directory.')
    parser.add_argument('-p', '--process', dest='process_num', type=int, default=1,
                        help='number of threads (CPUs) to use')
    args = parser.parse_args()

    config = ConfigParser.ConfigParser()
    config.read(os.path.dirname(os.path.abspath(__file__)) + '/config/group.cfg')

    if not os.path.exists(args.output.rstrip('/') + '/msainput'):
        os.makedirs(args.output.rstrip('/') + '/msainput')
    fwlog = open(args.output.rstrip('/') + '/commonfa.log', 'w')
    for i in proglog.start_message():
        fwlog.write(i)
    fwlog.flush()

    # Keep rows whose hit-name column contains 'ref', then sort by hit name
    # and the score columns.
    awk_cmd = "awk -F'\t' '$5 ~ /ref/ { print $0 }' " + ' '.join(args.input_files_blastlist)
    sort_cmd = "sort -t$'\t' -k5d,5 -k18g,18 -k22gr,22 -k19gr,19 -k26gr,26 -k6gr"
    fwsort = open(args.output.rstrip('/') + '/sort.temp', 'w')
    awk_proc = Popen(awk_cmd, stdout=PIPE, executable='/bin/bash', shell=True)
    sort_proc = Popen(sort_cmd, stdin=awk_proc.stdout, stdout=fwsort,
                      executable='/bin/bash', shell=True)
    sort_proc.communicate()
    fwsort.close()

    fasta = {}
    for filename in args.input_files_fasta:
        fasta.update(dict(SeqIO.index(filename, 'fasta')))

    susp_names = config.get('Susp', 'bdor').split(',')
    res_names = config.get('Res', 'bdor').split(',')
    rec_names = config.get('Rec', 'bdor').split(',')
    has_susp = has_res = has_rec = False

    commonhit = {}
    hitname = re.compile(r'.*gi\|\d*?\|(.*?)\|(.*?)\|.*')
    with open(args.output.rstrip('/') + '/sort.temp', 'r') as fin:
        for line in fin:
            data = line.split('\t')
            match = hitname.match(data[4])
            query_name = data[3]
            hit_name = match.group(2)
            query_frame = int(data[9])
            if hit_name in commonhit:
                # Record at most one query per group (susp/res/rec) for each hit.
                if any(i in query_name for i in susp_names):
                    if has_susp is True:
                        continue
                    else:
                        has_susp = True
                if any(i in query_name for i in res_names):
                    if has_res is True:
                        continue
                    else:
                        has_res = True
                if any(i in query_name for i in rec_names):
                    if has_rec is True:
                        continue
                    else:
                        has_rec = True
                commonhit[hit_name].append((query_name, query_frame))
            else:
                commonhit[hit_name] = [(query_name, query_frame)]
                has_susp = has_res = has_rec = False
                if any(i in query_name for i in susp_names):
                    has_susp = True
                if any(i in query_name for i in res_names):
                    has_res = True
                if any(i in query_name for i in rec_names):
                    has_rec = True

    tasks = []
    parsed_num = 0
    for hit in commonhit:
        # Keep only hits that are shared by every input blastlist.
        if len(commonhit[hit]) == len(args.input_files_blastlist):
            tasks.append((hit, commonhit[hit], fasta, args))
            parsed_num += 1
    pool = Pool(processes=args.process_num)
    pool.map(do_parsing, tasks)

    fwlog.write('# Parsed hits: ' + str(parsed_num) + '\n')
    for i in proglog.end_message():
        fwlog.write(i)
    fwlog.flush()
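# commonfa farms each shared hit out to do_parsing() through Pool.map(), but
# the worker is not part of this excerpt. A hypothetical sketch, assuming each
# task writes the queries sharing a hit into one fasta file under
# <output>/msainput for multiple sequence alignment, reverse-complementing
# queries whose blast frame was negative.


def do_parsing(task):
    hit_name, queries, fasta, args = task
    out_path = args.output.rstrip('/') + '/msainput/' + hit_name + '.fa'
    with open(out_path, 'w') as fw:
        for query_name, query_frame in queries:
            record = fasta[query_name]
            seq = str(record.seq)
            if query_frame < 0:
                seq = str(record.seq.reverse_complement())
            fw.write('>' + query_name + '\n')
            fw.write(seq + '\n')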