import os
import sys
from multiprocessing import Manager, pool

# Project-local modules assumed importable alongside this script:
# config, logmsg, msaio, parser, and the begin_parse/write_result helpers.


def main():
    # Check version: compare version_info tuples, not the sys.version string
    # ('3.10' sorts before '3.3' as a string and would be wrongly rejected).
    if sys.version_info < (3, 3):
        your_version = sys.version.split(' ')[0]
        print('* Your Python version (%s) is too old! Please upgrade to 3.3+!' % your_version)
        sys.exit()

    proglog = logmsg.message(prog='msaparser', cmd=' '.join(sys.argv))
    options, opt_others = config.get_configuration(os.path.dirname(os.path.abspath(__file__)))
    options.output_directory = options.output_directory.rstrip('/')
    options.source_directory = options.source_directory.rstrip('/')
    if not os.path.exists(options.output_directory + '/html'):
        os.makedirs(options.output_directory + '/html')

    mainfile = options.output_directory + '/' + opt_others.get('output_files').get('main')
    with open(mainfile, 'w') as fw:
        for msg in proglog.start_message():
            fw.write(msg)
        fw.write('\n')
        fw.flush()
        parser.writeheader(fw)

    cluinput = msaio.FileInput(options.source_directory)
    proc_manager = Manager()
    q_write = proc_manager.Queue()
    proc = pool.Pool(processes=options.process_num)

    # Hand the input files to the workers in batches of at most 100,
    # draining the shared result queue after each batch.
    while cluinput.files:
        files = cluinput.files[:100]
        cluinput.files = cluinput.files[100:]
        tasks = []
        for root, filename in files:
            tasks.append((root, filename, options, opt_others, q_write))
        proc.starmap(begin_parse, tasks)
        write_result(q_write, options.output_directory, mainfile)

    with open(mainfile, 'a') as fw:
        fw.write('\n')
        for msg in proglog.end_message():
            fw.write(msg)
        fw.flush()
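# A minimal, self-contained sketch of the batching pattern used above: a
# Manager queue shared with Pool.starmap workers and drained after each
# batch. The worker and its payload here are hypothetical stand-ins for the
# project's begin_parse/write_result, shown only to illustrate the contract.
from multiprocessing import Manager, Pool


def _demo_worker(item, q_write):
    # Stand-in for begin_parse: do some work and queue the result.
    q_write.put(item * 2)


if __name__ == '__main__':
    manager = Manager()
    q_write = manager.Queue()
    with Pool(processes=2) as demo_pool:
        demo_pool.starmap(_demo_worker, [(i, q_write) for i in range(4)])
    # Drain the queue, as write_result does for each batch.
    while not q_write.empty():
        print(q_write.get())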
import argparse
import os
import sys

from Bio import Entrez

# Project-local modules assumed importable alongside this script:
# logmsg and name (unique-identifier generator).


def main():
    proglog = logmsg.message(prog='fetchfa', cmd=' '.join(sys.argv))

    parser = argparse.ArgumentParser(description='fetchfa - Fetch fasta files from Entrez')
    parser.add_argument('input_file', nargs='?')
    parser.add_argument('-d', '--db', dest='database', default='protein',
                        help='database (default: protein)')
    parser.add_argument('-q', '--query', dest='query_id',
                        help='accessions to be fetched. If this option is specified, the script will '
                             'use the values to fetch data, and no input file needs to be handled.')
    parser.add_argument('-o', '--output', dest='output', default='fetchfa_out_' + name.genid(),
                        help='output directory or file name. If this option is not specified, the '
                             'script will generate one with a unique identifier in the current directory.')
    parser.add_argument('-l', '--log', dest='log_file', help='log file name')
    args = parser.parse_args()

    if args.log_file is None:
        fwlog = open(args.output + '.log', 'w')
    else:
        fwlog = open(args.log_file, 'w')
    for i in proglog.start_message():
        fwlog.write(i)
    fwlog.flush()

    Entrez.email = name.genid() + '@example.com'

    if args.query_id is not None:
        # Fetch the comma-separated accessions given on the command line directly.
        # Reuse the log handle opened above instead of reopening (and truncating)
        # the same log file.
        with open(args.output + '.fa', 'w') as fw:
            handle = Entrez.efetch(db=args.database, id=args.query_id, rettype='fasta', retmode='text')
            fw.write(handle.read())
            fw.flush()
            handle.close()
        fwlog.write('# Fetched sequences: ' + str(len(args.query_id.split(','))) + '\n')
        fwlog.write('#\n')
        for i in proglog.end_message():
            fwlog.write(i)
        fwlog.flush()
    else:
        if not os.path.exists(args.output):
            os.makedirs(args.output)
        query_num = 0
        with open(args.input_file, 'r') as fin:
            for line in fin:
                # Skip blank lines, '#' comment lines, and (presumably header)
                # lines starting with 'a'.
                if line.lstrip() == '' or line.lstrip()[0] in ('#', 'a'):
                    continue
                query_num += 1
                with open(os.path.abspath(args.output) + '/' + line.split('\t')[0] + '.fa', 'w') as fw:
                    alist = line.rstrip().split('\t')[1].split(',')
                    # Send Entrez requests in chunks of at most 30 accessions.
                    while len(alist) > 30:
                        alist_part = alist[:30]
                        alist = alist[30:]
                        handle = Entrez.efetch(db=args.database, id=','.join(alist_part),
                                               rettype='fasta', retmode='text')
                        fw.write(handle.read())
                        fw.flush()
                        handle.close()
                    handle = Entrez.efetch(db=args.database, id=','.join(alist),
                                           rettype='fasta', retmode='text')
                    fw.write(handle.read())
                    fw.flush()
                    handle.close()
        fwlog.write('# Fetched queries: ' + str(query_num) + '\n')
        fwlog.write('#\n')
        for i in proglog.end_message():
            fwlog.write(i)
        fwlog.flush()
    fwlog.close()
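# The tab-separated input file consumed above is expected to look like the
# following (set names and accessions are hypothetical; blank lines and lines
# starting with '#' or 'a' are skipped):
#
#   set1<TAB>NP_000001.1,NP_000002.1,NP_000003.1
#   set2<TAB>NP_000004.1
#
# Column 1 becomes the per-query output file name ("set1.fa" in the output
# directory); column 2 is a comma-separated accession list, fetched from
# Entrez in chunks of 30.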
import argparse
import re
import sys

from Bio.Blast import NCBIXML

# Project-local modules assumed importable alongside this script:
# logmsg and name (unique-identifier generator).


def main():
    proglog = logmsg.message(prog='blast2accmap', cmd=' '.join(sys.argv))

    parser = argparse.ArgumentParser(description='blast2accmap - Extract names of query and hit sequences')
    parser.add_argument('input_file')
    parser.add_argument('-e', '--evalue', dest='ev_thresh', type=float, default=0.01,
                        help='e-value threshold (default: 0.01)')
    parser.add_argument('-t', '--min_hit_num', dest='min_hit_num', type=int, default=1,
                        help='minimum number of hit sequences (default: 1)')
    parser.add_argument('-o', '--output', dest='output_file',
                        help='output file name. If this option is not specified, the script will '
                             'generate one with a unique identifier in the current directory.')
    args = parser.parse_args()

    if args.output_file is None:
        args.output_file = args.input_file + '_out_' + name.genid() + 'blastaccmap'

    total_query_num = 0
    parsed_query_num = 0
    with open(args.input_file, 'r') as result_handle, open(args.output_file, 'w') as fw:
        blast_records = NCBIXML.parse(result_handle)
        for i in proglog.start_message():
            fw.write(i)
        fw.write('#\n')
        fw.write('# E-value threshold: ' + str(args.ev_thresh) + '\n')
        fw.write('# min hit number: ' + str(args.min_hit_num) + '\n')
        fw.write('#\n')
        fw.write('# filename query_accession,hit_accession_1,hit_accession_2, ...\n\n')
        fw.flush()

        gi = re.compile(r'gi\|(\d+)\|')
        for blast_record in blast_records:
            total_query_num += 1
            if len(blast_record.alignments) < args.min_hit_num:
                continue
            hit_accs = []
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    # If the query hit itself, ignore it.
                    if alignment.accession in blast_record.query:
                        continue
                    if hsp.expect <= args.ev_thresh:
                        # Check the match object before calling group(); calling
                        # group() on a failed match would raise AttributeError.
                        match = gi.match(alignment.hit_id)
                        if match is None:
                            print(alignment.accession + ' does not have gi.')
                            hit_accs.append(alignment.accession)
                        else:
                            hit_accs.append(match.group(1))
                        # Only the first qualifying HSP per alignment is counted.
                        break
            if len(hit_accs) >= args.min_hit_num:
                parsed_query_num += 1
                fw.write(blast_record.query + '\t' + blast_record.query + ',')
                fw.write(','.join(hit_accs) + '\n')
                fw.flush()

        fw.write('\n')
        fw.write('# Total queries: ' + str(total_query_num) + '\n')
        fw.write('# Parsed queries: ' + str(parsed_query_num) + '\n')
        fw.write('#\n')
        for i in proglog.end_message():
            fw.write(i)
        fw.flush()
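# A quick, self-contained check of what the gi pattern above extracts. The
# hit_id strings are illustrative NCBI-style identifiers, not real output.
if __name__ == '__main__':
    import re
    _gi = re.compile(r'gi\|(\d+)\|')
    # Old-style NCBI hit IDs carry a numeric gi, which the pattern captures.
    assert _gi.match('gi|12345|ref|NP_000001.1|').group(1) == '12345'
    # IDs without a gi prefix do not match; main() falls back to
    # alignment.accession in that case.
    assert _gi.match('ref|NP_000001.1|') is None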
import argparse
import configparser
import os
import re
import sys
from multiprocessing import Pool
from subprocess import PIPE, Popen

from Bio import SeqIO

# Project-local modules assumed importable alongside this script:
# logmsg, name (unique-identifier generator), and the do_parsing worker.


def main():
    proglog = logmsg.message(prog='commonfa', cmd=' '.join(sys.argv))

    parser = argparse.ArgumentParser(description='commonfa - Generate fasta files of sequences with common hit')
    parser.add_argument('-b', '--blastlist', dest='input_files_blastlist', nargs='*', required=True,
                        help='blastlist files (required)')
    parser.add_argument('-f', '--fasta', nargs='*', dest='input_files_fasta', required=True,
                        help='fasta files (required)')
    parser.add_argument('-o', '--output-directory', dest='output', default='commonfa_out_' + name.genid(),
                        help='output directory. If this option is not specified, the script will '
                             'generate one with a unique identifier in the current directory.')
    parser.add_argument('-p', '--process', dest='process_num', type=int, default=1,
                        help='number of worker processes (CPUs) to use')
    args = parser.parse_args()

    # configparser is the Python 3 module name (ConfigParser is Python 2 only).
    config = configparser.ConfigParser()
    config.read(os.path.dirname(os.path.abspath(__file__)) + '/config/group.cfg')

    if not os.path.exists(args.output.rstrip('/') + '/msainput'):
        os.makedirs(args.output.rstrip('/') + '/msainput')

    fwlog = open(args.output.rstrip('/') + '/commonfa.log', 'w')
    for i in proglog.start_message():
        fwlog.write(i)
    fwlog.flush()

    # Keep only 'ref' hits from the blastlist files, then sort by hit name
    # and the score columns.
    awk_cmd = "awk -F'\t' '$5 ~ /ref/ { print $0 }' " + ' '.join(args.input_files_blastlist)
    sort_cmd = "sort -t$'\t' -k5d,5 -k18g,18 -k22gr,22 -k19gr,19 -k26gr,26 -k6gr"
    with open(args.output.rstrip('/') + '/sort.temp', 'w') as fwsort:
        awk_proc = Popen(awk_cmd, stdout=PIPE, executable='/bin/bash', shell=True)
        sort_proc = Popen(sort_cmd, stdin=awk_proc.stdout, stdout=fwsort, executable='/bin/bash', shell=True)
        sort_proc.communicate()

    # Materialize the on-disk indexes into one plain dict so the records can
    # be passed to the worker processes.
    fasta = {}
    for filename in args.input_files_fasta:
        fasta.update(dict(SeqIO.index(filename, 'fasta')))

    susp_names = config.get('Susp', 'bdor').split(',')
    res_names = config.get('Res', 'bdor').split(',')
    rec_names = config.get('Rec', 'bdor').split(',')
    has_susp = has_res = has_rec = False

    commonhit = {}
    hitname = re.compile(r'.*gi\|\d*?\|(.*?)\|(.*?)\|.*')
    with open(args.output.rstrip('/') + '/sort.temp', 'r') as fin:
        for line in fin:
            data = line.split('\t')
            match = hitname.match(data[4])
            query_name = data[3]
            hit_name = match.group(2)
            query_frame = int(data[9])
            if hit_name in commonhit:
                # Record at most one query per group (Susp/Res/Rec) for each hit.
                if any(i in query_name for i in susp_names):
                    if has_susp:
                        continue
                    has_susp = True
                if any(i in query_name for i in res_names):
                    if has_res:
                        continue
                    has_res = True
                if any(i in query_name for i in rec_names):
                    if has_rec:
                        continue
                    has_rec = True
                commonhit[hit_name].append((query_name, query_frame))
            else:
                commonhit[hit_name] = [(query_name, query_frame)]
                has_susp = has_res = has_rec = False
                if any(i in query_name for i in susp_names):
                    has_susp = True
                if any(i in query_name for i in res_names):
                    has_res = True
                if any(i in query_name for i in rec_names):
                    has_rec = True

    # Keep only the hits shared by every input blastlist file.
    tasks = []
    parsed_num = 0
    for hit in commonhit:
        if len(commonhit[hit]) == len(args.input_files_blastlist):
            tasks.append((hit, commonhit[hit], fasta, args))
            parsed_num += 1

    pool = Pool(processes=args.process_num)
    pool.map(do_parsing, tasks)

    fwlog.write('# Parsed hits: ' + str(parsed_num) + '\n')
    for i in proglog.end_message():
        fwlog.write(i)
    fwlog.flush()
    fwlog.close()
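# The config/group.cfg file read above is assumed to be a standard INI file
# along these lines. The section names and the 'bdor' option come from the
# config.get() calls; the comma-separated strain name lists are hypothetical:
#
#   [Susp]
#   bdor = susp_strain1,susp_strain2
#
#   [Res]
#   bdor = res_strain1
#
#   [Rec]
#   bdor = rec_strain1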