def main():
    """Flatten an ffindex cs219 database into a FASTA-like ``.cs219`` file.

    Command line: ``<input_database_basename> <output_database_basename>``.
    Also writes ``<output>.cs219.sizes`` containing
    ``"<number of sequences> <total payload length>"``.
    """
    input_database_basename = sys.argv[1]
    output_database_basename = sys.argv[2]

    input_data = ffindex.read_data(input_database_basename + ".ffdata")
    input_index = ffindex.read_index(input_database_basename + ".ffindex")

    total_length = 0
    nr_sequences = len(input_index)
    line_break = bytearray("\n", "utf-8")[0]  # byte value of "\n"

    # FIX: use "with" so the handle is closed even if an entry write raises;
    # the original leaked the descriptor on any exception.
    with open(output_database_basename + ".cs219", "wb") as fh:
        for entry in input_index:
            entry_data = ffindex.read_entry_data(entry, input_data)
            # Drop everything up to and including the first newline
            # (presumably the entry's own header line — TODO confirm).
            for i in range(len(entry_data)):
                if entry_data[i] == line_break:
                    entry_data = entry_data[(i + 1):]
                    break
            total_length += len(entry_data)
            fh.write(bytearray(">" + entry.name + "\n", "utf-8"))
            fh.write(entry_data)

    # Companion sizes file: "<sequence count> <total length>".
    with open(output_database_basename + ".cs219.sizes", "w") as fh:
        fh.write(str(nr_sequences) + " " + str(total_length))
def check_a3m_format(db_basename, force_mode):
    """Validate every a3m entry of the database; optionally purge bad ones.

    Parses each alignment in ``<db_basename>_a3m``; entries that fail to
    parse are reported on stderr. With ``force_mode`` the corrupted entries
    are removed from the a3m/cs219/hhm indexes and those databases are
    re-sorted and optimized; otherwise the user is told about ``--force``.
    """
    entries = ffindex.read_index(db_basename + "_a3m.ffindex")
    data = ffindex.read_data(db_basename + "_a3m.ffdata")

    corrupted_alignments = set()
    for entry in entries:
        lines = ffindex.read_lines(entry, data)
        alignment = a3m.A3M_Container()
        try:
            alignment.read_a3m_from_lines(lines)
        # FIX: narrowed from a bare "except:" which also swallowed
        # KeyboardInterrupt/SystemExit.
        except Exception:
            corrupted_alignments.add(entry.name)
            sys.stderr.write("Warning: A3M " + entry.name + " is corrupted!\n")

    if len(corrupted_alignments) == 0:
        return

    if force_mode:
        tmp_dir = tempfile.mkdtemp()
        try:
            sys.stderr.write("WARNING: remove corrupted a3m's!\n")
            # Write the bad names to a scratch file consumed by the
            # index-manipulation helpers below.
            corrupted_index_file = os.path.join(tmp_dir, "corrupted.dat")
            write_set_to_file(corrupted_alignments, corrupted_index_file)
            for suffix in ["a3m", "cs219", "hhm"]:
                remove_files_from_index(corrupted_index_file,
                                        db_basename + "_" + suffix + ".ffindex")
                sort_database(db_basename + "_" + suffix + ".ffdata",
                              db_basename + "_" + suffix + ".ffindex")
                optimize_database(db_basename + "_" + suffix + ".ffdata",
                                  db_basename + "_" + suffix + ".ffindex")
        finally:
            # Always remove the scratch directory, even on failure.
            shutil.rmtree(tmp_dir)
    else:
        sys.stderr.write("You may try to use the option --force to fix the database!\n")
def main():
    """CLI entry point: filter an ffindex dataset and write the result."""
    import sys

    args = arg().parse_args(sys.argv[1:])

    ffdata = read_data(args.data)
    ffentries = read_index(args.index)

    # Filtering thresholds come straight from the command line.
    filtered = extract_data(ffdata, ffentries,
                            args.ev, args.cov, args.sim, args.max_len)
    write_results(filtered, args.out_file)
def main():
    """Debug helper: dump the alignment start matrices of 'BAPNUNABA.a3m'."""
    payload = ffindex.read_data(sys.argv[1])
    index_entries = ffindex.read_index(sys.argv[2])
    target = ffindex.get_entry_by_name('BAPNUNABA.a3m', index_entries)

    print(target.name)
    if target.length == 1:
        # Length-1 entries carry no usable data; report and stop.
        print("skip: " + target.name)
        return

    raw = ffindex.read_entry_data(target, payload)
    matrices = read_alignment_matrices(target.length, raw)
    for alignment in matrices.alignments:
        print(alignment.alignment_start_matrix)
def main():
    """Print alignment start matrices stored for entry 'BAPNUNABA.a3m'."""
    data = ffindex.read_data(sys.argv[1])
    entries = ffindex.read_index(sys.argv[2])
    selected = [ffindex.get_entry_by_name('BAPNUNABA.a3m', entries)]

    for entry in selected:
        print(entry.name)
        if entry.length != 1:
            body = ffindex.read_entry_data(entry, data)
            parsed = read_alignment_matrices(entry.length, body)
            for ali in parsed.alignments:
                print(ali.alignment_start_matrix)
        else:
            # Nothing to parse in a length-1 entry.
            print("skip: " + entry.name)
def get_large_a3ms(a3m_base_path):
    """Return the set of a3m entry names with more than 50 sequences.

    Entries that fail to parse are reported on stderr and skipped.
    """
    entries = ffindex.read_index(a3m_base_path + "_a3m.ffindex")
    # FIX: the payload must come from the .ffdata file; the original read
    # the .ffindex file a second time, so every entry decoded garbage
    # (compare check_a3m_format, which reads .ffdata here).
    data = ffindex.read_data(a3m_base_path + "_a3m.ffdata")

    large_alignments = set()
    for entry in entries:
        lines = ffindex.read_lines(entry, data)
        alignment = a3m.A3M_Container()
        try:
            alignment.read_a3m_from_lines(lines)
            if alignment.get_number_sequences() > 50:
                large_alignments.add(entry.name)
        # FIX: narrowed from a bare "except:".
        except Exception:
            sys.stderr.write("Warning: A3M " + entry.name + " is corrupted!\n")
    return large_alignments
def get_large_a3ms(a3m_base_path):
    """Collect the names of all a3m entries holding more than 50 sequences.

    Corrupted entries are logged to stderr and excluded from the result.
    """
    entries = ffindex.read_index(a3m_base_path + "_a3m.ffindex")
    # FIX: read the data payload from .ffdata — the original passed the
    # .ffindex path to read_data, mirroring a copy-paste bug, so entry
    # offsets pointed into the wrong file.
    data = ffindex.read_data(a3m_base_path + "_a3m.ffdata")

    large_alignments = set()
    for entry in entries:
        lines = ffindex.read_lines(entry, data)
        alignment = a3m.A3M_Container()
        try:
            alignment.read_a3m_from_lines(lines)
            if alignment.get_number_sequences() > 50:
                large_alignments.add(entry.name)
        # FIX: narrowed from a bare "except:".
        except Exception:
            sys.stderr.write("Warning: A3M " + entry.name + " is corrupted!\n")
    return large_alignments
def check_a3m_format(db_basename, force_mode):
    """Check all a3m entries for parse errors; optionally repair the DB.

    Every entry of ``<db_basename>_a3m`` is parsed. If corrupted entries
    exist and ``force_mode`` is set, they are stripped from the a3m, cs219
    and hhm indexes, which are then re-sorted and optimized. Without
    ``force_mode`` the function only suggests ``--force``.
    """
    entries = ffindex.read_index(db_basename + "_a3m.ffindex")
    data = ffindex.read_data(db_basename + "_a3m.ffdata")

    corrupted_alignments = set()
    for entry in entries:
        lines = ffindex.read_lines(entry, data)
        alignment = a3m.A3M_Container()
        try:
            alignment.read_a3m_from_lines(lines)
        # FIX: narrowed from a bare "except:" so KeyboardInterrupt and
        # SystemExit still propagate.
        except Exception:
            corrupted_alignments.add(entry.name)
            sys.stderr.write("Warning: A3M " + entry.name + " is corrupted!\n")

    if len(corrupted_alignments) == 0:
        return

    if force_mode:
        tmp_dir = tempfile.mkdtemp()
        try:
            sys.stderr.write("WARNING: remove corrupted a3m's!\n")
            corrupted_index_file = os.path.join(tmp_dir, "corrupted.dat")
            write_set_to_file(corrupted_alignments, corrupted_index_file)
            for suffix in ["a3m", "cs219", "hhm"]:
                remove_files_from_index(
                    corrupted_index_file,
                    db_basename + "_" + suffix + ".ffindex")
                sort_database(db_basename + "_" + suffix + ".ffdata",
                              db_basename + "_" + suffix + ".ffindex")
                optimize_database(db_basename + "_" + suffix + ".ffdata",
                                  db_basename + "_" + suffix + ".ffindex")
        finally:
            # Scratch directory is always removed, even when a helper fails.
            shutil.rmtree(tmp_dir)
    else:
        sys.stderr.write(
            "You may try to use the option --force to fix the database!\n")
#!/usr/bin/env python
"""Write the names of ffindex entries whose stored integer is below 51."""
import ffindex
import sys

db_basename = sys.argv[1]
listing_path = sys.argv[2]

index_entries = ffindex.read_index(db_basename + ".ffindex")
payload = ffindex.read_data(db_basename + ".ffdata")

with open(listing_path, "w") as out:
    for item in index_entries:
        # Each entry's data is a UTF-8 encoded integer (a size/count).
        value = int(ffindex.read_entry_data(item, payload).decode("utf-8"))
        if value < 51:
            out.write(item.name + "\n")
def main():
    """Build a pairwise alignment graph from an ffindex of alignment matrices.

    Processes the matrix entries block-wise: filters out self-alignments and
    low-probability pairs, scores the remaining pairs via ``wrapper`` (in a
    worker pool, or serially with --verbose), appends per-block results to a
    temporary graph file, and finally writes the merged graph as
    ``<query>\\t<template>\\t<probability>`` lines to ``args.output``.
    """
    import sys
    parser = arg()
    args = parser.parse_args(sys.argv[1:])

    # Scratch file that accumulates the graph block by block.
    tmp_dir = tempfile.mkdtemp()
    tmp = os.path.join(tmp_dir, tmp_dir.split("/")[-1])
    tmp_graph = tmp + ".graph"

    # FIX: DEBUG was assigned only under --verbose, which made it local to
    # main() everywhere; the "if DEBUG" below then raised UnboundLocalError
    # whenever --verbose was absent and a low-probability pair was hit.
    DEBUG = False
    if args.verbose:
        DEBUG = True

    entries = read_index(args.mat_index)
    data = read_data(args.mat_data)

    # Load secondary-structure (or similar) predictions keyed by entry name.
    predictions = read_predictions(args.predictions)

    block_size = args.block
    current_block = 1
    pool = Pool(args.cpu)
    for block_start in range(0, len(entries), block_size):
        block_end = min(len(entries), block_start + block_size)
        block = entries[block_start:block_end]
        print("Processing Block {cb} ({bs}-{be})".format(cb=current_block, bs=block_start, be=block_end))

        # Build the argument list for the (possibly parallel) map below.
        plist = list()
        for entry in block:
            query = entry.name.split(".")[0]
            if entry.length == 1:
                # Length-1 entries carry no alignment data.
                print("skip: " + entry.name)
                continue
            entry_data = read_entry_data(entry, data)
            alis = read_alignment_matrices(entry.length, entry_data)
            for ali in alis.alignments:
                template = ali.template
                if query == template:
                    print("Skipping {0} (Self alignmnent).".format(template))
                    break
                prob = ali.alignment_probability / float(100)
                if prob < args.p_low:
                    # NOTE(review): the break assumes alignments are ordered
                    # by descending probability — confirm against the writer.
                    if DEBUG:
                        print("Not adding {q} - {t}. Prob(query, template) < P_low.".format(q=query, t=template))
                    break
                pred = dict()
                pred[query] = predictions[query]
                pred[template] = predictions[template]
                # check whether all matrices contain values
                if (len(ali.alignment_start_matrix) == 0) or (len(ali.alignment_end_matrix) == 0) or (len(ali.alignment_posterior_matrix) == 0):
                    print('! Warning detected empty matrices! Skipping ' + query + '->' + template)
                else:
                    plist.append((AlignmentMatrix(query, template, prob, pred,
                                                  ali.alignment_end_matrix,
                                                  ali.alignment_start_matrix,
                                                  ali.alignment_posterior_matrix),
                                  args.p_low, args.cov))

        if args.verbose:
            # Serial execution keeps tracebacks readable while debugging.
            processed_data = list()
            for process in plist:
                chunck = wrapper(process)
                processed_data.append(chunck)
        else:
            processed_data = pool.map(wrapper, plist)

        results = list(filter(lambda x: x != None, processed_data))
        mode = "a" if block_start > 0 else "w"  # first block creates the file
        print("Writing Block {cb}.".format(cb=current_block))
        write_results(tmp_graph, results, mode)
        current_block += 1

    # Merge the accumulated per-block graph into the final output file.
    graph = read_graph(tmp_graph)
    with open(args.output, 'w') as out_fh:
        for initial_node in graph:
            for terminal_node in graph[initial_node]:
                out_fh.write('{inn}\t{on}\t{prob}\n'.format(inn=initial_node, on=terminal_node, prob=graph[initial_node][terminal_node]))