Example #1
import sys

import ffindex


def main():
    input_database_basename = sys.argv[1]
    output_database_basename = sys.argv[2]

    # Load the packed payload (.ffdata) and its index of (name, offset, length)
    # records (.ffindex).
    input_data = ffindex.read_data(input_database_basename + ".ffdata")
    input_index = ffindex.read_index(input_database_basename + ".ffindex")

    fh = open(output_database_basename + ".cs219", "wb")

    total_length = 0
    nr_sequences = len(input_index)
    # Byte value of "\n", used to locate the end of each entry's first line.
    line_break = bytearray("\n", "utf-8")[0]

    for entry in input_index:
        entry_data = ffindex.read_entry_data(entry, input_data)
        # Drop everything up to and including the first newline (the entry's
        # original header line) and keep only the payload.
        for i in range(len(entry_data)):
            if entry_data[i] == line_break:
                entry_data = entry_data[(i + 1):]
                break
        total_length += len(entry_data)
        # Re-emit the payload under a FASTA-style header carrying the entry name.
        fh.write(bytearray(">" + entry.name + "\n", "utf-8"))
        fh.write(entry_data)

    fh.close()

    # Record "<number_of_sequences> <total_payload_length>" alongside the
    # .cs219 file.
    fh = open(output_database_basename + ".cs219.sizes", "w")
    fh.write(str(nr_sequences) + " " + str(total_length))
    fh.close()
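
A minimal driver to run the function above as a script (a sketch; the __main__ guard is an assumption and is not part of the original example):

if __name__ == "__main__":
    # Usage sketch: python <script.py> <input_db_basename> <output_db_basename>
    # Reads <input>.ffdata/<input>.ffindex and writes <output>.cs219 plus
    # <output>.cs219.sizes.
    main()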
Example #2
import os
import shutil
import sys
import tempfile

import a3m
import ffindex


def check_a3m_format(db_basename, force_mode):
  entries = ffindex.read_index(db_basename+"_a3m.ffindex")
  data = ffindex.read_data(db_basename+"_a3m.ffdata")

  # Collect the names of all alignments that fail to parse as A3M.
  corrupted_alignments = set()
  for entry in entries:
    lines = ffindex.read_lines(entry, data)
    alignment = a3m.A3M_Container()
    try:
      alignment.read_a3m_from_lines(lines)
    except Exception:
      corrupted_alignments.add(entry.name)
      sys.stderr.write("Warning: A3M "+entry.name+" is corrupted!\n")
  
  if len(corrupted_alignments) == 0:
    return
  
  if force_mode:
    tmp_dir = tempfile.mkdtemp()
    
    try:
      sys.stderr.write("WARNING: remove corrupted a3m's!\n")
      
      corrupted_index_file = os.path.join(tmp_dir, "corrupted.dat")
      write_set_to_file(corrupted_alignments, corrupted_index_file)
      
      for suffix in ["a3m", "cs219", "hhm"]:
        remove_files_from_index(corrupted_index_file, db_basename+"_"+suffix+".ffindex")
        sort_database(db_basename+"_"+suffix+".ffdata", db_basename+"_"+suffix+".ffindex")
        optimize_database(db_basename+"_"+suffix+".ffdata", db_basename+"_"+suffix+".ffindex")
    finally:
      shutil.rmtree(tmp_dir)
  else:
    sys.stderr.write("You may try to use the option --force to fix the database!\n")
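
write_set_to_file, remove_files_from_index, sort_database and optimize_database are defined elsewhere in the script. A plausible sketch of the first helper, writing one corrupted entry name per line (hypothetical implementation, not the original code):

def write_set_to_file(names, path):
    # Hypothetical helper: dump one FFindex entry name per line.
    with open(path, "w") as fh:
        for name in sorted(names):
            fh.write(name + "\n")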
Example #3
def main():
    import sys

    # arg(), read_data(), read_index(), extract_data() and write_results()
    # are helpers defined elsewhere in the same script.
    parser = arg()
    args = parser.parse_args(sys.argv[1:])

    data = read_data(args.data)
    index = read_index(args.index)

    dataset = extract_data(data, index, args.ev, args.cov, args.sim, args.max_len)
    write_results(dataset, args.out_file)
Example #5
import sys

import ffindex


def main():
    # Expects the .ffdata path as argv[1] and the .ffindex path as argv[2];
    # read_alignment_matrices() is a helper defined elsewhere in the script.
    data = ffindex.read_data(sys.argv[1])
    entries = ffindex.read_index(sys.argv[2])
    # Inspect a single hard-coded entry by name.
    e = ffindex.get_entry_by_name('BAPNUNABA.a3m', entries)
    for entry in [e]:
        print(entry.name)
        if (entry.length == 1):
            print("skip: " + entry.name)
            continue
        entry_data = ffindex.read_entry_data(entry, data)
        alis = read_alignment_matrices(entry.length, entry_data)
        for ali in alis.alignments:
            print(ali.alignment_start_matrix)
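
The loop above inspects a single hard-coded entry. A sketch of the same idea applied to every entry in the database (same assumption that a read_alignment_matrices helper is available):

def dump_all_start_matrices(ffdata_path, ffindex_path):
    # Sketch: iterate the whole index instead of one hard-coded name.
    data = ffindex.read_data(ffdata_path)
    entries = ffindex.read_index(ffindex_path)
    for entry in entries:
        if entry.length == 1:
            continue  # placeholder entry with no payload
        entry_data = ffindex.read_entry_data(entry, data)
        alis = read_alignment_matrices(entry.length, entry_data)
        for ali in alis.alignments:
            print(entry.name, ali.alignment_start_matrix)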
Example #7
import sys

import a3m
import ffindex


def get_large_a3ms(a3m_base_path):
  entries = ffindex.read_index(a3m_base_path+"_a3m.ffindex")
  # The packed alignments live in the .ffdata file; the .ffindex only lists
  # (name, offset, length) records.
  data = ffindex.read_data(a3m_base_path+"_a3m.ffdata")

  # Collect the names of all alignments with more than 50 sequences.
  large_alignments = set()
  for entry in entries:
    lines = ffindex.read_lines(entry, data)
    alignment = a3m.A3M_Container()
    try:
      alignment.read_a3m_from_lines(lines)

      if alignment.get_number_sequences() > 50:
        large_alignments.add(entry.name)
    except Exception:
      sys.stderr.write("Warning: A3M "+entry.name+" is corrupted!\n")

  return large_alignments
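
A possible command-line wrapper around get_large_a3ms (a sketch; the __main__ guard and the output-path argument are assumptions):

if __name__ == "__main__":
    # Usage sketch: python <script.py> <db_basename> <out_list>
    names = get_large_a3ms(sys.argv[1])
    with open(sys.argv[2], "w") as fh:
        for name in sorted(names):
            fh.write(name + "\n")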
Example #10
#!/usr/bin/env python

import ffindex
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

entries = ffindex.read_index(input_file + ".ffindex")
data = ffindex.read_data(input_file + ".ffdata")

# Each entry's payload is a single ASCII integer; write out the names of all
# entries whose value is below 51.
with open(output_file, "w") as fh:
    for entry in entries:
        size = int(ffindex.read_entry_data(entry, data).decode("utf-8"))
        if size < 51:
            fh.write(entry.name + "\n")
Example #11
import os
import tempfile
from multiprocessing import Pool


def main():
    import sys

    # arg(), read_index(), read_data(), read_entry_data(),
    # read_alignment_matrices(), read_predictions(), wrapper(), write_results(),
    # read_graph() and AlignmentMatrix are helpers defined elsewhere in the
    # same script.
    parser = arg()
    args = parser.parse_args(sys.argv[1:])

    # Temporary working directory plus a path for the intermediate graph file.
    tmp_dir = tempfile.mkdtemp()
    tmp = os.path.join(tmp_dir, tmp_dir.split("/")[-1])
    tmp_graph = tmp + ".graph"

    # Debug flag; when --verbose is set, blocks are processed sequentially below.
    DEBUG = args.verbose

    entries = read_index(args.mat_index)
    data = read_data(args.mat_data)
    
    # Load predictions keyed by sequence name.
    predictions = read_predictions(args.predictions)

    # Walk the index in blocks; each block becomes a work list for the pool.
    block_size = args.block
    current_block = 1
    pool = Pool(args.cpu)

    for block_start in range(0, len(entries), block_size):
        block_end = min(len(entries), block_start + block_size)
        block = entries[block_start:block_end]

        print ("Processing Block {cb} ({bs}-{be})".format(cb=current_block, bs=block_start, be=block_end))

        plist = list()
      
        for entry in block:
            query = entry.name.split(".")[0]

            if entry.length == 1:
                print("skip: " + entry.name)
                continue

            entry_data = read_entry_data(entry, data)
            alis = read_alignment_matrices(entry.length, entry_data)

            for ali in alis.alignments:
                template = ali.template
                #print (query + '->' + template)
                
                if query == template:
                    print("Skipping {0} (self alignment).".format(template))
                    break

                prob = ali.alignment_probability / float(100)

                if prob < args.p_low:
                    if DEBUG:
                        print("Not adding {q} - {t}. Prob(query, template) < P_low.".format(q=query, t=template))
                    break
                
                #if query == 'corrupt_mat' and template =='WEWKIWABA':
                #    import pdb; pdb.set_trace()
               

                pred = dict()
                pred[query] = predictions[query]
                pred[template] = predictions[template]
                
                # Check that none of the alignment matrices is empty before queueing work.
                if (len(ali.alignment_start_matrix) == 0) or (len(ali.alignment_end_matrix) == 0) or (len(ali.alignment_posterior_matrix) == 0):
                    print("Warning: detected empty matrices! Skipping " + query + "->" + template)
                else:
                    plist.append((AlignmentMatrix(query, template, prob, pred, ali.alignment_end_matrix, ali.alignment_start_matrix, ali.alignment_posterior_matrix), args.p_low, args.cov))
        
        # With --verbose, process the block sequentially (easier to debug);
        # otherwise distribute it over the worker pool.
        if args.verbose:
            processed_data = list()
            for process in plist:
                chunk = wrapper(process)
                processed_data.append(chunk)
        else:
            processed_data = pool.map(wrapper, plist)

        results = list(filter(lambda x: x is not None, processed_data))
        
        mode = "a" if block_start > 0 else "w"
        print ("Writing Block {cb}.".format(cb=current_block))
        write_results(tmp_graph, results, mode)
        current_block += 1

    graph = read_graph(tmp_graph)

    # Write the final edge list: initial node, terminal node and edge
    # probability, tab-separated, one edge per line.
    with open(args.output, 'w') as out_fh:
        for initial_node in graph:
            for terminal_node in graph[initial_node]:
                out_fh.write('{inn}\t{on}\t{prob}\n'.format(
                    inn=initial_node, on=terminal_node,
                    prob=graph[initial_node][terminal_node]))
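
The final output is a plain tab-separated edge list. A small sketch for loading it back into the same nested-dict shape used for the graph (assumes exactly the format written by the loop above):

def load_edge_list(path):
    # Rebuild {initial_node: {terminal_node: probability}} from the
    # tab-separated file written by main().
    graph = {}
    with open(path) as fh:
        for line in fh:
            initial_node, terminal_node, prob = line.rstrip("\n").split("\t")
            graph.setdefault(initial_node, {})[terminal_node] = float(prob)
    return graph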