def generate_subfeatures(parent, window_size, other, protein=False):
    """Yield one ``match_part`` feature per window of the parent alignment.

    The parent's aligned sequence is walked in steps of ``window_size``
    columns. Each window is scored against the same columns of ``other``
    with ``percent_identity``; windows scoring exactly 0 are skipped.

    Args:
        parent: Mapping read for its ``"seq"`` (aligned sequence string) and
            ``"start"`` (a negative value selects strand -1).
        window_size: Number of alignment columns per window.
        other: Mapping read for its ``"seq"``, sliced with the same column
            offsets as the parent.
        protein: Accepted but not used by this function.

    Yields:
        ``SeqFeature`` objects of type ``match_part`` whose qualifiers carry
        the score (``score``) and both window sequences (``qseq``/``sseq``).
    """
    log.debug("Generating subfeatures for %s %s %s", parent, window_size, other)

    # Trailing gap characters are removed so blocks do not run past the end
    # of the parent (which would also violate the gff3 spec). A stop codon is
    # then appended: it never matches (slightly lowering %ident) but pads the
    # final block to the same length as the others, which looks nicer.
    trimmed = parent["seq"].rstrip("-") + "*"

    # Multiplier used to turn alignment columns into coordinates.
    nucl2prot = 3

    for offset in range(0, len(trimmed), window_size):
        window = trimmed[offset : offset + window_size]
        window_len = len(window)
        anchor = abs(parent["start"])

        # Gap characters before this window do not consume any of the parent
        # sequence, so they are subtracted from the column offset to find the
        # "real" start of the window before scaling.
        gaps_before = trimmed[0:offset].count("-")
        shift = nucl2prot * (offset - gaps_before)

        log.debug(
            " I: %s, BS: %s, RWS: %s, RS: %s, IS: %s, C: %s",
            offset,
            window,
            window_len,
            anchor,
            shift,
            gaps_before,
        )

        strand = -1 if parent["start"] < 0 else 1

        other_window = other["seq"][offset : offset + window_len]
        identity = percent_identity(window, other_window)
        # Windows with 0% identity are not reported.
        if identity == 0:
            continue

        begin = anchor + shift
        location = FeatureLocation(begin, begin + nucl2prot * window_len)
        yield SeqFeature(
            location,
            type="match_part",
            strand=strand,
            qualifiers={
                "source": "progressiveMauve",
                "score": identity,
                "qseq": window,
                "sseq": other_window,
            },
        )
def convert_xmfa_to_gff3(xmfa_file, fasta_genomes, window_size=3, relative_to="1"):
    """Write per-position identity tracks for every genome against a reference.

    For each LCB from ``parse_xmfa`` that contains the ``relative_to``
    sequence and at least one other sequence, gaps are removed with
    ``remove_gaps`` and a sliding-window percent identity is computed at
    each position. Each value is written as a ``position<TAB>identity`` line
    to the temp handle of the genome being compared. After all LCBs are
    processed, every non-reference temp file is closed and handed to
    ``convert_to_bigwig``, targeting ``out/<record_id>.bigwig``.

    Args:
        xmfa_file: Passed straight to ``parse_xmfa``.
        fasta_genomes: Passed straight to ``_id_tn_dict``; the resulting
            entries are read for ``"temp"``, ``"record_id"`` and ``"len"``.
        window_size: Half-width of the identity window; the window covers
            ``[i - window_size, i + window_size)``, clamped at 0 on the left.
        relative_to: Sequence id used as the reference (parent) genome.

    Raises:
        OSError: If the ``out`` directory cannot be created and does not
            already exist.
    """
    label_convert = _id_tn_dict(fasta_genomes)

    # Only "directory already exists" is tolerated; any other failure
    # (permissions, a regular file named "out", ...) is surfaced here rather
    # than later when the bigwig files are written.
    try:
        os.makedirs("out")
    except OSError:
        if not os.path.isdir("out"):
            raise

    for lcb in parse_xmfa(xmfa_file):
        ids = [seq["id"] for seq in lcb]

        # Doesn't match part of our sequence
        if relative_to not in ids:
            continue

        # Skip sequences that are JUST our "relative_to" genome
        if len(ids) == 1:
            continue

        parent = next(seq for seq in lcb if seq["id"] == relative_to)
        others = [seq for seq in lcb if seq["id"] != relative_to]

        if parent["start"] == 0 and parent["end"] == 0:
            continue

        corrected_parent, corrected_targets = remove_gaps(
            parent["seq"], [other["seq"] for other in others]
        )
        # Update the parent/others with corrected sequences
        parent["corrected"] = corrected_parent
        for other, target in zip(others, corrected_targets):
            other["corrected"] = target

        parent_start = abs(parent["start"])
        # The first and last corrected positions are not scored.
        for i in range(1, len(corrected_parent) - 1):
            # The window depends only on the position, so it is computed once
            # and reused for every target.
            left_bound = max(0, i - window_size)
            right_bound = i + window_size
            parent_window = parent["corrected"][left_bound:right_bound]

            for other in others:
                point_pid = percent_identity(
                    parent_window,
                    other["corrected"][left_bound:right_bound],
                )
                label_convert[other["id"]]["temp"].write(
                    "%s\t%s\n" % (parent_start + i, point_pid)
                )

    for key, other in label_convert.items():
        # Ignore self-self
        if key == relative_to:
            continue

        # Close the handle before passing the file by name to the converter.
        other["temp"].close()
        sizes = [
            (label_convert[relative_to]["record_id"], label_convert[relative_to]["len"])
        ]
        bw_file = os.path.join("out", secure_filename(other["record_id"] + ".bigwig"))
        convert_to_bigwig(other["temp"].name, sizes, bw_file)
def total_similarity(xmfa_file, sequences=None, dice=False):
    """Print a tab-separated matrix of whole-genome similarity to stdout.

    For every ordered pair of sequences within each LCB, the value returned
    by ``percent_identity`` is weighted by the first sequence's span in that
    LCB (``end - start + 1``) and accumulated per pair of display names.
    The totals are then normalised, and the diagonal is forced to 100.

    Args:
        xmfa_file: Passed straight to ``parse_xmfa``.
        sequences: Passed straight to ``_id_tn_dict``; the resulting entries
            are read for ``"id"`` (display name) and ``"len"``. Must be
            non-empty.
        dice: When true, normalise by ``(len_i + len_j) / 2`` (i.e. multiply
            by 2 and divide by the summed lengths); otherwise divide by the
            length of the first sequence of the pair only.

    Raises:
        ValueError: If ``sequences`` is ``None`` or empty.
    """
    if not sequences:
        raise ValueError("Must provide a non-zero number of sequence files")

    label_convert = _id_tn_dict(sequences)
    lcbs = parse_xmfa(xmfa_file)

    # Accumulated, length-weighted similarity keyed by (name_i, name_j).
    table = {}

    for lcb in lcbs:
        # ignore LCBs containing fewer than two sequences: there is no pair
        # to compare
        if len(lcb) < 2:
            continue

        # every ordered pair of sequences in the current LCB
        for i, j in itertools.permutations(range(len(lcb)), 2):
            similarity = percent_identity(lcb[i]['seq'], lcb[j]['seq'])
            pair = (
                label_convert[lcb[i]['id']]['id'],
                label_convert[lcb[j]['id']]['id'],
            )
            # span covered by the first sequence of the pair in this LCB
            length_seq_lcb = lcb[i]['end'] - (lcb[i]['start'] - 1)
            table[pair] = table.get(pair, 0) + length_seq_lcb * similarity

    # finalize total percent similarity by dividing by sequence length
    for i_info in label_convert.values():
        for j_info in label_convert.values():
            i_name = i_info['id']
            j_name = j_info['id']
            pair = (i_name, j_name)
            if pair in table:
                if dice:
                    table[pair] = 2 * table[pair] / (i_info['len'] + j_info['len'])
                else:
                    table[pair] = table[pair] / i_info['len']
            else:
                # pairs that never shared an LCB
                table[pair] = 0

            if i_name == j_name:
                table[pair] = 100

    # print table: header row, then one row per sequence, ordered by sorted
    # label_convert key
    names = [label_convert[key]['id'] for key in sorted(label_convert.keys())]
    sys.stdout.write('\t' + '\t'.join(names) + '\n')
    for row_name in names:
        # the cell in this row under column c holds table[(c, row_name)]
        cells = ''.join('\t%0.2f' % table[(col_name, row_name)] for col_name in names)
        sys.stdout.write(row_name + cells + '\n')