def resolve_overlapping_homologues(all_aligned_segs, lifted_feature_list, features_to_remap, unmapped_features,
                                   threshold, feature_hierarchy, feature_db, ref_parent_order, seq_id_threshold,
                                   feature_locations, distance_factor, max_overlap):
    """Repeatedly re-lift the features in ``features_to_remap`` until none remain.

    Each round fetches the alignments for the pending features (via
    ``remove_features_and_get_alignments``), lifts them again, collects the
    ones that were lifted successfully and passes those to
    ``check_homologues``, which returns the next set of features to remap
    together with updated feature locations.

    The number of rounds is capped at ten times the initial number of
    features to remap. Whatever is still pending afterwards is handed to
    ``remove_unresolved_features``.

    Returns:
        ``lifted_feature_list`` (the same object that was passed in).
    """
    round_limit = 10 * len(features_to_remap)
    for _ in range(round_limit):
        if len(features_to_remap) == 0:
            break
        segs_this_round = remove_features_and_get_alignments(
            features_to_remap, lifted_feature_list, all_aligned_segs)
        lift_features.lift_all_features(
            segs_this_round, threshold, feature_db, feature_hierarchy.parents, feature_hierarchy,
            unmapped_features, lifted_feature_list, seq_id_threshold, feature_locations,
            distance_factor, ref_parent_order)
        remapped_ok = get_successfully_remapped_features(lifted_feature_list, features_to_remap)
        # The locations returned here feed the next round's lift call.
        features_to_remap, feature_locations = check_homologues(
            lifted_feature_list, remapped_ok, feature_hierarchy.parents, ref_parent_order, max_overlap)
    remove_unresolved_features(features_to_remap, feature_hierarchy.parents, lifted_feature_list,
                               unmapped_features)
    return lifted_feature_list
def _polished_is_better(original_feature, polished_feature):
    """Decide whether the polished lift should replace the original one.

    The polished feature wins when it has no ``valid_ORFs`` attribute, or
    when its ``valid_ORFs`` count (as an int) is larger. When the
    ``valid_ORFs`` values are equal, it wins on a greater ``sequence_ID``
    or, failing that, a greater ``coverage``.
    """
    polished_attrs = polished_feature.attributes
    if 'valid_ORFs' not in polished_attrs:
        return True
    original_attrs = original_feature.attributes
    if int(polished_attrs['valid_ORFs'][0]) > int(original_attrs['valid_ORFs'][0]):
        return True
    if not (polished_attrs['valid_ORFs'][0] == original_attrs['valid_ORFs'][0]):
        return False
    # NOTE(review): these values are compared as stored, without numeric
    # conversion -- confirm they are numbers rather than strings.
    if polished_attrs['sequence_ID'][0] > original_attrs['sequence_ID'][0]:
        return True
    return polished_attrs['coverage'][0] > original_attrs['coverage'][0]


def find_and_polish_broken_cds(args, lifted_feature_list, feature_hierarchy, ref_chroms, target_chroms,
                               unmapped_features, feature_db, ref_parent_order,):
    """Polish lifted features and keep the polished version when it scores better.

    For every feature in ``lifted_feature_list`` for which
    ``polish.polish_annotations`` returns a truthy value, the feature is
    re-aligned and re-lifted into a separate dictionary. After ``check_cds``
    has been run on that dictionary, each polished feature is compared with
    its original and replaces it in ``lifted_feature_list`` when it is
    judged better.

    Side effects on ``args``: ``args.subcommand`` is set to ``"polish"`` and,
    whenever a feature is re-lifted, ``args.d`` is set to ``100000000``.
    """
    args.subcommand = "polish"
    polish_lifted_features = {}
    ref_fa = Fasta(args.reference)
    target_fa = Fasta(args.target)

    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-collapsed source; the re-lift is assumed to run only for
    # features that were polished -- confirm against the upstream file.
    for target_feature in lifted_feature_list:
        was_polished = polish.polish_annotations(lifted_feature_list, ref_fa, target_fa, args,
                                                 feature_hierarchy, target_feature)
        if not was_polished:
            continue
        realigned = align_features.align_features_to_target(ref_chroms, target_chroms, args,
                                                            feature_hierarchy, "chrm_by_chrm",
                                                            unmapped_features)
        # Only the first entry of the alignment result is used; its segments
        # are relabelled with the target feature's name.
        segments = list(realigned.values())[0]
        aligned_segments_new = {target_feature: segments}
        for seg in segments:
            seg.query_name = target_feature
        args.d = 100000000
        lift_features.lift_all_features(aligned_segments_new, args.a, feature_db, feature_hierarchy,
                                        unmapped_features, polish_lifted_features, args.s, None, args,
                                        ref_parent_order)

    check_cds(polish_lifted_features, feature_hierarchy, args)

    for feature in polish_lifted_features:
        if _polished_is_better(lifted_feature_list[feature][0], polish_lifted_features[feature][0]):
            lifted_feature_list[feature] = polish_lifted_features[feature]
def map_extra_copies(target_fasta, reference_fasta, ref_chroms, target_chroms, processes, lifted_feature_list,
                     parent_dict, children_dict, feature_db, intermediate_dict, parent_order, seq_threshold,
                     minimap2_path, inter_files, remap, max_alns):
    """Run the extract / align / lift / overlap-fix sequence with liftover type ``"copies"``.

    Scores on ``lifted_feature_list`` are cleared first. Lifting and overlap
    fixing both use a coverage threshold of ``0.0`` and the caller's
    ``seq_threshold``. Unmapped features are collected in a list local to
    this call and are not returned.

    Results are written into ``lifted_feature_list``; nothing is returned.
    """
    liftoff_utils.clear_scores(lifted_feature_list, parent_dict)

    mode = "copies"
    coverage_cutoff = 0.0
    not_lifted = []

    extract_features.get_gene_sequences(parent_dict, ref_chroms, reference_fasta, processes, inter_files, mode)
    alignments = align_features.align_features_to_target(
        ref_chroms, target_chroms, processes, target_fasta,
        parent_dict, children_dict, mode, not_lifted,
        reference_fasta, minimap2_path, inter_files, remap, max_alns)

    print("lifting features")
    lift_features.lift_all_features(
        alignments, {}, coverage_cutoff, feature_db,
        parent_dict, children_dict, intermediate_dict,
        not_lifted, lifted_feature_list, seq_threshold)
    # The lifted feature list is deliberately passed for both of the first
    # two arguments, as in the other liftover entry points in this file.
    fix_overlapping_features.fix_incorrectly_overlapping_features(
        lifted_feature_list, lifted_feature_list, parent_dict, alignments,
        not_lifted, coverage_cutoff, intermediate_dict, children_dict,
        feature_db, parent_order, seq_threshold)
def map_unplaced_genes(unmapped_features, target_fasta, reference_fasta, ref_chroms, target_chroms, processes,
                       lifted_feature_list, feature_db, parent_dict, intermediate_dict, children_dict,
                       parent_order, minimap2_path, inter_files, max_alns):
    """Run the extract / align / lift / overlap-fix sequence with liftover type ``"unplaced"``.

    Only parent features whose ``seqid`` is in ``ref_chroms`` are extracted,
    aligned and lifted; they are keyed by their ``id``. Coverage and
    sequence-identity thresholds are both ``0.0``.

    Results are written into ``lifted_feature_list`` and
    ``unmapped_features``; nothing is returned.
    """
    liftoff_utils.clear_scores(lifted_feature_list, parent_dict)

    mode = "unplaced"
    no_cutoff = 0.0
    unplaced_dict = {
        feature.id: feature
        for feature in parent_dict.values()
        if feature.seqid in ref_chroms
    }

    extract_features.get_gene_sequences(unplaced_dict, ref_chroms, reference_fasta, processes, inter_files, mode)
    alignments = align_features.align_features_to_target(
        ref_chroms, target_chroms, processes, target_fasta,
        unplaced_dict, children_dict, mode, unmapped_features,
        reference_fasta, minimap2_path, inter_files, True, max_alns)

    print("lifting features")
    lift_features.lift_all_features(
        alignments, {}, no_cutoff, feature_db,
        unplaced_dict, children_dict, intermediate_dict,
        unmapped_features, lifted_feature_list, no_cutoff)
    # Overlap fixing receives the full parent_dict, not the filtered subset.
    fix_overlapping_features.fix_incorrectly_overlapping_features(
        lifted_feature_list, lifted_feature_list, parent_dict, alignments,
        unmapped_features, no_cutoff, intermediate_dict, children_dict,
        feature_db, parent_order, no_cutoff)
def lift_original_annotation(gff, target_fasta, reference_fasta, ref_chroms, target_chroms, processes, db,
                             lifted_feature_list, unmapped_features, infer_transcripts, infer_genes,
                             cov_threshold, seq_threshold, minimap2_path, inter_files, max_alns,
                             parents_to_lift):
    """Extract features from the annotation, align them, lift them and fix overlaps.

    Uses liftover type ``"chrm_by_chrm"``. When the first entry of
    ``target_chroms`` equals ``target_fasta``, both the coverage and the
    sequence-identity thresholds are forced to ``0``.

    Results are written into ``lifted_feature_list`` and
    ``unmapped_features``.

    Returns:
        A tuple ``(feature_db, parent_dict, intermediate_dict,
        children_dict, original_parent_order)`` as produced by
        ``extract_features.extract_features_to_lift``.
    """
    mode = "chrm_by_chrm"
    if target_chroms[0] == target_fasta:
        cov_threshold = 0
        seq_threshold = 0

    extracted = extract_features.extract_features_to_lift(
        gff, db, ref_chroms, reference_fasta, processes, infer_transcripts,
        infer_genes, inter_files, mode, parents_to_lift)
    parents, children, intermediates, feature_db, parent_order = extracted

    alignments = align_features.align_features_to_target(
        ref_chroms, target_chroms, processes, target_fasta,
        parents, children, mode, unmapped_features,
        reference_fasta, minimap2_path, inter_files, True, max_alns)

    print("lifting features")
    lift_features.lift_all_features(
        alignments, {}, cov_threshold, feature_db,
        parents, children, intermediates,
        unmapped_features, lifted_feature_list, seq_threshold)
    fix_overlapping_features.fix_incorrectly_overlapping_features(
        lifted_feature_list, lifted_feature_list, parents, alignments,
        unmapped_features, cov_threshold, intermediates, children,
        feature_db, parent_order, seq_threshold)

    return feature_db, parents, intermediates, children, parent_order
def align_and_lift_features(ref_chroms, target_chroms, args, feature_hierarchy, liftover_type,
                            unmapped_features, feature_db, features_to_lift, lifted_features_list,
                            ref_parent_order, min_cov, min_seqid):
    """Align features to the target, lift them, then fix incorrectly overlapping lifts.

    The lift step is called with no feature locations (``None``) and with
    ``args.d`` as its final argument; the overlap-fixing step also receives
    ``args.d``.

    Results are written into ``lifted_features_list`` and
    ``unmapped_features``; nothing is returned.
    """
    alignments = align_features.align_features_to_target(
        ref_chroms, target_chroms, args, feature_hierarchy, liftover_type, unmapped_features)

    print("lifting features")
    no_known_locations = None
    lift_features.lift_all_features(
        alignments, min_cov, feature_db, features_to_lift, feature_hierarchy,
        unmapped_features, lifted_features_list, min_seqid, no_known_locations, args.d)
    # The lifted feature list is passed for both of the first two arguments.
    fix_overlapping_features.fix_incorrectly_overlapping_features(
        lifted_features_list, lifted_features_list, alignments, unmapped_features,
        min_cov, feature_hierarchy, feature_db, ref_parent_order, min_seqid, args.d)
def resolve_overlapping_homologues(all_aligned_segs, lifted_feature_list, features_to_remap, unmapped_features,
                                   threshold, parent_dict, intermediate_dict, children_dict, feature_db,
                                   original_parent_order, seq_id_threshold):
    """Repeatedly re-lift the features in ``features_to_remap`` until none remain.

    Each round removes every pending feature from ``lifted_feature_list``,
    gathers its alignments from ``all_aligned_segs``, records it through
    ``add_overlapping_feature`` and lifts it again. The pending features
    that are present in ``lifted_feature_list`` afterwards are passed to
    ``check_homologues``, whose result becomes the next set to remap.

    The number of rounds is capped at ten times the initial number of
    features to remap. Features still pending after the last round are
    appended to ``unmapped_features`` (looked up in ``parent_dict`` by their
    original id) and deleted from ``lifted_feature_list``.

    Returns:
        ``lifted_feature_list`` (the same object that was passed in).
    """
    # Accumulates across rounds; it is never reset inside the loop.
    overlapping_so_far = {}
    round_limit = 10 * len(features_to_remap)

    for _ in range(round_limit):
        if len(features_to_remap) == 0:
            break

        segs_this_round = {}
        for name in features_to_remap:
            del lifted_feature_list[name]
            segs_this_round[name] = all_aligned_segs[name]
            add_overlapping_feature(features_to_remap, name, overlapping_so_far)

        lift_features.lift_all_features(
            segs_this_round, overlapping_so_far, threshold, feature_db,
            parent_dict, children_dict, intermediate_dict,
            unmapped_features, lifted_feature_list, seq_id_threshold)
        clean_overlapping_features(lifted_feature_list, overlapping_so_far, parent_dict,
                                   features_to_remap, unmapped_features)

        # Only features present in the lifted list after this round are re-checked.
        remapped_ok = {
            name: lifted_feature_list[name]
            for name in features_to_remap
            if name in lifted_feature_list
        }
        features_to_remap = check_homologues(lifted_feature_list, remapped_ok, parent_dict,
                                             original_parent_order)

    for name in features_to_remap:
        unmapped_features.append(parent_dict[liftoff_utils.convert_id_to_original(name)])
        del lifted_feature_list[name]
    return lifted_feature_list