# NOTE(review): this chunk begins mid-way through an if/else whose `if` is
# outside this view; judging by the parallel AIRRCall branch below, the leading
# lines are the tail of an "annotation-only positions" branch — confirm against
# the full file.
    f_os.write("contig,allele_name,position,len,mismatch_tag\n")
    print("Annotation only position:")
    # Report every annotated place that was not captured (one CSV row each).
    for info in list_annotate_only_info:
        f_os.write(info + '\n')
        print(info)
    f_os.write("\n")
else:
    f_os.write("All annotated places are covered by AIRRCall.\n\n")
    print("All annotated place are covered by AIRRCall.\n")
if len(list_captured_only_info) > 0:
    # Report positions called by AIRRCall that have no matching annotation.
    f_os.write("AIRRCall only position:\n")
    f_os.write("contig,allele_name,position,len,mismatch_tag\n")
    print("AIRRCall only position:")
    for info in list_captured_only_info:
        f_os.write(info + '\n')
        print(info)
    f_os.write("\n")
else:
    f_os.write("No AIRRCall only places.\n\n")
    print("No AIRRCall only places.\n")
if fn_haplotypes:
    # List haplotype sequences present in the fasta but absent from
    # `used_seqs` (redundant output sequences).  `used_seqs` is built
    # before this chunk — presumably the set of sequence names matched
    # above; TODO confirm.
    dict_haplotypes = parse_fasta(fn_haplotypes)
    pool_seqs = set(dict_haplotypes.keys())
    f_os.write("Redundant AIRRCall seqs:\n")
    print("Redundant AIRRCall seqs:")
    for seq_name in sorted(pool_seqs - used_seqs):
        f_os.write(seq_name + '\n')
        print("\t" + seq_name)
f_os.close()
def _rekey_by_allele_name(dict_named):
    """Re-key a {fasta_header: value} dict by allele name.

    Headers are expected in a pipe-delimited style ("acc|allele_name|...");
    if any header lacks a second '|' field, fall back to using the first
    whitespace-separated token of every header instead.
    """
    try:
        return {name.split('|')[1]: value
                for name, value in dict_named.items()}
    except IndexError:
        # Header has no '|'-delimited allele field; the original code used a
        # bare `except:` here, which also swallowed unrelated errors.
        return {name.split()[0]: value
                for name, value in dict_named.items()}


fo_output_dir = args.fo_output_dir

# Load the pickle (allele -> supporting reads) and the allele / R1 / R2 fastas,
# normalizing every key to the bare allele name.
dict_allele_support_reads = _rekey_by_allele_name(read_pickle(fn_pickle_file))
dict_allele = _rekey_by_allele_name(parse_fasta(fn_allele))
dict_read_1 = parse_fasta(fn_read_1)
group_num = 0
if fn_read_2:
    # Paired-end input: the R2 reads file is optional.
    dict_read_2 = parse_fasta(fn_read_2)
# NOTE(review): this chunk begins inside an argument-parser function whose
# header (def / parser construction) is outside this view.
    parser.add_argument('-fof', '--fo_asm_flanking',
                        help='output flanking sequence fasta file from asm')
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    fn_asm_1 = args.fn_asm_1
    fn_asm_2 = args.fn_asm_2
    fn_annotation = args.fn_annotation
    len_extend = args.len_extend
    fo_asm_flanking = args.fo_asm_flanking

    # Load one or two haplotype assemblies; H2 is optional.
    dict_contig_H1 = parse_fasta(fn_asm_1)
    if fn_asm_2:
        dict_contig_H2 = parse_fasta(fn_asm_2)
    list_annotation = parse_annotation(fn_annotation)

    # dict_flank_SEQ {}
    # - keys: allele_name (novel)
    # - values: SEQ set {}
    dict_flank_SEQ = {}
    for annotation_info in list_annotation:
        # Each record: [allele_name, contig_name, start, length, ...] —
        # inferred from the indexing below; TODO confirm against
        # parse_annotation().
        allele_name = annotation_info[0]
        contig_name = annotation_info[1]
        start_pos = int(annotation_info[2])
        end_pos = start_pos + int(annotation_info[3])
        contig_SEQ = ""
        if fn_asm_2:
            # NOTE(review): the loop body continues beyond this chunk.
# NOTE(review): this chunk begins inside a function (presumably
# histogram_read_depth) whose earlier if/elif arms are outside this view;
# the exact nesting depth of this `elif` is a guess.
        elif operate == ['M']:
            # read is including in the allele: the whole read matched, so
            # bump the per-base depth over the aligned span (SAM positions
            # are 1-based, hence the -1) and remember the read name.
            dict_allele_histogram[allele_name][start_pos - 1:end_pos - 1] += 1
            dict_allele_reads[allele_name].add(read_name)


if __name__ == '__main__':
    args = parse_args()
    fn_alleles = args.fn_alleles
    fn_sam = args.fn_sam
    thrsd = args.thrsd
    fo_calling_report = args.fo_calling_report
    fo_grouping_pickle = args.fo_grouping_pickle
    fn_verify_annotation = args.fn_verify_annotation

    dict_allele = parse_fasta(fn_alleles)
    # Per-allele read-depth histogram, one counter per base.
    dict_allele_histogram = {
        name.split()[0]: np.zeros(len(SEQ))
        for name, SEQ in dict_allele.items()
    }
    # Per-allele set of supporting read names.
    dict_allele_reads = {name.split()[0]: set() for name in dict_allele.keys()}
    list_perfect_fields, list_mismatch_fields = parse_perfect_sam_with_S(
        fn_sam)
    histogram_read_depth(dict_allele_histogram, list_perfect_fields,
                         dict_allele_reads, thrsd)
    # Minimum per-base depth for each allele; name.split('|')[1] assumes
    # IMGT-style '|'-delimited headers — NOTE(review): histogram keys above
    # use name.split()[0] instead, verify the two conventions agree.
    list_min_depth = [[min(histogram), None, name.split('|')[1]]
                      for name, histogram in dict_allele_histogram.items()]
def eprint(*args, **kwargs):
    """Print to stderr; accepts the same arguments as print()."""
    print(*args, file=sys.stderr, **kwargs)


if __name__ == '__main__':
    args = parse_args()
    fn_list_alleles = args.fn_list_alleles
    merge_type = args.merge_type.lower()

    dict_novel_serial = {}
    list_novel_SEQ = None
    if merge_type == "reference":
        # dict_novel_serial {}
        # - keys: SEQ
        # - values: merged novel serial
        dict_temp = parse_fasta(args.fn_novel_reference)
        dict_novel_serial = {SEQ: name for (name, SEQ) in dict_temp.items()}
        list_novel_SEQ = sorted(dict_novel_serial.keys())
    elif merge_type != "simple":
        # Only "reference" and "simple" are recognized merge types; anything
        # else falls through with a warning rather than aborting.
        print("WARNING! Incorrect Merge Type:", merge_type)
    fo_merged_fasta = args.fo_merged_fasta
    fo_merged_report = args.fo_merged_report
    print(fn_list_alleles)

    # dict_database {}
    # - keys: allele_name
    # - values: dict_SEQ {}
    #   - keys: SEQ
    #   - values: [person_name_0, person_name_1, ...]
    # NOTE(review): population of dict_database continues beyond this chunk.
    dict_database = {}
parser.add_argument('-fom', '--fo_merged_fasta', help='output merged fasta file') args = parser.parse_args() return args def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) if __name__ == '__main__': args = parse_args() fn_alleles = args.fn_alleles fn_novel = args.fn_novel fo_merged_fasta = args.fo_merged_fasta dict_original_allele = parse_fasta(fn_alleles) dict_novel_allele = parse_fasta(fn_novel) f_om = open(fo_merged_fasta, 'w') for name in sorted(dict_original_allele.keys()): f_om.write(">" + name.split()[0] + "\n") f_om.write(dict_original_allele[name] + "\n") for name in sorted(dict_novel_allele.keys()): if ("extend" in name) == False: f_om.write(">|" + name + "|\n") f_om.write(dict_novel_allele[name] + "\n") f_om.close()
# NOTE(review): this chunk begins inside parse_allele_region(); the function
# header and the loop over its input lines are outside this view.
        # region is a "start-end" string; store it as an int tuple.
        dict_contig_region[contig_name] = (int(region.split('-')[0]),
                                           int(region.split('-')[1]))
    return dict_contig_region


if __name__ == '__main__':
    args = parse_args()
    fn_sam = args.fn_sam
    fn_contig = args.fn_contig
    fn_allele_region = args.fn_allele_region
    len_extend = args.len_extend
    fo_flanking_haplotype = args.fo_flanking_haplotype
    fo_shrinked_contigs = args.fo_shrinked_contigs

    dict_contig_region = parse_allele_region(fn_allele_region)
    dict_cluster_contig = parse_fasta(fn_contig)
    f_oc = open(fo_shrinked_contigs, 'w')
    for contig_name, SEQ in sorted(dict_cluster_contig.items()):
        region = dict_contig_region[contig_name]
        # Extend the annotated allele region by len_extend on both sides,
        # clipped to the contig boundaries, and write the trimmed contig.
        cut_start = max(0, region[0] - len_extend)
        cut_end = min(len(SEQ), region[1] + len_extend)
        dict_contig_region[contig_name] = (cut_start, cut_end)
        shrinked_SEQ = SEQ[cut_start:cut_end]
        f_oc.write('>' + contig_name + '\n')
        f_oc.write(shrinked_SEQ + '\n')
    f_oc.close()

    dict_contig = cluster_separate(fn_contig, fn_sam)
    for contig_name, contig_info in sorted(dict_contig.items()):
        #parse the sam file and generate
        # NOTE(review): the loop body continues beyond this chunk.