def seg_count_file(self):
    """
    Parse the tab-delimited SegCopy file into the tracking structures.

    The first row is a header: columns beyond the first three are sample
    labels, each of which seeds ``prior_ploidy`` with a default state list.
    Every subsequent row maps chromosome (column 0) to a (start, stop)
    coordinate pair keyed by its 1-based row number.

    :return: tuple of (prior_ploidy dict, bin_tracking_dict, seg_copy_array)
    """
    prior_ploidy = {}
    # Tracking dictionary built here because the sample keys (header labels)
    # are available while parsing.
    bin_tracking_dict = Tool_Box.VivifiedDictionary()
    seg_copy_array = self.array_builder()

    # Context manager closes the file handle (the original leaked it).
    with open(self.input_file) as seg_copy_file:
        for line_num, line in enumerate(csv.reader(seg_copy_file, delimiter='\t')):
            if line_num == 0:
                # Header row: first three columns are chrom/start/stop,
                # the rest are per-sample labels.
                for label in line[3:]:
                    prior_ploidy[label] = [-1, False, 0, 0, 0]
            else:
                bin_tracking_dict[line[0]][line_num] = (line[1], line[2])

    # NOTE(review): eval() on a config string works for "True"/"False" but is
    # unsafe on untrusted input; consider `self.chrY == "True"` — confirm the
    # value set this option can take before changing.
    if not eval(self.chrY):
        # Drop chrY bins when the option is off; absent key is fine.
        with suppress(KeyError):
            bin_tracking_dict.pop("chrY")

    return prior_ploidy, bin_tracking_dict, seg_copy_array
def quality_check(data_bundle, fastq_files):
    """
    Called by the multiprocessor pool.  Examines the read indices and
    tallies anchor-sequence mismatch distances and UMT counts per index.

    :param data_bundle: sequence of (progress interval, index list,
        file1 anchor sequence, file2 anchor sequence).
    :param fastq_files: pair of FASTQ file paths (read 1, read 2).
    :return: tuple of (anchor_dict, umt_dict) once both files are exhausted.
    """
    prog_check = data_bundle[0]
    index_list = data_bundle[1]
    file1_anchor_seq = data_bundle[2]
    file2_anchor_seq = data_bundle[3]

    fastq1 = FASTQ_Reader(fastq_files[0])
    fastq2 = FASTQ_Reader(fastq_files[1])

    # umt_dict[index][umt] -> count; defaultdict avoids key-existence checks.
    umt_dict = collections.defaultdict(lambda: collections.defaultdict(int))
    anchor_dict = Tool_Box.VivifiedDictionary()
    read_count = 0

    try:
        while True:
            fastq1_read = next(fastq1.seq_read())
            fastq2_read = next(fastq2.seq_read())

            read_count += 1
            if read_count % int(prog_check) == 0:
                print("  -->Processed {0} reads in file {1} and {2}."
                      .format(read_count, fastq_files[0], fastq_files[1]))

            # Get read index and UMT from the read names.
            umt = "{0}{1}".format(
                fastq1_read.name.split("|")[0],
                fastq2_read.name.split("|")[1].split(":")[0])
            read_index = fastq1_read.name.split(":")[-1]

            # Quantify anchor edit distances against bases 7-17 of each read.
            unknown_anchor1 = fastq1_read.seq[7:18]
            unknown_anchor2 = fastq2_read.seq[7:18]
            match1 = Levenshtein.distance(file1_anchor_seq, unknown_anchor1)
            match2 = Levenshtein.distance(file2_anchor_seq, unknown_anchor2)

            for index in index_list:
                # Accept reads whose index is within 1 edit of the expected index.
                if Levenshtein.distance(read_index, index[0][:6]) >= 2:
                    continue

                if index[0] not in anchor_dict:
                    # First read for this index: initialize mismatch tallies.
                    # NOTE(review): a Levenshtein distance can equal the full
                    # anchor length, which would overflow these lists by one
                    # (IndexError); len(...) + 1 looks safer — confirm against
                    # downstream consumers before changing the list length.
                    anchor_dict[index[0]]["R1"] = [0] * len(file1_anchor_seq)
                    anchor_dict[index[0]]["R2"] = [0] * len(file2_anchor_seq)

                # Single tally path replaces the duplicated if/elif branches.
                anchor_dict[index[0]]["R1"][match1] += 1
                anchor_dict[index[0]]["R2"][match2] += 1
                umt_dict[index[0]][umt] += 1

    except StopIteration:
        # One of the FASTQ iterators is exhausted; return the tallies.
        return anchor_dict, umt_dict