Example #1
    def seg_count_file(self):
        """
        Parse the tab-delimited SegCopy file into a nested dictionary.

        :return: (prior_ploidy, bin_tracking_dict, seg_copy_array)
        """

        # Tracking dictionary built here because the sample keys (the header labels) are available.
        prior_ploidy = {}
        bin_tracking_dict = Tool_Box.VivifiedDictionary()
        seg_copy_array = self.array_builder()

        with open(self.input_file) as seg_copy_file:
            seg_count = list(csv.reader(seg_copy_file, delimiter='\t'))

        for line_num, line in enumerate(seg_count):
            if line_num == 0:  # First line is the header; columns 3+ are the sample labels.
                for label in line[3:]:
                    prior_ploidy[label] = [-1, False, 0, 0, 0]
            else:
                # chromosome -> line number -> (bin start, bin end)
                bin_tracking_dict[line[0]][line_num] = (line[1], line[2])

        # self.chrY is expected to hold a "True"/"False" string; eval turns it into a bool.
        if not eval(self.chrY):
            with suppress(KeyError):
                bin_tracking_dict.pop("chrY")

        return prior_ploidy, bin_tracking_dict, seg_copy_array
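
Tool_Box.VivifiedDictionary is project-specific and not shown in these examples. Below is a minimal sketch of an autovivifying dictionary that would support the chained-key assignments used here; the class name comes from the usage, while the implementation itself is an assumption:

import collections


class VivifiedDictionary(collections.defaultdict):
    """Nested dictionary whose intermediate levels are created on first access."""

    def __init__(self):
        # Every missing key materializes another VivifiedDictionary, so
        # d["chr1"][42] = ("500001", "1000000") works without pre-building levels.
        super().__init__(VivifiedDictionary)


# Usage matching seg_count_file above:
bin_tracking_dict = VivifiedDictionary()
bin_tracking_dict["chr1"][1] = ("1", "500000")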
Example #2
    def quality_check(data_bundle, fastq_files):
        """
        Called by the multiprocessor pool.  Examines the read indices and
        tallies anchor mismatch and UMT counts for each index.

        :param data_bundle: (progress interval, index list, file 1 anchor sequence, file 2 anchor sequence)
        :param fastq_files: pair of FASTQ file paths (read 1, read 2)
        :return: (anchor_dict, umt_dict)
        """

        prog_check, index_list, file1_anchor_seq, file2_anchor_seq = data_bundle
        fastq1 = FASTQ_Reader(fastq_files[0])
        fastq2 = FASTQ_Reader(fastq_files[1])

        umt_dict = collections.defaultdict(
            lambda: collections.defaultdict(int))
        anchor_dict = Tool_Box.VivifiedDictionary()
        read_count = 0

        try:
            while True:
                # seq_read() is assumed to track the file position on the reader,
                # so each call yields the next record; StopIteration signals EOF.
                fastq1_read = next(fastq1.seq_read())
                fastq2_read = next(fastq2.seq_read())
                read_count += 1

                if read_count % int(prog_check) == 0:
                    print("      -->Processed {0} reads in file {1} and {2}.".
                          format(read_count, fastq_files[0], fastq_files[1]))

                # Get read index and UMT.
                umt = "{0}{1}".format(
                    fastq1_read.name.split("|")[0],
                    fastq2_read.name.split("|")[1].split(":")[0])
                read_index = fastq1_read.name.split(":")[-1]

                # Quantify anchor mismatches over the 11 bp anchor region (read positions 7-17).
                unknown_anchor1 = fastq1_read.seq[7:18]
                unknown_anchor2 = fastq2_read.seq[7:18]
                match1 = Levenshtein.distance(file1_anchor_seq, unknown_anchor1)
                match2 = Levenshtein.distance(file2_anchor_seq, unknown_anchor2)

                for index in index_list:
                    # Compare the read index to the first 6 bases of each expected
                    # index; index_match < 2 below tolerates a single mismatch.
                    index_match = Levenshtein.distance(read_index, index[0][:6])

                    # Add anchor and UMT data to dictionaries.
                    if index[0] in anchor_dict and index_match < 2:
                        anchor_dict[index[0]]["R1"][match1] += 1
                        anchor_dict[index[0]]["R2"][match2] += 1
                        # defaultdict(int) supplies the zero count for new UMTs.
                        umt_dict[index[0]][umt] += 1

                    elif index_match < 2:
                        # Levenshtein distance can equal the full anchor length, so
                        # size the tally lists len + 1 to keep every distance indexable.
                        anchor_dict[index[0]]["R1"] = [0] * (len(file1_anchor_seq) + 1)
                        anchor_dict[index[0]]["R2"] = [0] * (len(file2_anchor_seq) + 1)
                        anchor_dict[index[0]]["R1"][match1] += 1
                        anchor_dict[index[0]]["R2"][match2] += 1
                        umt_dict[index[0]][umt] += 1
        except StopIteration:
            return anchor_dict, umt_dict
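
FASTQ_Reader is likewise project-specific. The loop above calls next(fastq.seq_read()) on every iteration, which only advances through the file if the reader keeps its position between calls. Here is a minimal sketch consistent with that usage, assuming seq_read() is a generator over a persistent file handle that yields one four-line FASTQ record; every implementation detail below is an assumption:

import collections

FastqRead = collections.namedtuple("FastqRead", ["name", "seq", "qual"])


class FASTQ_Reader:
    def __init__(self, file_path):
        # The handle lives on the instance, so each fresh generator returned
        # by seq_read() resumes wherever the previous one left off.
        self.fastq_file = open(file_path)

    def seq_read(self):
        name = self.fastq_file.readline().strip()
        if not name:
            return  # EOF: the generator exhausts and next() raises StopIteration.
        seq = self.fastq_file.readline().strip()
        self.fastq_file.readline()  # "+" separator line.
        qual = self.fastq_file.readline().strip()
        yield FastqRead(name.lstrip("@"), seq, qual)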
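
Per its docstring, quality_check is invoked from a multiprocessing pool. A minimal sketch of that dispatch, assuming quality_check is reachable as a plain function (e.g., a staticmethod); the helper name, progress interval, and bundle layout are illustrative, not the project's actual API:

import multiprocessing


def run_quality_check(index_list, anchor1, anchor2, fastq_pairs):
    # Every worker receives the same bundle: progress interval, indices, anchors.
    data_bundle = (100000, index_list, anchor1, anchor2)

    # Note: worker results must be picklable.  A defaultdict built with a
    # lambda factory (like umt_dict) cannot be pickled, so a worker may need
    # to convert it to a plain dict before returning it.
    with multiprocessing.Pool(processes=len(fastq_pairs)) as pool:
        results = pool.starmap(
            quality_check, [(data_bundle, pair) for pair in fastq_pairs])

    # One (anchor_dict, umt_dict) tuple per FASTQ file pair.
    return results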