Beispiel #1
0
    def get_coordinates(self, window_size, coverage_column_name: int,
                        window_column_name: int,
                        repeat_window_number: int) -> tuple:
        """Find regions of above-threshold coverage and medians between them.

        Every line of ``self.data`` is split on tabs. A region is opened once
        ``repeat_window_number`` consecutive lines have a coverage strictly
        greater than ``self.minimum_coverage`` and is closed by the first
        following line whose coverage is less than or equal to it.

        Args:
            window_size: Factor that converts a window number into a
                coordinate (``window * window_size``).
            coverage_column_name: Index of the tab-separated column holding
                the coverage value (parsed as ``float``).
            window_column_name: Index of the tab-separated column holding
                the window number (parsed as ``int``).
            repeat_window_number: Number of consecutive above-threshold
                lines required to open a region.

        Returns:
            A tuple ``(coordinates, median_between_regions_list)``.
            ``coordinates`` is a list of ``[start, stop]`` pairs; when data
            remains after the last closed region one extra pair covering
            that tail is appended. ``median_between_regions_list`` holds the
            ``CoveragesMetrics(...).median_value()`` of the coverage values
            tallied between consecutive region closings, plus one for the
            tail if any values were tallied there. Both lists are empty
            when no region was ever closed.
        """
        coordinates = []
        median_between_regions_list = []
        repeat_window = 0
        start_coordinate = None
        stop_coordinate = None
        current_window = None

        between_regions_coverage_dict = Counter()
        # Becomes True once the first region has been closed; from then on
        # every line's coverage is tallied for the "between regions" median.
        between_region_flag = False

        for ln in self.data:
            line = ln.rstrip().split("\t")
            coverage_value = float(line[coverage_column_name])
            current_window = int(line[window_column_name])

            if between_region_flag:
                between_regions_coverage_dict[coverage_value] += 1

            if coverage_value > self.minimum_coverage:
                repeat_window += 1
                if repeat_window == repeat_window_number and start_coordinate is None:
                    # The region starts at the first window of the run.
                    start_coordinate = (current_window - repeat_window +
                                        1) * window_size
                    repeat_window = 0
            elif start_coordinate is not None and coverage_value <= self.minimum_coverage:
                stop_coordinate = current_window * window_size
                coordinates.append([start_coordinate, stop_coordinate])
                if between_region_flag:
                    median_between_regions_list.append(
                        CoveragesMetrics(
                            between_regions_coverage_dict).median_value())
                    between_regions_coverage_dict.clear()
                between_region_flag = True
                start_coordinate = None
                repeat_window = 0
            else:
                repeat_window = 0

        # No closed region (or no input at all): there is no last region to
        # extend, so return the empty results instead of indexing into them.
        if not coordinates:
            return coordinates, median_between_regions_list

        # Compare like with like: region stops are stored as coordinates,
        # so the last window number has to be scaled before the comparison.
        last_coordinate = current_window * window_size
        if coordinates[-1][-1] != last_coordinate:
            coordinates.append([stop_coordinate + window_size,
                                last_coordinate])
        if between_regions_coverage_dict:
            median_between_regions_list.append(
                CoveragesMetrics(between_regions_coverage_dict).median_value())
            between_regions_coverage_dict.clear()

        return coordinates, median_between_regions_list
    def get_whole_genome_stats(self):
        """Write whole-input coverage statistics to a tab-separated report.

        Each line of ``self.data`` is split on tabs. Column 3 is parsed as a
        float coverage value and weighted by ``int(column 2) - int(column 1)``.
        The weighted tally is handed to ``CoveragesMetrics`` and its median,
        average, maximum and minimum are written as a single ``whole_genome``
        row to ``<self.output>_whole_genome_stats.csv``.
        """
        coverage_tally = Counter()
        for record in self.data:
            fields = record.rstrip().split('\t')
            coverage = float(fields[3])
            # Weight the coverage value by the span between columns 1 and 2.
            coverage_tally[coverage] += int(fields[2]) - int(fields[1])

        genome_metrics = CoveragesMetrics(coverage_tally)
        summary_row = [
            genome_metrics.median_value(),
            genome_metrics.average_value(),
            genome_metrics.max_coverage_value(),
            genome_metrics.min_coverage_value(),
        ]

        report = pd.DataFrame(columns=['median', 'average', 'max', 'min'])
        report.loc['whole_genome'] = summary_row

        # The index becomes a leading '#genome' column in the written file.
        report_path = self.output + "_whole_genome_stats.csv"
        flat_report = report.rename_axis('#genome').reset_index()
        flat_report.to_csv(report_path,
                           encoding='utf-8',
                           sep='\t',
                           index=False)
Beispiel #3
0
def main():
    """Write per-frame median coverage for the lines of ``args.input``.

    Every input line is split on whitespace and its second field is parsed
    as an integer coverage value. After each ``args.frame_size`` lines one
    tab-separated record ``MT, first line, last line, median`` is written;
    a final, shorter frame is written for any leftover lines.
    """
    # NOTE(review): the handle returned by metaopen is never closed here.
    outfile = metaopen(metaoutput(args.output, ".csv"), "wt")

    def write_frame(first, last, tally):
        """Write one output record for input lines ``first``..``last``."""
        median = CoveragesMetrics(tally).median_value()
        fields = ("MT", first, last, median)
        outfile.write("\t".join(map(str, fields)) + "\n")

    tally = Counter()
    lines_seen = 0
    lines_in_frame = 0

    for raw in args.input:
        fields = raw.strip().split()
        lines_seen += 1
        lines_in_frame += 1
        tally[int(fields[1])] += 1
        if lines_in_frame != args.frame_size:
            continue
        write_frame(lines_seen - args.frame_size + 1, lines_seen, tally)
        tally.clear()
        lines_in_frame = 0

    # Leftover lines that did not fill a whole frame.
    if tally:
        leftover = sum(tally.values())
        write_frame(lines_seen - leftover + 1, lines_seen, tally)
    def get_nonoverlapping_windows_stats(self, frame_size):
        """Write coverage statistics for consecutive non-overlapping frames.

        Each line of ``self.data`` is split on tabs; column 0 is used as the
        scaffold name and column 2 is parsed as a float coverage value.
        Every ``frame_size`` consecutive lines of one scaffold form a frame
        whose median, average, maximum and minimum (from ``CoveragesMetrics``)
        become one report row. Frame numbering restarts at 0 for every
        scaffold. Lines left over at the end of a scaffold or of the input
        that do not fill a frame are dropped.

        The report is written tab-separated to
        ``<self.output>_<frame_size>_windows_stats.csv``.

        Args:
            frame_size: Number of lines per frame.
        """
        df_nonoverlapping_frames = pd.DataFrame(
            columns=['#scaffold', 'frame', 'median', 'average', 'max', 'min'])

        frame_coverages_amounts_dict = Counter()
        frame_line_counter = 0
        frame_id = -1  # so that the first frame of a scaffold is numbered 0
        index = 0
        previous_scaffold_name = None

        for line in self.data:
            line = line.rstrip().split('\t')
            scaffold_name = line[0]
            if (previous_scaffold_name is not None
                    and previous_scaffold_name != scaffold_name):
                # New scaffold: discard the unfinished frame of the previous
                # one and restart the frame numbering.
                frame_id = -1
                frame_line_counter = 0
                frame_coverages_amounts_dict.clear()
            frame_line_counter += 1
            frame_coverages_amounts_dict[float(line[2])] += 1

            if frame_line_counter == frame_size:
                index += 1
                frame_id += 1
                metrics = CoveragesMetrics(frame_coverages_amounts_dict)
                # Label the row with the scaffold of the lines in the frame.
                # previous_scaffold_name is still None (or the old scaffold)
                # when a frame completes on the first line of a scaffold,
                # which happens for frame_size == 1.
                df_nonoverlapping_frames.loc[index] = [
                    scaffold_name,
                    frame_id,
                    metrics.median_value(),
                    metrics.average_value(),
                    metrics.max_coverage_value(),
                    metrics.min_coverage_value(),
                ]
                frame_coverages_amounts_dict.clear()
                frame_line_counter = 0
            previous_scaffold_name = scaffold_name

        df_nonoverlapping_frames.to_csv(
            self.output + '_' + str(frame_size) + "_windows_stats.csv",
            encoding='utf-8', sep='\t', index=False)
    def get_universal_windows_stats(self, frame_size, frame_shift):
        """Write coverage statistics for frames taken every ``frame_shift`` lines.

        All lines of ``self.data`` are read into memory with ``readlines()``.
        Each line is split on tabs; column 0 is used as the scaffold name and
        column 2 is parsed as a float coverage value. Starting offsets advance
        by ``frame_shift``; each frame spans ``frame_size`` consecutive lines.
        When the first and last line of a frame belong to different scaffolds,
        the frame start is moved forward to the first line of the last line's
        scaffold, and that shift is kept (accumulated in ``gap_counter``) for
        all following frames.

        For every frame the median, average, maximum and minimum from
        ``CoveragesMetrics`` become one report row. Frame ids are numbered
        from 0 and are not reset between scaffolds. The report is written
        tab-separated to ``<self.output>_<frame_size>_windows_stats.csv``.

        Args:
            frame_size: Number of lines per frame.
            frame_shift: Step, in lines, between consecutive frame starts.
        """
        df_overlapping_frames = pd.DataFrame(columns=['#scaffold', 'frame', 'median', 'average', 'max', 'min'])
        data = self.data.readlines()
        coverages_dict = Counter()
        frame_id = -1 # for numbering from 0
        index = 0
        # Total number of lines the frame starts have been pushed forward so
        # far in order to realign them with scaffold boundaries.
        gap_counter = 0

        for ln in range(0, len(data), frame_shift):
            # Any index past the end of data raises IndexError, which ends
            # the whole loop (see the except clause below).
            # NOTE(review): a line with fewer than three columns would also
            # raise IndexError at line[2] and silently end processing.
            try:
                # Scaffold of the first and of the last line of this frame.
                scaffold_name = data[ln + gap_counter].rstrip().split('\t')[0]
                last_scaffold_name = data[ln - 1 + frame_size + gap_counter].rstrip().split('\t')[0]

                if scaffold_name != last_scaffold_name:
                    # The frame straddles a scaffold boundary: scan forward
                    # for the first line that belongs to the last line's
                    # scaffold and restart the frame there.
                    for gap in range(frame_size + 1):
                        scaffold_name = data[ln + gap_counter + gap].rstrip().split('\t')[0]
                        if scaffold_name == last_scaffold_name:
                            gap_counter += gap
                            # Re-read both ends of the shifted frame; the new
                            # last line may belong to yet another scaffold.
                            scaffold_name = data[ln + gap_counter].rstrip().split('\t')[0]
                            last_scaffold_name = data[ln - 1 + frame_size + gap_counter].rstrip().split('\t')[0]
                            break

                # Only frames that lie entirely within one scaffold (judged
                # by their first and last line) are reported.
                if scaffold_name == last_scaffold_name:
                    for j in range(frame_size):
                        line = data[ln + gap_counter + j].rstrip().split('\t')
                        coverages_dict[float(line[2])] += 1
                        # On the last line of the frame, emit the row and
                        # reset the tally for the next frame.
                        if j == frame_size - 1:
                            index += 1
                            frame_id += 1
                            metrics = CoveragesMetrics(coverages_dict)
                            df_overlapping_frames.loc[index] = [scaffold_name,
                                                                frame_id, 
                                                                metrics.median_value(),
                                                                metrics.average_value(),
                                                                metrics.max_coverage_value(),
                                                                metrics.min_coverage_value()]
                            coverages_dict.clear()
            except IndexError:
                # The frame would run past the end of the data: stop.
                break

        # Write the collected rows as a tab-separated report.
        df_overlapping_frames.to_csv(self.output + '_' + str(frame_size) + "_windows_stats.csv", encoding='utf-8', sep='\t', index = False)
    def get_scaffolds_stats(self):
        """Write per-scaffold coverage statistics to a tab-separated report.

        Each line of ``self.data`` is split on tabs; column 0 is used as the
        scaffold name and column 2 is parsed as a float coverage value.
        Consecutive lines with the same scaffold name are tallied together,
        and one row of median, average, maximum and minimum (from
        ``CoveragesMetrics``) is recorded per run. The report is written to
        ``<self.output>_scaffolds_stats.csv``.
        """
        report = pd.DataFrame(columns=['median', 'average', 'max', 'min'])
        tally = Counter()
        current_scaffold = None

        def record_row(scaffold):
            """Store the statistics of the current tally under ``scaffold``."""
            stats = CoveragesMetrics(tally)
            report.loc[scaffold] = [
                stats.median_value(),
                stats.average_value(),
                stats.max_coverage_value(),
                stats.min_coverage_value(),
            ]

        for record in self.data:
            fields = record.rstrip().split('\t')
            scaffold = fields[0]
            # A change of scaffold name closes the previous scaffold's tally.
            if current_scaffold is not None and current_scaffold != scaffold:
                record_row(current_scaffold)
                tally.clear()
            tally[float(fields[2])] += 1
            current_scaffold = scaffold

        # The last scaffold is still pending once the input is exhausted.
        record_row(current_scaffold)

        # The index becomes a leading '#scaffold' column in the written file.
        flat_report = report.rename_axis('#scaffold').reset_index()
        flat_report.to_csv(self.output + "_scaffolds_stats.csv",
                           encoding='utf-8', sep='\t', index=False)