Esempio n. 1
0
    def insert_size(self):
        """Summarise insert-size metrics for every sample in ``self.collect_file``.

        For each sample the method:

        * reads the line that follows the one located by
          ``self.get_line(log, 'MEDIAN_INSERT_SIZE')`` and stores its 6th and
          7th whitespace-separated fields (kept as strings) under
          ``InsertSize_mean`` and ``Insert_std``;
        * loads the table located by ``self.get_line(log, 'insert_size')``
          through ``self.read_his``, rescales the counts so they sum to 1,
          writes the table to ``{module}/{sample}/insert_size.csv`` and passes
          it to ``self.plot_insert_size``.

        Returns:
            A ``pandas.DataFrame`` with one row per sample; the sample name is
            exposed in the ``Samples`` column.
        """
        summary = nesteddict()
        for sample in self.collect_file:
            metrics_file = self.collect_file[sample]

            # The values live on the line right after the header line.
            # Original note: line numbers count from 1 (presumably because
            # ``getline`` is linecache's 1-based getline -- TODO confirm).
            header_line = self.get_line(metrics_file, 'MEDIAN_INSERT_SIZE')
            fields = getline(metrics_file, header_line + 1).strip().split()
            mean_value, std_value = fields[5], fields[6]
            summary[sample]['InsertSize_mean'] = mean_value
            summary[sample]['Insert_std'] = std_value

            # read_his returns {'count': {size: count}}; transposing turns the
            # sizes into the row index and 'count' into a column.
            table_start = self.get_line(metrics_file, 'insert_size')
            raw_table = self.read_his(metrics_file, table_start)
            histogram = pd.DataFrame.from_dict(raw_table, orient='index').T
            histogram['insert_size'] = histogram.index

            # Both columns arrive as text; make them numeric before sorting.
            for column in ('count', 'insert_size'):
                histogram[column] = histogram[column].astype(int)
            histogram = histogram.sort_values('insert_size')

            # Convert absolute counts into fractions of the total.
            total = sum(histogram['count'])
            histogram['count'] = histogram['count'] / total

            csv_path = '{module}/{sample}/insert_size.csv'.format(
                module=self.module, sample=sample)
            histogram.to_csv(csv_path, index=False, header=True)
            self.plot_insert_size(histogram, sample)

        per_sample = pd.DataFrame.from_dict(summary, orient='index')
        return per_sample.reset_index().rename(columns={'index': 'Samples'})
Esempio n. 2
0
    def raw_reads_content(self):
        """Split each sample's QC report into summary tables and draw its plots.

        The reports come from ``self.read_json()``.  For every sample the
        ``summary.before_filtering`` entries are copied with a ``Raw_`` key
        prefix, the ``summary.after_filtering`` entries with a ``Clean_``
        prefix, and ``filtering_result`` is taken as is.  Afterwards the
        quality and content curves of ``read1_after_filtering`` and
        ``read2_after_filtering`` are passed to ``self.quality_png`` and
        ``self.content_png``.

        A ``KeyError`` raised anywhere while handling a sample ends the work
        on that sample and moves on to the next one.  Because every curve is
        looked up before the first plot is drawn, a sample that lacks any of
        the curve entries keeps its summary tables but gets no plots.

        Returns:
            A 3-tuple of ``nesteddict`` objects keyed by sample name:
            ``(raw summary, clean summary, filtering result)``.
        """

        def with_prefix(prefix, stats):
            # Copy ``stats`` with ``prefix`` prepended to every key.
            return {prefix + key: stats[key] for key in stats}

        raw_summary = nesteddict()
        clean_summary = nesteddict()
        filter_summary = nesteddict()
        reports = self.read_json()
        for sample in reports:
            try:
                report = reports[sample]
                overview = report['summary']
                raw_stats = with_prefix('Raw_', overview['before_filtering'])
                clean_stats = with_prefix('Clean_',
                                          overview['after_filtering'])
                filter_stats = report['filtering_result']

                raw_summary[sample] = raw_stats
                clean_summary[sample] = clean_stats
                filter_summary[sample] = filter_stats

                # Quality and content plots: resolve all four curve sets
                # first so a missing entry aborts before anything is drawn.
                read1 = report['read1_after_filtering']
                read1_quality = read1['quality_curves']
                read1_content = read1['content_curves']
                read2 = report['read2_after_filtering']
                read2_quality = read2['quality_curves']
                read2_content = read2['content_curves']
                plots = (
                    ('read1', read1_quality, read1_content),
                    ('read2', read2_quality, read2_content),
                )
                for read_name, quality, content in plots:
                    self.quality_png(quality, read_name, sample)
                    self.content_png(content, read_name, sample)
            except KeyError:
                continue

        return raw_summary, clean_summary, filter_summary
Esempio n. 3
0
    def read_his(self, file, index):
        """Read a two-column table from ``file`` starting at line ``index``.

        Lines are numbered from 0; every line numbered below ``index`` is
        skipped.  Each following line is split on whitespace into a value and
        its count.  Reading stops at the first line that does not split into
        exactly two fields (for example a blank line).

        Args:
            file: Path of the text file to read.
            index: 0-based number of the first line of the table.

        Returns:
            A ``nesteddict`` of the form ``{'count': {value: count}}`` whose
            values and counts are the raw strings from the file.
        """
        table = nesteddict()
        with open(file, 'rt') as handle:
            for line_no, text in enumerate(handle):
                if line_no >= index:
                    fields = text.strip().split()
                    try:
                        cov, count = fields
                        table['count'][cov] = count
                    except ValueError:
                        # Not a "value count" pair: the table has ended.
                        break

        return table
Esempio n. 4
0
 def dedup_log(self):
     """Build a per-sample table of mapped rate and duplication rate.

     For every sample in ``self.collect_file`` the line following the one
     located by ``self.get_line(log, 'LIBRARY')`` is split on whitespace.
     The 5th field is used as the unmapped read count, the second-to-last
     field as the duplication percentage and the last field as the
     estimated library size.

     Returns:
         A ``pandas.DataFrame`` with one row per sample and the columns
         ``Samples``, ``Mapped_rate`` (rounded to 4 decimals) and
         ``PCR_duplication`` (kept as the string read from the file).
     """
     stats = nesteddict()
     for sample in self.collect_file:
         metrics_file = self.collect_file[sample]

         # The values sit on the line right after the header line.
         header_line = self.get_line(metrics_file, 'LIBRARY')
         fields = getline(metrics_file, header_line + 1).strip().split()
         unmapped_reads = fields[4]
         percent_duplication = fields[-2]
         estimated_library_size = fields[-1]

         # NOTE(review): the rate is taken relative to the estimated library
         # size, not to a count of examined reads -- confirm this is intended.
         library_size = int(estimated_library_size)
         mapped_fraction = (library_size - int(unmapped_reads)) / library_size
         stats[sample]['Mapped_rate'] = round(mapped_fraction, 4)
         stats[sample]['PCR_duplication'] = percent_duplication

     per_sample = pd.DataFrame.from_dict(stats, orient='index')
     return per_sample.reset_index().rename(columns={'index': 'Samples'})
Esempio n. 5
0
 def read_json(self):
     """Load the file of every sample in ``self.collect_file``.

     Each file is parsed with ``self.json_in``.

     Returns:
         A ``nesteddict`` mapping each sample name to the object returned by
         ``self.json_in`` for that sample's file.
     """
     reports = nesteddict()
     for name in self.collect_file:
         report_path = self.collect_file[name]
         reports[name] = self.json_in(report_path)
     return reports