Example #1
0
    def format_fastqc_graphs(self, rawDataPath, currSample):
        """Pull graphing data out of a FastQC report for template rendering.

        Reads three FastQC modules (per-base sequence quality, per-base N
        content, sequence length distribution) via Fadapa and returns a dict
        of stringified label/value lists plus titles and axis captions.
        """
        report = Fadapa(rawDataPath)
        fastqc_graphdata = {}

        ## Module tables, headers dropped
        quality_rows = report.clean_data('Per base sequence quality')[1:]
        ncontent_rows = report.clean_data('Per base N content')[1:]
        length_rows = report.clean_data('Sequence Length Distribution')[1:]

        ##
        ## Per base sequence quality: each row yields one box-plot entry
        ## ordered min / lower quartile / median / upper quartile / max,
        ## plus the running mean for that bin.
        pbsq_labels = [row[0] for row in quality_rows]
        pbsq_means = [int(float(row[1])) for row in quality_rows]
        pbsq_values = []
        for row in quality_rows:
            cells = (row[5], row[3], row[2], row[4], row[6])
            ## FastQC emits 'NaN' for empty bins; treat those as zero
            pbsq_values.append([0 if c == 'NaN' else int(float(c)) for c in cells])
        fastqc_graphdata.update({
            'PBSQ_TITLE': 'FastQC Per base sequence quality',
            'PBSQ_LABELS': str(pbsq_labels),
            'PBSQ_VALUES': str(pbsq_values),
            'PBSQ_MEANVAL': str(pbsq_means),
            'PBSQ_DESCR': 'Per base sequence quality',
            'PBSQ_X': 'Position in read (BP)',
            'PBSQ_Y': 'PHRED quality score',
        })

        ##
        ## Per base N content: simple label/value pair per position
        fastqc_graphdata.update({
            'PBNC_TITLE': 'FastQC Per base N content for {}'.format(currSample),
            'PBNC_LABELS': str([row[0] for row in ncontent_rows]),
            'PBNC_VALUES': str([row[1] for row in ncontent_rows]),
            'PBNC_DESCR': 'N content per base',
            'PBNC_X': 'Position in read (BP)',
            'PBNC_Y': 'Percentage content (%)',
        })

        ##
        ## Sequence length distribution: population per length bin
        fastqc_graphdata.update({
            'SQLD_TITLE': 'FastQC Sequence length distribution for {}'.format(currSample),
            'SQLD_LABELS': str([row[0] for row in length_rows]),
            'SQLD_VALUES': str([row[1] for row in length_rows]),
            'SQLD_DESCR': 'Sequence length population',
            'SQLD_X': 'Sequence length (BP)',
            'SQLD_Y': 'Population (#)',
        })

        return fastqc_graphdata
Example #2
0
	def FastQC(self):

		"""
		Run FastQC on the stage's forward FASTQ and scrape its output.

		Executes fastqc into <qcpath>/<stage>, deletes the result ZIPs
		(the --extract directory is what we read), locates the extracted
		fastqc_data.txt, then stores the read count, per-base sequence
		quality table, GC percentage and report path on the sequencepair
		object for the current stage.
		:return: None
		"""

		## Target SeqQC/fastqc-stage outdir
		io_trunk = self.sequencepair_object.get_qcpath()
		target_output = os.path.join(io_trunk, self.stage)

		## Run process on specific data (init/trimmed/etc)
		fqfile = self.sequencepair_object.get_forwardfastq()
		force_mkdir(target_output)
		fastqc_process = subprocess.Popen(
			['fastqc', '--quiet', '--extract', '-t', THREADS, '-o', target_output, fqfile],
			stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		## communicate() drains both pipes; wait() alone can deadlock once
		## either pipe buffer fills
		fastqc_process.communicate()

		## Remove ZIP of results
		for candidate in glob.glob(os.path.join(target_output, '*.zip')):
			os.remove(candidate)

		## Get path for fastqc_data.txt for current execution;
		## only one report is expected, so stop at the first hit
		target_file = ''
		for root, dirs, files in os.walk(target_output):
			for name in files:
				if name.endswith('fastqc_data.txt'):
					target_file = os.path.join(root, name)
					break
			if target_file:
				break

		## Number of reads present; for end-report i/o
		## Append path to FQC report so we can scrape at will
		f = Fadapa(target_file)
		stats = f.clean_data('Basic Statistics')
		pbsq = f.clean_data('Per base sequence quality')
		read_count = [x for x in stats if 'Total Sequences' in x][0][1]
		gc_pcnt = [x for x in stats if '%GC' in x][0][1]

		## Stage names are mutually exclusive, hence elif
		if self.stage == 'Initial':
			self.sequencepair_object.set_initial_readcount(read_count)
			self.sequencepair_object.set_initial_fastqc(target_file)
			self.sequencepair_object.set_initial_pbsq(pbsq)
			self.sequencepair_object.set_initial_gcpcnt(gc_pcnt)
		elif self.stage == 'PostDMPX':
			self.sequencepair_object.set_postdmpx_readcount(read_count)
			self.sequencepair_object.set_postdmpx_fastqc(target_file)
			self.sequencepair_object.set_postdmpx_pbsq(pbsq)
			self.sequencepair_object.set_postdmpx_gcpcnt(gc_pcnt)
		elif self.stage == 'PostTrim':
			self.sequencepair_object.set_posttrim_readcount(read_count)
			self.sequencepair_object.set_posttrim_fastqc(target_file)
			self.sequencepair_object.set_posttrim_pbsq(pbsq)
			self.sequencepair_object.set_posttrim_gcpcnt(gc_pcnt)
def fastqc_parser(fastqcDir, filename):
    '''FASTQC PARSER
    Extracts info from fastqc output files --> currently uses FADAPA parser

    :param fastqcDir: directory containing one or more *_fastqc output dirs
    :param filename: report file name to look for inside each (e.g. fastqc_data.txt)
    :return: dict mapping base file name -> [number of sequences, longest read]
    '''
    fastqcD = {}
    fastqcF = sorted(glob.glob(os.path.join(fastqcDir, "*_fastqc", filename)))
    for f in fastqcF:
        basicStats = Fadapa(f).clean_data('Basic Statistics')
        ## reported file name minus its '.f*' (fastq/fq) suffix
        fName = basicStats[1][1].rsplit('.f')[0]
        numSeqs = basicStats[4][1]
        ## reuse the already-parsed table rather than re-running clean_data
        longestRead = basicStats[6][1]  #.rsplit('-')[1]
        fastqcD[fName] = [numSeqs, longestRead]
    return fastqcD
Example #4
0
def parse_fastqc(fqc):
    """Parse a fastqc_data.txt file into a 'Basic Statistics' measure->value dict."""
    assert op.isfile(fqc), "%s not exist" % fqc
    report = Fadapa(fqc)
    return {row[0]: row[1] for row in report.clean_data('Basic Statistics')}
Example #5
0
def parse_fastqc(fqc):
    """Return the FastQC 'Basic Statistics' table as a measure -> value mapping."""
    assert op.isfile(fqc), "%s not exist" % fqc
    rows = Fadapa(fqc).clean_data('Basic Statistics')
    return dict((row[0], row[1]) for row in rows)
Example #6
0
    def format_fastqc(self, rawDataPath, currSample):
        """Render the FastQC HTML report template for one sample.

        Scrapes module pass/warn/fail statuses and the Basic Statistics
        table from a FastQC data file (via Fadapa) and substitutes them
        into every line of the fastqc.html template.

        :param rawDataPath: path to a fastqc_data.txt file
        :param currSample: sample identifier (unused here, kept for interface)
        :return: the fully formatted HTML report as one string
        """
        ## FastQC templates
        fastqc_template = os.path.join(self.TEMPLATES_BASE, 'fastqc.html')

        ## Object to get data from FastQC output
        fqc_object = Fadapa(rawDataPath)

        ## Module statuses, in FastQC's fixed summary order (rows 1..11)
        module_summary = fqc_object.summary()
        (module_stats, module_pbsq, module_ptsq, module_psqs, module_pbsc,
         module_psgcc, module_pbnc, module_seqlendist, module_seqdup,
         module_overrep, module_adapter) = [
            row[0] for row in module_summary[1:12]]

        ## Basic statistics rows are [measure, value]; rows 1..7
        basic_stats = fqc_object.clean_data('Basic Statistics')
        (file_name, file_type, encoding, total_sequences, poor_quality,
         seq_len, gc_pcnt) = [row[1] for row in basic_stats[1:8]]

        ## Format each template line; 'with' guarantees the handle closes on
        ## any exception, and joining a list avoids quadratic concatenation
        rendered_lines = []
        with open(fastqc_template, 'r') as template:
            for line in template:
                rendered_lines.append(line.format(
                    MODULE_STATS=module_stats, MODULE_PBSQ=module_pbsq,
                    MODULE_PTSQ=module_ptsq, MODULE_PSQS=module_psqs,
                    MODULE_PBSC=module_pbsc, MODULE_PSGCC=module_psgcc,
                    MODULE_PBNC=module_pbnc, MODULE_SEQLENDIST=module_seqlendist,
                    MODULE_SEQDUP=module_seqdup, MODULE_OVERREP=module_overrep,
                    MODULE_ADAPTER=module_adapter, FQC_FILENAME=file_name,
                    FQC_FILETYPE=file_type, FQC_ENCODING=encoding,
                    FQC_TOTALSEQ=total_sequences, FQC_POORQUAL=poor_quality,
                    FQC_SEQLEN=seq_len, FQC_GCPCNT=gc_pcnt))

        ## return formatted FastQC report
        return ''.join(rendered_lines)
Example #7
0
class TestFadapa(unittest.TestCase):
    """Unit tests for the Fadapa FastQC report parser."""

    def setUp(self):
        # Parse the checked-in fixture report for each test.
        self.p_data = Fadapa('tests/fastqc_data.txt')

    def test_summary(self):
        summary = self.p_data.summary()
        self.assertEqual(summary[0], ['Module Name', 'Status'])

    def test_content(self):
        # Redirect stdout to capture content()'s output, and restore it in a
        # finally block so a failure cannot leak the StringIO redirect into
        # later tests (the original never restored sys.stdout).
        saved_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            self.p_data.content()
            self.assertEqual(sys.stdout.getvalue()[:8], '##FastQC')
        finally:
            sys.stdout = saved_stdout

    def test_raw_data(self):
        data = self.p_data.raw_data('Basic Statistics')
        # Raw module dumps retain the module terminator line.
        self.assertEqual(data[-1], '>>END_MODULE')

    def test_cleaned_data(self):
        data = self.p_data.clean_data('Basic Statistics')
        # First cleaned row is the table header.
        self.assertEqual(data[0][0], 'Measure')
Example #8
0
class TestFadapa(unittest.TestCase):
    """Unit tests for the Fadapa FastQC report parser."""

    def setUp(self):
        # Parse the checked-in fixture report for each test.
        self.p_data = Fadapa('tests/fastqc_data.txt')

    def test_summary(self):
        summary = self.p_data.summary()
        self.assertEqual(summary[0], ['Module Name', 'Status'])

    def test_content(self):
        # Capture stdout while content() prints, restoring the real stream in
        # a finally block; the original leaked the StringIO redirect into any
        # test that ran afterwards.
        saved_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            self.p_data.content()
            self.assertEqual(sys.stdout.getvalue()[:8], '##FastQC')
        finally:
            sys.stdout = saved_stdout

    def test_raw_data(self):
        data = self.p_data.raw_data('Basic Statistics')
        # Raw module dumps retain the module terminator line.
        self.assertEqual(data[-1], '>>END_MODULE')

    def test_cleaned_data(self):
        data = self.p_data.clean_data('Basic Statistics')
        # First cleaned row is the table header.
        self.assertEqual(data[0][0], 'Measure')
Example #9
0
    def format_fastqc(self, rawDataPath, currSample):
        """Render the FastQC HTML report template for one sample.

        Scrapes module pass/warn/fail statuses and the Basic Statistics
        table from a FastQC data file (via Fadapa) and substitutes them
        into every line of the fastqc.html template.

        :param rawDataPath: path to a fastqc_data.txt file
        :param currSample: sample identifier (unused here, kept for interface)
        :return: the fully formatted HTML report as one string
        """
        ## FastQC templates
        fastqc_template = os.path.join(self.TEMPLATES_BASE, 'fastqc.html')

        ## Object to get data from FastQC output
        fqc_object = Fadapa(rawDataPath)

        ## Module statuses, in FastQC's fixed summary order (rows 1..11)
        module_summary = fqc_object.summary()
        (module_stats, module_pbsq, module_ptsq, module_psqs, module_pbsc,
         module_psgcc, module_pbnc, module_seqlendist, module_seqdup,
         module_overrep, module_adapter) = [
            row[0] for row in module_summary[1:12]]

        ## Basic statistics rows are [measure, value]; rows 1..7
        basic_stats = fqc_object.clean_data('Basic Statistics')
        (file_name, file_type, encoding, total_sequences, poor_quality,
         seq_len, gc_pcnt) = [row[1] for row in basic_stats[1:8]]

        ## Format each template line; 'with' guarantees the handle closes on
        ## any exception, and joining a list avoids quadratic concatenation
        rendered_lines = []
        with open(fastqc_template, 'r') as template:
            for line in template:
                rendered_lines.append(line.format(
                    MODULE_STATS=module_stats, MODULE_PBSQ=module_pbsq,
                    MODULE_PTSQ=module_ptsq, MODULE_PSQS=module_psqs,
                    MODULE_PBSC=module_pbsc, MODULE_PSGCC=module_psgcc,
                    MODULE_PBNC=module_pbnc, MODULE_SEQLENDIST=module_seqlendist,
                    MODULE_SEQDUP=module_seqdup, MODULE_OVERREP=module_overrep,
                    MODULE_ADAPTER=module_adapter, FQC_FILENAME=file_name,
                    FQC_FILETYPE=file_type, FQC_ENCODING=encoding,
                    FQC_TOTALSEQ=total_sequences, FQC_POORQUAL=poor_quality,
                    FQC_SEQLEN=seq_len, FQC_GCPCNT=gc_pcnt))

        ## return formatted FastQC report
        return ''.join(rendered_lines)
Example #10
0
name = sys.argv[1]

# Load this sample's FastQC data file into the fadapa parser
# NOTE(review): path is hard-coded to one user's download directory
f = Fadapa('/home/graham/Downloads/trial/' + name + '_fastqc/fastqc_data.txt')

# First raw line of the module; a bare "pass" header means nothing was flagged
pass_seq = f.raw_data('Overrepresented sequences')[0]

# Fadapa's clean parser breaks on an empty module, so only call clean_data
# when the raw header shows the module has content
list_of_seqs = []
if pass_seq != ">>Overrepresented sequences	pass":
    # Column 0 holds '#Sequence' on the header row, then the sequences
    list_of_seqs = [data[0] for data in f.clean_data('Overrepresented sequences')]

# Skip the header entry and render each sequence as a cutadapt '-A'
# argument (adapter trimming for the second read of the pair)
output = ''.join(' -A ' + seq for seq in list_of_seqs[1:])

# Written to stdout so the calling bash script can capture it
print(output)
Example #11
0
def reorganize(sample_dir):
    """Collate a sample's pipeline outputs into an LSARP_Results/ tree.

    Creates (if needed) <sample_dir>/LSARP_Results/ and fills it with
    tables and copied files scraped from the FastQC, Centrifuge, AMRP,
    MLST, Assembly, GAEMR, Pilon and StrainGST directories under
    *sample_dir*. Each section logs its progress and, when its inputs are
    missing, logs an error and continues rather than aborting. On
    completion writes <sample_dir>/LSARP.txt as a success marker.

    :param sample_dir: path to an existing per-sample results directory
    :raises RuntimeError: if *sample_dir* is not a directory
    """
    try:
        assert (os.path.isdir(sample_dir))
    except:
        sys.stderr.write(
            "ERROR: Sample directory doesn't seem to exist! Exiting now ...\n")
        raise RuntimeError

    # normalise to an absolute path with a trailing slash; all later paths
    # are built by plain string concatenation onto this prefix
    sample_dir = os.path.abspath(sample_dir) + '/'

    # set up directory structure
    workspace_name = "LSARP_Results/"
    workspace = sample_dir + workspace_name
    if not os.path.isdir(workspace):
        workspace = uF.setupDirectory(sample_dir, workspace_name)

    # create logging object
    log_file = workspace + 'LSARP_Table_Creation.log'
    logObject = uF.createLoggerObject(log_file)

    # sample name = last real path component (index -2 due to trailing '/')
    sample = sample_dir.split('/')[-2]
    logObject.info("Creating easy upload formats for sample %s", sample)
    logObject.info("-" * 80)

    # FASTQC Tables

    logObject.info('Creating FastQC Data Tables.')
    logObject.info('-' * 80)

    FastQC_results = 'FastQC/'
    FastQC_results_workspace = workspace + FastQC_results
    fastqc_modules = [
        'Per base sequence quality', 'Per tile sequence quality',
        'Per sequence quality scores', 'Per base sequence content',
        'Per sequence GC content', 'Per base N content',
        'Sequence Length Distribution', 'Sequence Duplication Levels',
        'Overrepresented sequences', 'Adapter Content'
    ]

    try:
        fastqc_zipped_data_dirs = [
            sample_dir + 'FastQC/' + zd
            for zd in os.listdir(sample_dir + 'FastQC/') if zd.endswith('.zip')
        ]
        assert (len(fastqc_zipped_data_dirs) > 0)
        for zd in fastqc_zipped_data_dirs:
            assert (os.path.isfile(zd))
        if not os.path.isdir(FastQC_results_workspace):
            FastQC_results_workspace = uF.setupDirectory(
                workspace, FastQC_results)
    except:
        # NOTE(review): bare except also hides unexpected failures, not just
        # the missing-input case — consider (OSError, AssertionError)
        logObject.error(
            'No FastQC results available or path is unable to be determined!')
    else:
        # extract each fastqc_data.txt to a temp file, then dump each module
        # table with 'sample' and 'read' columns prepended to every row
        for zd in fastqc_zipped_data_dirs:
            with zipfile.ZipFile(zd) as z:
                for filename in z.namelist():
                    if filename.split('/')[-1] == 'fastqc_data.txt':
                        with z.open(filename) as fh:
                            FastQC_tmp_out = open(
                                FastQC_results_workspace + 'tmp.txt', 'wb')
                            for line in fh:
                                FastQC_tmp_out.write(line)
                            FastQC_tmp_out.close()
                            fadapa = Fadapa(FastQC_results_workspace +
                                            'tmp.txt')
                            for module in fastqc_modules:
                                try:
                                    table_file = '_'.join(module.split())
                                    cleaned_module_data = fadapa.clean_data(
                                        module)
                                    if cleaned_module_data:
                                        table_handle = open(
                                            FastQC_results_workspace +
                                            table_file + '.table.txt', 'w')
                                        for i, split_line in enumerate(
                                                cleaned_module_data):
                                            if i == 0:
                                                split_line = [
                                                    'sample', 'read'
                                                ] + split_line
                                            else:
                                                # read label comes from the ZIP
                                                # name: <sample>_<read>_fastqc.zip
                                                split_line = [
                                                    sample_dir.split('/')[-2],
                                                    zd.split('/')[-1].split(
                                                        sample_dir.split('/')
                                                        [-2] + '_')[1].split(
                                                            '_fastqc.zip')
                                                    [0].split('.')[0]
                                                ] + split_line
                                            table_handle.write(
                                                '\t'.join(split_line) + '\n')
                                        table_handle.close()
                                except:
                                    # modules absent from this report are skipped
                                    pass
                            os.system('rm -f %s' % FastQC_results_workspace +
                                      'tmp.txt')
    logObject.info('*' * 80)

    # Centrifuge Tables

    logObject.info('Creating Centrifuge Data Tables.')
    logObject.info('-' * 80)

    Centrifuge_results = 'Centrifuge/'
    Centrifuge_results_workspace = workspace + Centrifuge_results

    centrifuge_report_file = sample_dir + 'Centrifuge/' + sample_dir.split(
        '/')[-2] + '_centrifuge_report.tsv'
    kraken_report_file = sample_dir + 'Centrifuge/' + sample_dir.split(
        '/')[-2] + '_centrifuge_kraken_report.txt'

    try:
        assert (os.path.isfile(centrifuge_report_file)
                and os.path.isfile(kraken_report_file))
        if not os.path.isdir(Centrifuge_results_workspace):
            Centrifuge_results_workspace = uF.setupDirectory(
                workspace, Centrifuge_results)

        centrifuge_report_table_file = Centrifuge_results_workspace + 'centrifuge_report.table.txt'
        centrifuge_report_table_handle = open(centrifuge_report_table_file,
                                              'w')

        # index the centrifuge report by taxonomy name; taxa present only in
        # the kraken report fall back to six 'NA' fields in the join below
        centrifuge_report_data = defaultdict(lambda: ['NA'] * 6)
        for i, line in enumerate(open(centrifuge_report_file)):
            if i > 0:
                line = line.rstrip('\n')
                name, taxID, taxRank, genomeSize, numReads, numUniqueReads, abundance = line.split(
                    '\t')
                centrifuge_report_data[name] = [
                    taxID, taxRank, genomeSize, numReads, numUniqueReads,
                    abundance
                ]

        header = [
            'sample', 'taxonomy_name', 'taxonomy_level', 'taxonomy_rank',
            'taxonomy_id', 'genome_size', 'centrifuge_abundance',
            'percentage_of_fragments_recursively_covered',
            'number_of_fragments_recursively_included',
            'number_of_fragments_direct'
        ]
        centrifuge_report_table_handle.write('\t'.join(header) + '\n')
        for i, line in enumerate(open(kraken_report_file)):
            line = line.rstrip('\n')
            prop, frag_recurse, frag_direct, tax_level, tax_id = line.split(
            )[:5]
            # everything after the fifth whitespace field is the taxon name
            tax = ' '.join(line.split()[5:]).strip()
            taxID, taxRank, genomeSize, numReads, numUniqueReads, abundance = centrifuge_report_data[
                tax]
            centrifuge_report_table_handle.write('\t'.join([
                sample_dir.split('/')[-2], tax, tax_level, taxRank, taxID,
                genomeSize, abundance, prop, frag_recurse, frag_direct
            ]) + '\n')

        centrifuge_report_table_handle.close()
    except:
        logObject.error('No Centrifuge results available!')

    logObject.info('*' * 80)

    # AMRP Tables

    logObject.info('Moving Results from ARIBA and ShortBRED AMR Searches.')
    logObject.info('-' * 80)

    AMRP_results = 'AMRP_Searches/'
    AMRP_results_workspace = workspace + AMRP_results

    try:
        AMRP_dir = sample_dir + 'AMRP/'
        assert (os.path.isdir(AMRP_dir))
        # NOTE(review): 'workspace + AMRP_dir' concatenates two full paths,
        # so this isdir test is presumably always False and setupDirectory
        # always runs — was 'workspace + AMRP_results' intended? TODO confirm
        if not os.path.isdir(workspace + AMRP_dir):
            AMRP_results_workspace = uF.setupDirectory(workspace, AMRP_results)
        for sd in os.listdir(AMRP_dir):
            ariba_dir = AMRP_dir + sd + '/'
            ariba_report = ariba_dir + 'report.tsv'
            if os.path.isfile(ariba_report):
                ariba_result = AMRP_results_workspace + sample_dir.split(
                    '/')[-2] + '_' + sd + '_ariba_results.txt'
                os.system('cp %s %s' % (ariba_report, ariba_result))
    except:
        logObject.error('Unable to create AMR prediction data tables.'
                        )  # Raising exception now ...')

    logObject.info('*' * 80)

    # MLST Tables

    logObject.info('Creating MLST Data Tables.')
    logObject.info('-' * 80)

    MLST_results = 'MLST/'
    MLST_results_workspace = workspace + MLST_results

    try:
        MLST_dir = sample_dir + 'MLST/'
        MLST_result_file = MLST_dir + 'ariba_mlst/mlst_report.tsv'

        if not os.path.isdir(MLST_results_workspace):
            MLST_results_workspace = uF.setupDirectory(workspace, MLST_results)
        os.system('cp %s %s' % (MLST_result_file, MLST_results_workspace))

    except:
        logObject.error('Unable to create MLST call data tables.'
                        )  # Raising exception now ...')
        #raise RuntimeError

    logObject.info('*' * 80)

    # De Novo Assembly Storage

    logObject.info('Moving de novo assembly to results directory.')
    logObject.info('-' * 80)

    Assembly_results = 'Assembly/'
    Assembly_results_workspace = workspace + Assembly_results

    try:
        Assembly_dir = sample_dir + 'Assembly/'
        # fall back to scaffolds.fasta when assembly.fasta is absent
        Assembly_original_location = Assembly_dir + 'assembly.fasta'
        if not os.path.isfile(Assembly_original_location):
            Assembly_original_location = Assembly_dir + 'scaffolds.fasta'
        assert (os.path.isfile(Assembly_original_location))
        if not os.path.isdir(Assembly_results_workspace):
            Assembly_results_workspace = uF.setupDirectory(
                workspace, Assembly_results)
        Assembly_new_location = Assembly_results_workspace + sample_dir.split(
            '/')[-2] + '.genome.fa'
        os.system('cp %s %s' %
                  (Assembly_original_location, Assembly_new_location))
    except:
        logObject.error('Unable to move assembly to results directory.')

    logObject.info('*' * 80)

    # Assembly QC Storage

    logObject.info('Moving GAEMR assembly QC to results directory.')
    logObject.info('-' * 80)

    try:
        Assembly_QC_new_location = workspace + 'Assembly_QC/'
        Assembly_QC_original_dir = sample_dir + 'GAEMR/QC/'
        assert (os.path.isdir(Assembly_QC_original_dir))
        os.system('cp -r %s %s' %
                  (Assembly_QC_original_dir, Assembly_QC_new_location))

    except:
        logObject.error(
            'Unable to move GAEMR assembly QC to results directory.')

    logObject.info('*' * 80)

    # Pilon Results Storage

    logObject.info('Moving Pilon output to results directory.')
    logObject.info('-' * 80)

    try:
        Pilon_new_dir = workspace + 'Reference_Assembly_and_Variant_Calling/'
        Pilon_original_dir = sample_dir + 'Pilon/results/'
        assert (os.path.isdir(Pilon_original_dir))
        os.system('cp -r %s %s' % (Pilon_original_dir, Pilon_new_dir))
        os.system('gzip %s*' % Pilon_new_dir)

    except:
        logObject.error('Unable to move Pilon output to results directory.')

    logObject.info('*' * 80)

    # StrainGST Results Storage

    logObject.info('Moving StrainGST output to results directory.')
    logObject.info('-' * 80)

    try:
        Straingst_result_file = sample_dir + 'StrainGST/' + sample + '.straingst_result.tsv'
        assert (os.path.isfile(Straingst_result_file))
        Straingst_new_dir = 'StrainGST/'
        Straingst_results_workspace = workspace + Straingst_new_dir
        if not os.path.isdir(workspace + Straingst_new_dir):
            Straingst_results_workspace = uF.setupDirectory(
                workspace, Straingst_new_dir)
        os.system('cp %s %s' %
                  (Straingst_result_file, Straingst_results_workspace))

    except:
        logObject.error(
            'Unable to move StrainGST output to results directory.')

    logObject.info('*' * 80)

    uF.closeLoggerObject(logObject)

    # create successful completion file if steps completed!
    conf_file = open(sample_dir + "LSARP.txt", 'w')
    conf_file.write("LSARP Table Creation: Module Completed Succesfully!")
    conf_file.close()
Example #12
0
    def format_fastqc_graphs(self, rawDataPath, currSample):
        """Collect FastQC module data as stringified lists for graph templates.

        Extracts per-base sequence quality, per-base N content and sequence
        length distribution from the given FastQC data file and packs each
        into label/value strings alongside titles and axis captions.
        """
        source = Fadapa(rawDataPath)
        out = {}

        pbsq = source.clean_data('Per base sequence quality')
        pbnc = source.clean_data('Per base N content')
        seqlen = source.clean_data('Sequence Length Distribution')

        ##
        ## Per base sequence quality: one box-plot entry per position bin,
        ## ordered min / lower-quartile / median / upper-quartile / max,
        ## plus the running mean per bin.
        bin_labels = []
        bin_boxes = []
        bin_means = []
        for row in pbsq[1:]:
            bin_labels.append(row[0])
            bin_means.append(int(float(row[1])))
            box = []
            for cell in (row[5], row[3], row[2], row[4], row[6]):
                ## FastQC reports 'NaN' for empty bins; coerce those to zero
                box.append(0 if cell == 'NaN' else int(float(cell)))
            bin_boxes.append(box)
        out['PBSQ_TITLE'] = 'FastQC Per base sequence quality'
        out['PBSQ_LABELS'] = str(bin_labels)
        out['PBSQ_VALUES'] = str(bin_boxes)
        out['PBSQ_MEANVAL'] = str(bin_means)
        out['PBSQ_DESCR'] = 'Per base sequence quality'
        out['PBSQ_X'] = 'Position in read (BP)'
        out['PBSQ_Y'] = 'PHRED quality score'

        ##
        ## Per base N content: one label/value pair per position
        out['PBNC_TITLE'] = 'FastQC Per base N content for {}'.format(
            currSample)
        out['PBNC_LABELS'] = str([row[0] for row in pbnc[1:]])
        out['PBNC_VALUES'] = str([row[1] for row in pbnc[1:]])
        out['PBNC_DESCR'] = 'N content per base'
        out['PBNC_X'] = 'Position in read (BP)'
        out['PBNC_Y'] = 'Percentage content (%)'

        ##
        ## Sequence length distribution: population per length bin
        out['SQLD_TITLE'] = 'FastQC Sequence length distribution for {}'.format(
            currSample)
        out['SQLD_LABELS'] = str([row[0] for row in seqlen[1:]])
        out['SQLD_VALUES'] = str([row[1] for row in seqlen[1:]])
        out['SQLD_DESCR'] = 'Sequence length population'
        out['SQLD_X'] = 'Sequence length (BP)'
        out['SQLD_Y'] = 'Population (#)'

        return out
Example #13
0
    def _dump_raw_section(self, fq, module, path, markers):
        """Write a FastQC module's raw data lines to *path* for the report.

        Lines starting with any prefix in *markers* (module header, column
        header, '>>END' terminator) are counted rather than written; the
        dump stops once more than three marker lines have been seen.
        """
        marker_count = 0
        with open(path, "w") as handle:
            for data in fq.raw_data(module):
                if marker_count > 3:
                    break
                if data.startswith(markers):
                    marker_count = marker_count + 1
                else:
                    handle.write(data)
                    handle.write("\n")

    def fastqc_result(self, r1, r2, sample, output, type):
        """Collate FastQC results for a read pair into a JSON-ready dict.

        Dumps raw per-sequence quality, per-base quality and length
        distribution tables under <output>/data/stat/ for the report, then
        aggregates quality/length data from both reads. When *type* is
        "raw", flags mean read quality as pass/warn (pass when >90% of
        reads have mean Phred quality >= 30) and bumps self._qc_pass_count
        on a pass.

        :param r1/r2: paths to the fastqc_data.txt files for read 1/read 2
        :param sample: sample name used in output file names
        :param output: output directory root
        :param type: dataset stage label, e.g. "raw"
        :return: dict of summary/quality/pass data for this pair
        """
        fq1 = Fadapa(r1)
        fq2 = Fadapa(r2)
        fastqc = {}
        fastqc_summary = {}
        fastqc_pass = {}
        fastqc_per_base_quality = {}
        # NOTE(review): fastqc_per_sequence_quality and
        # fastqc_sequence_length_distribution are returned but never
        # populated — only their _r1/_r2 counterparts are filled. Preserved
        # as-is since consumers may rely on the empty values; TODO confirm.
        fastqc_per_sequence_quality = {}
        fastqc_per_sequence_quality_r1 = {}
        fastqc_per_sequence_quality_r2 = {}
        fastqc_sequence_length_distribution = {}
        fastqc_sequence_length_distribution_r1 = {}
        fastqc_sequence_length_distribution_r2 = {}

        ## Raw module dumps for the report (shared logic in _dump_raw_section)
        quality_markers = ('>>Per', '#Qual', '>>END')
        self._dump_raw_section(
            fq1, 'Per sequence quality scores',
            "%s/data/stat/%s.fastq_quality_score_r1.txt" % (output, sample),
            quality_markers)
        self._dump_raw_section(
            fq2, 'Per sequence quality scores',
            "%s/data/stat/%s.fastq_quality_score_r2.txt" % (output, sample),
            quality_markers)
        self._dump_raw_section(
            fq1, 'Per base sequence quality',
            "%s/data/stat/%s.base_sequence_quality.txt" % (output, sample),
            ('>>Per', '#Base', '>>END'))
        self._dump_raw_section(
            fq1, 'Sequence Length Distribution',
            "%s/data/stat/%s.sequence_length_distribution.txt" %
            (output, sample),
            ('>>Seq', '#', '>>END'))

        ## Length distribution per read, skipping the header row
        for data in fq1.clean_data('Sequence Length Distribution'):
            if data[0] != "Length":
                fastqc_sequence_length_distribution_r1[data[0]] = data[1]
        for data in fq2.clean_data('Sequence Length Distribution'):
            if data[0] != "Length":
                fastqc_sequence_length_distribution_r2[data[0]] = data[1]

        ## Per-sequence quality: tally totals and reads at Phred >= 30
        total_reads_r1 = 0
        total_reads_r2 = 0
        above_30_r1 = 0
        above_30_r2 = 0
        for data in fq1.clean_data('Per sequence quality scores'):
            if data[0] != "Quality":
                fastqc_per_sequence_quality_r1[data[0]] = data[1]
                total_reads_r1 = total_reads_r1 + float(data[1])
                if int(data[0]) >= 30:
                    above_30_r1 = above_30_r1 + float(data[1])
        for data in fq2.clean_data('Per sequence quality scores'):
            if data[0] != "Quality":
                fastqc_per_sequence_quality_r2[data[0]] = data[1]
                total_reads_r2 = total_reads_r2 + float(data[1])
                if int(data[0]) >= 30:
                    above_30_r2 = above_30_r2 + float(data[1])

        ## Percentage of reads with mean Phred base quality above 30
        mean_read_quality_percentage = (above_30_r1 + above_30_r2) / (
            total_reads_r1 + total_reads_r2) * 100
        mean_read_quality = {
            'total_read': total_reads_r1 + total_reads_r2,
            'above_30': above_30_r1 + above_30_r2,
            'percentage': mean_read_quality_percentage,
        }
        if type == "raw":
            if 90 < mean_read_quality_percentage:
                mean_read_quality['message'] = "pass"
                self._qc_pass_count = self._qc_pass_count + 1
            else:
                ## (dead local 'message' assignment removed)
                mean_read_quality['message'] = "warn"

        ## Per-base quality: base -> "base:mean:median"
        # NOTE(review): the header guard compares against "base_Base"; the
        # FastQC column header here looks like it should be "Base" — confirm
        # against Fadapa's clean_data output.
        for data in fq1.clean_data('Per base sequence quality'):
            if data[0] != "base_Base":
                fastqc_per_base_quality[data[0]] = "{0}:{1}:{2}".format(
                    data[0], data[1], data[2])

        ## Basic statistics, skipping the header row
        for data in fq1.clean_data('Basic Statistics'):
            if data[0] != "Measure":
                fastqc_summary[data[0]] = data[1]

        ## Module pass/warn/fail statuses keyed by module name
        for data in fq1.summary():
            if data[1] != "Module Name":
                fastqc_pass[data[1]] = data[0]

        fastqc['mean_read_quality'] = mean_read_quality
        fastqc['fastqc_sequence_length_distribution'] = fastqc_sequence_length_distribution
        fastqc['per_sequence_quality_score'] = fastqc_per_sequence_quality
        fastqc['per_base_quality'] = fastqc_per_base_quality
        fastqc['summary'] = fastqc_summary
        fastqc['pass'] = fastqc_pass
        fastqc['fastq_file_name'] = "%s-%s" % (r1, r2)
        return fastqc