def _load_data(self, fp):
    """Internal: populate with data from external file

    Reads the sample sheet line by line, keeping track of the most
    recent '[Section]' heading and routing every non-blank line to
    the store for that section:

    - 'Header' and 'Settings' lines are handed to '_set_param_value'
      together with '_header' or '_settings' respectively
    - 'Reads' lines have trailing commas removed and, if anything is
      left, are appended to '_reads'
    - 'Data' lines are collected in a comma-delimited TabFile; the
      first 'Data' line supplies the column names

    Once all lines have been read, every item in the 'Data' TabFile
    has surrounding whitespace stripped.

    A line that starts with '[' but has no closing ']' is reported
    via 'logging.error' and is then handled as an ordinary line of
    the current section.

    Arguments
      fp: file-like object opened for reading which contains
        sample sheet data

    Raises
      IlluminaDataError: if a content line is found before any
        section heading, or inside a section other than 'Header',
        'Reads', 'Settings' or 'Data'

    """
    current = None
    for lineno, text in enumerate(fp, 1):
        text = text.rstrip()
        logging.debug(text)
        if not text:
            # Nothing to do for blank lines
            continue
        if text.startswith('['):
            # Looks like a section heading
            closing = text.find(']')
            if closing != -1:
                current = text[1:closing]
                continue
            # No closing bracket: report it, then fall through so the
            # line is handled as content of the current section
            logging.error("Bad line (#%d): %s" % (lineno, text))
        if current is None:
            # Content before the first section heading
            raise IlluminaDataError("Not a valid IEM sample sheet?")
        if current == 'Header':
            # Comma-separated PARAM,VALUE line
            self._set_param_value(text, self._header)
        elif current == 'Settings':
            # Comma-separated PARAM,VALUE line
            self._set_param_value(text, self._settings)
        elif current == 'Reads':
            # One value per line, ignoring trailing commas
            value = text.rstrip(',')
            if value:
                self._reads.append(value)
        elif current == 'Data':
            if self._data is None:
                # First data line provides the column names
                self._data = TabFile.TabFile(column_names=text.split(','),
                                             delimiter=',')
            else:
                self._data.append(tabdata=text)
        else:
            raise IlluminaDataError(
                "Unrecognised section '%s': not a valid IEM sample sheet?"
                % current)
    if self._data is None:
        # No 'Data' section was seen, so nothing to tidy up
        return
    # Remove surrounding whitespace from the data items
    for row in self._data:
        for column in self._data.header():
            try:
                row[column] = row[column].strip()
            except AttributeError:
                # Items without a 'strip' method are left as they are
                pass
def annotate_feature_data(gff_lookup,feature_data_file,out_file):
    """Add parent gene annotation to a file of feature data

    Loads 'feature data' from a tab-delimited input file (the first
    line is treated as a header and the feature ID is taken from the
    first column of each line), looks up the annotation for each
    feature ID, and writes the data back out with the annotation
    appended to each line as additional columns.

    Arguments:
      gff_lookup         populated GFFAnnotationLookup instance
      feature_data_file  input data file with feature IDs in first column
      out_file           name of output file

    """
    # Pairs of (output column name, attribute of the annotation object
    # that supplies its value); the order here is the column order
    annotation_columns = (
        ('exon_parent', 'parent_feature_name'),
        ('feature_type_exon_parent', 'parent_feature_type'),
        ('gene_ID', 'parent_feature_parent'),
        ('gene_name', 'parent_gene_name'),
        ('chr', 'chr'),
        ('start', 'start'),
        ('end', 'end'),
        ('strand', 'strand'),
        ('gene_length', 'gene_length'),
        ('locus', 'gene_locus'),
        ('description', 'description'),
    )
    # Load the input data
    print("Reading in data from %s" % feature_data_file)
    feature_data = TabFile.TabFile(filen=feature_data_file,
                                   first_line_is_header=True)
    # Make room for the annotation
    print("Appending columns for annotation")
    for column, _ in annotation_columns:
        feature_data.appendColumn(column)
    # Fill in the annotation for each feature
    for row in feature_data:
        annotation = gff_lookup.getAnnotation(row[0])
        for column, attr in annotation_columns:
            row[column] = getattr(annotation, attr)
    # Save the result
    print("Writing output file %s" % out_file)
    feature_data.write(out_file,include_header=True,no_hash=True)
def annotate_htseq_count_data(gff_lookup,htseq_files,out_file):
    """Combine htseq-count output files with parent gene information

    Loads each of the supplied htseq-count output files and writes a
    single output file in which every feature ID reported by the
    first input file gets one line: the annotation for that feature
    followed by its count from each input file in turn.

    A second 'stats' file is written alongside, named by inserting
    '_stats' in front of the extension of 'out_file'. It has one line
    for each entry of the first input file's trailing table, giving
    the value of that entry in each input file.

    Arguments:
      gff_lookup: populated GFFAnnotationLookup instance
      htseq_files: list of output files from htseq-count to use as
        input
      out_file: name of output file

    """
    # Pairs of (output column name, attribute of the annotation object
    # that supplies its value); the order here is the column order
    annotation_columns = (
        ('exon_parent', 'parent_feature_name'),
        ('feature_type_exon_parent', 'parent_feature_type'),
        ('gene_ID', 'parent_feature_parent'),
        ('gene_name', 'parent_gene_name'),
        ('chr', 'chr'),
        ('start', 'start'),
        ('end', 'end'),
        ('strand', 'strand'),
        ('gene_length', 'gene_length'),
        ('locus', 'gene_locus'),
        ('description', 'description'),
    )
    # Work out the output file names
    annotated_counts_out_file = out_file
    out_dir, out_name = os.path.split(annotated_counts_out_file)
    stem, extension = os.path.splitext(out_name)
    tables_out_file = os.path.join(out_dir, stem + "_stats" + extension)
    # Load the HTSeq-count files
    print("Processing HTSeq-count files")
    htseq_data = {}
    for htseqfile in htseq_files:
        print("\t%s" % htseqfile)
        htseq_data[htseqfile] = HTSeqCountFile(htseqfile)
    # Set up the annotated counts: annotation columns first, then one
    # column per input file
    print("Building annotated count file for output")
    annotated_counts = TabFile.TabFile(
        column_names=[column for column, _ in annotation_columns])
    for htseqfile in htseq_files:
        annotated_counts.appendColumn(htseqfile)
    # The first input file decides which features and which table
    # entries are reported
    reference = htseq_data[htseq_files[0]]
    for feature_ID in reference.feature_IDs():
        annotation = gff_lookup.getAnnotation(feature_ID)
        # Annotation values followed by the count from each file
        data = [getattr(annotation, attr) for _, attr in annotation_columns]
        data.extend(htseq_data[htseqfile].count(feature_ID)
                    for htseqfile in htseq_files)
        annotated_counts.append(data=data)
    # Save the annotated counts
    print("Writing output file %s" % annotated_counts_out_file)
    annotated_counts.write(annotated_counts_out_file,
                           include_header=True,no_hash=True)
    # Second file holds the trailing table data from each input file
    print("Building trailing tables data file for output")
    table_counts = TabFile.TabFile(column_names=['count'])
    for htseqfile in htseq_files:
        table_counts.appendColumn(htseqfile)
    for name in reference.table():
        # Entry name followed by its value in each file
        data = [name]
        data.extend(htseq_data[htseqfile].table()[name]
                    for htseqfile in htseq_files)
        table_counts.append(data=data)
    print("Writing output file %s" % tables_out_file)
    table_counts.write(tables_out_file,include_header=True,no_hash=True)