コード例 #1
0
ファイル: IlluminaData.py プロジェクト: fw1121/genomics
    def _load_data(self, fp):
        """Internal: read sample sheet content into this object

        Walks through the lines supplied by 'fp', keeping track of
        the most recent '[...]' section heading, and stores each
        non-blank line according to the section it appears in.

        Arguments
          fp: file-like object opened for reading which contains
             sample sheet data

        Raises IlluminaDataError if content is found before any
        section heading, or inside a section that isn't one of
        'Header', 'Reads', 'Settings' or 'Data'.

        """
        current_section = None
        for lineno, raw in enumerate(fp):
            text = raw.rstrip()
            logging.debug(text)
            if not text:
                # Nothing to do for blank lines
                continue
            if text.startswith('['):
                # Looks like a section heading
                close = text.find(']')
                if close != -1:
                    current_section = text[1:close]
                    continue
                # No closing bracket: report it, then fall through so
                # the line is handled as content of the current section
                logging.error("Bad line (#%d): %s" % (lineno + 1, text))
            if current_section is None:
                # Content before any section heading
                raise IlluminaDataError("Not a valid IEM sample sheet?")
            if current_section == 'Header':
                # Comma-separated PARAM,VALUE lines
                self._set_param_value(text, self._header)
            elif current_section == 'Settings':
                # Comma-separated PARAM,VALUE lines
                self._set_param_value(text, self._settings)
            elif current_section == 'Reads':
                # One value per line, ignoring trailing commas
                read_value = text.rstrip(',')
                if read_value:
                    self._reads.append(read_value)
            elif current_section == 'Data':
                if self._data is not None:
                    # Subsequent lines are data rows
                    self._data.append(tabdata=text)
                else:
                    # The first line of the section supplies the
                    # column names for the TabFile
                    self._data = TabFile.TabFile(
                        column_names=text.split(','),
                        delimiter=',')
            else:
                raise IlluminaDataError(
                    "Unrecognised section '%s': not a valid IEM sample sheet?"
                    % current_section)
        if self._data is None:
            # No 'Data' section was stored, so nothing to tidy up
            return
        # Remove surrounding whitespace from the stored data items
        for row in self._data:
            for column in self._data.header():
                try:
                    row[column] = row[column].strip()
                except AttributeError:
                    # Values without a strip() method are left as they are
                    pass
コード例 #2
0
def annotate_feature_data(gff_lookup,feature_data_file,out_file):
    """Annotate feature data with gene information

    Reads in 'feature data' from a tab-delimited input file with feature
    IDs in the first column; outputs these data with data about the
    parent gene appended to each line.

    The annotation for each line is obtained by passing the value in
    the line's first column to 'gff_lookup.getAnnotation'.

    Arguments:
      gff_lookup         populated GFFAnnotationLookup instance
      feature_data_file  input data file with feature IDs in first column
      out_file           name of output file
    """
    # Output columns paired with the annotation attribute that supplies
    # each value; keeping them together ensures the appended columns and
    # the values written to them can't drift out of step
    annotation_columns = (
        ('exon_parent','parent_feature_name'),
        ('feature_type_exon_parent','parent_feature_type'),
        ('gene_ID','parent_feature_parent'),
        ('gene_name','parent_gene_name'),
        ('chr','chr'),
        ('start','start'),
        ('end','end'),
        ('strand','strand'),
        ('gene_length','gene_length'),
        ('locus','gene_locus'),
        ('description','description'),
    )

    # Read the feature data into a TabFile
    # (single-argument print(...) behaves the same on Python 2 and 3)
    print("Reading in data from %s" % feature_data_file)
    feature_data = TabFile.TabFile(filen=feature_data_file,
                                   first_line_is_header=True)

    # Append columns for annotation
    print("Appending columns for annotation")
    for colname,_ in annotation_columns:
        feature_data.appendColumn(colname)

    # Fill in the annotation columns for each line
    for line in feature_data:
        feature_ID = line[0]
        annotation = gff_lookup.getAnnotation(feature_ID)
        for colname,attr in annotation_columns:
            line[colname] = getattr(annotation,attr)

    # Output
    print("Writing output file %s" % out_file)
    feature_data.write(out_file,include_header=True,no_hash=True)
コード例 #3
0
def annotate_htseq_count_data(gff_lookup,htseq_files,out_file):
    """Annotate count data from htseq-count output with gene information

    Reads in data from one or more htseq-count output files and combines
    into a single tab-delimited output file where the counts for each
    feature have been appended to data about the parent gene.

    Also creates an output 'stats' file which combines the summary data
    from the tail of each htseq-count file. The name of the 'stats' file
    is the name of the main output file with '_stats' inserted before
    the extension.

    The feature IDs (and the names of the rows in the 'stats' file) are
    taken from the first file in 'htseq_files'; the values for every
    file are then looked up using those IDs/names. Indexing the first
    file means that an empty list results in an IndexError.

    Arguments:
      gff_lookup:  populated GFFAnnotationLookup instance
      htseq_files: list of output files from htseq-count to use as input
      out_file:    name of output file
    """
    # Output columns paired with the annotation attribute that supplies
    # each value, so the header and the data lines stay in step
    annotation_columns = (
        ('exon_parent','parent_feature_name'),
        ('feature_type_exon_parent','parent_feature_type'),
        ('gene_ID','parent_feature_parent'),
        ('gene_name','parent_gene_name'),
        ('chr','chr'),
        ('start','start'),
        ('end','end'),
        ('strand','strand'),
        ('gene_length','gene_length'),
        ('locus','gene_locus'),
        ('description','description'),
    )

    # Output files
    annotated_counts_out_file = out_file
    out_dir = os.path.dirname(annotated_counts_out_file)
    out_root = os.path.splitext(
        os.path.basename(annotated_counts_out_file))[0]
    out_ext = os.path.splitext(annotated_counts_out_file)[1]
    tables_out_file = os.path.join(out_dir,out_root+"_stats"+out_ext)

    # Process the HTSeq-count files
    # (single-argument print(...) behaves the same on Python 2 and 3)
    print("Processing HTSeq-count files")
    htseq_data = {}
    for htseqfile in htseq_files:
        print("\t%s" % htseqfile)
        htseq_data[htseqfile] = HTSeqCountFile(htseqfile)

    # Create a TabFile for output
    print("Building annotated count file for output")
    annotated_counts = TabFile.TabFile(
        column_names=[colname for colname,_ in annotation_columns])
    for htseqfile in htseq_files:
        annotated_counts.appendColumn(htseqfile)

    # The first file defines which features and table rows are reported
    reference_data = htseq_data[htseq_files[0]]

    # Combine feature counts and parent feature data
    for feature_ID in reference_data.feature_IDs():
        # Get annotation data
        annotation = gff_lookup.getAnnotation(feature_ID)
        # Build the data line
        data = [getattr(annotation,attr) for _,attr in annotation_columns]
        # Add the counts from each file
        for htseqfile in htseq_files:
            data.append(htseq_data[htseqfile].count(feature_ID))
        # Add to the tabfile
        annotated_counts.append(data=data)

    # Write the file
    print("Writing output file %s" % annotated_counts_out_file)
    annotated_counts.write(annotated_counts_out_file,
                           include_header=True,no_hash=True)

    # Make second file for the trailing table data
    print("Building trailing tables data file for output")
    table_counts = TabFile.TabFile(column_names=['count'])
    for htseqfile in htseq_files:
        table_counts.appendColumn(htseqfile)
    for name in reference_data.table():
        # Build the data line: row name followed by the value
        # from each file's table
        data = [name]
        for htseqfile in htseq_files:
            data.append(htseq_data[htseqfile].table()[name])
        table_counts.append(data=data)
    print("Writing output file %s" % tables_out_file)
    table_counts.write(tables_out_file,include_header=True,no_hash=True)