コード例 #1
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_append_line(self):
     """Check that a comma-delimited string can be appended as a line

     The appended line is expected to become the last line of the
     TabFile and to render back to the string it was built from.
     """
     new_line = 'chr3,10,9,8'
     tabfile = TabFile('test', self.fp, first_line_is_header=True,
                       delimiter=',')
     tabfile.append(tabdata=new_line)
     last_line = tabfile[-1]
     self.assertEqual(str(last_line), new_line)
コード例 #2
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_transpose_tab_file(self):
     """Check that transposing swaps the line and column counts
     """
     original = TabFile('test', self.fp, first_line_is_header=False)
     transposed = original.transpose()
     # Lines of the original should become columns of the transpose...
     self.assertEqual(len(original), transposed.nColumns())
     # ...and columns of the original should become its lines
     self.assertEqual(len(transposed), original.nColumns())
コード例 #3
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_append_line_as_data(self):
     """Check that a list of items can be appended as a new line

     The appended line should be the last one in the TabFile and should
     render as the items joined with the TabFile's delimiter (comma).
     """
     items = ['chr3', '10', '9', '8']
     tabfile = TabFile('test', self.fp, first_line_is_header=True,
                       delimiter=',')
     tabfile.append(data=items)
     last_line = str(tabfile[-1])
     expected = ','.join(map(str, items))
     self.assertEqual(last_line, expected)
コード例 #4
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_write_data_with_header(self):
     """Check that the header line is written out ahead of the data

     The TabFile is comma-delimited, so the expected output is the
     fixture header and data with tabs swapped for commas.
     """
     tabfile = TabFile('test', self.fp, first_line_is_header=True,
                       delimiter=',')
     output = cStringIO.StringIO()
     tabfile.write(fp=output, include_header=True)
     written = output.getvalue()
     expected_header = self.header.replace('\t', ',')
     expected_data = self.data.replace('\t', ',')
     self.assertEqual(written, expected_header + expected_data)
     output.close()
コード例 #5
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_add_tab_data_to_new_tabfile(self):
     """Check that a tab-delimited string can be appended to an empty TabFile
     """
     tabfile = TabFile()
     tab_line = 'chr1\t10000\t20000\t+'
     tabfile.append(tabdata=tab_line)
     # Exactly one line should exist, and it should round-trip to the input
     self.assertEqual(len(tabfile), 1, "TabFile should now have one line")
     first_line = tabfile[0]
     self.assertEqual(str(first_line), tab_line)
コード例 #6
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_reverse_sort_on_column(self):
     """Check sorting on the 'data' column with reverse=True

     After the sort the 'data' values should be in descending order.
     """
     def data_value(line):
         # Sort key: the value held in the 'data' column
         return line['data']

     expected_order = [6.8, 5.7, 3.4]
     tabfile = TabFile('test', self.fp, first_line_is_header=True)
     tabfile.sort(data_value, reverse=True)
     for idx in range(len(tabfile)):
         sorted_line = tabfile[idx]
         self.assertEqual(sorted_line['data'], expected_order[idx])
コード例 #7
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_write_data(self):
     """Check that writing to a file-like object reproduces the input data
     """
     output = cStringIO.StringIO()
     tabfile = TabFile('test', self.fp)
     tabfile.write(fp=output)
     written = output.getvalue()
     self.assertEqual(written, self.data)
     output.close()
コード例 #8
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_add_data_to_new_tabfile(self):
     """Check that a list of items can be appended to an empty TabFile
     """
     items = ['chr1', 10000, 20000, '+']
     tabfile = TabFile()
     tabfile.append(data=items)
     self.assertEqual(len(tabfile), 1, "TabFile should now have one line")
     # Each stored item should match the corresponding input item
     for idx, expected in enumerate(items):
         self.assertEqual(tabfile[0][idx], expected)
コード例 #9
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_unexpected_uncommented_header(self):
     """Check loading when the uncommented header line is not flagged

     With default options the header line is expected to be read as an
     ordinary line of data, leaving the TabFile without column names.
     """
     tabfile = TabFile('test', self.fp)
     n_lines = len(tabfile)
     self.assertEqual(n_lines, 4, "Input has 4 lines of data")
     self.assertEqual(tabfile.header(), [], "Wrong header")
     # The header text should turn up as the first data line
     first_line = str(tabfile[0])
     self.assertEqual(first_line, "chr\tstart\tend\tdata",
                      "Incorrect string representation")
     # Without column names, lookup by name should fail
     lookup_item = tabfile[3].__getitem__
     self.assertRaises(KeyError, lookup_item, 'chr')
     self.assertEqual(tabfile.nColumns(), 4)
コード例 #10
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_expected_uncommented_header(self):
     """Check loading when the uncommented first line is flagged as header

     The first line should supply the column names and should not be
     counted as a line of data.
     """
     tabfile = TabFile('test', self.fp, first_line_is_header=True)
     self.assertEqual(len(tabfile), 3, "Input has 3 lines of data")
     column_names = tabfile.header()
     self.assertEqual(column_names, ['chr', 'start', 'end', 'data'],
                      "Wrong header")
     first_line = tabfile[0]
     self.assertEqual(str(first_line), "chr1\t1\t234\t4.6",
                      "Incorrect string representation")
     # Items should be retrievable using the column names
     third_line = tabfile[2]
     self.assertEqual(third_line['chr'], 'chr2', "Incorrect data")
     self.assertEqual(tabfile.nColumns(), 4)
コード例 #11
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_load_data_with_header(self):
     """Check creating a TabFile that takes its header from the first line
     """
     expected_header = ['chr', 'start', 'end', 'data']
     tabfile = TabFile('test', self.fp, first_line_is_header=True)
     # Header line should not be counted as data
     self.assertEqual(len(tabfile), 3, "Input has 3 lines of data")
     self.assertEqual(tabfile.header(), expected_header, "Wrong header")
     # Data lines should be unchanged and addressable by column name
     self.assertEqual(str(tabfile[0]), "chr1\t1\t234\t4.6",
                      "Incorrect string representation")
     chrom = tabfile[2]['chr']
     self.assertEqual(chrom, 'chr2', "Incorrect data")
     self.assertEqual(tabfile.nColumns(), 4)
コード例 #12
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_load_data(self):
     """Check creating a TabFile from comma-delimited data without a header
     """
     tabfile = TabFile('test', self.fp, delimiter=',')
     n_lines = len(tabfile)
     self.assertEqual(n_lines, 3, "Input has 3 lines of data")
     # No header was requested so none should be set
     self.assertEqual(tabfile.header(), [], "Header should be empty")
     first_line = str(tabfile[0])
     self.assertEqual(first_line, "chr1,1,234,4.6",
                      "Incorrect string representation")
     # Items are addressed by integer index when there is no header
     self.assertEqual(tabfile[2][0], 'chr2', "Incorrect data")
     self.assertEqual(tabfile.nColumns(), 4)
     # The name given at creation should be reported back
     self.assertEqual(tabfile.filename(), 'test')
コード例 #13
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_insert_tab_data_line(self):
     """Check inserting an existing TabDataLine object into a TabFile

     The TabFile should grow by one line and insert() should hand back
     the very same TabDataLine object that was supplied.
     """
     tabfile = TabFile('test', self.fp)
     self.assertEqual(len(tabfile), 3)
     supplied = TabDataLine('chr1\t10000\t20000\t+')
     returned = tabfile.insert(2, tabdataline=supplied)
     self.assertEqual(len(tabfile), 4)
     # Identity (not just equality) is what is being checked here
     same_object = returned is supplied
     self.assertTrue(same_object)
コード例 #14
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_insert_line_with_tab_data(self):
     """Check inserting a line built from a tab-delimited string

     The TabFile should grow by one line and the line returned by
     insert() should render back to the supplied string.
     """
     tab_line = 'chr1\t10000\t20000\t+'
     tabfile = TabFile('test', self.fp)
     self.assertEqual(len(tabfile), 3)
     inserted = tabfile.insert(2, tabdata=tab_line)
     self.assertEqual(len(tabfile), 4)
     # The returned line should reproduce the input text
     rendered = str(inserted)
     self.assertTrue(rendered == tab_line)
コード例 #15
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_insert_empty_line(self):
     """Check inserting a line into a TabFile without supplying any data

     The TabFile should grow by one line and every item in the line
     returned by insert() should render as an empty string.
     """
     tabfile = TabFile('test', self.fp)
     self.assertEqual(len(tabfile), 3)
     blank = tabfile.insert(2)
     self.assertEqual(len(tabfile), 4)
     # Every item of the new line should be blank
     for idx in range(len(blank)):
         item_text = str(blank[idx])
         self.assertTrue(item_text == '')
コード例 #16
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_append_tab_data_line(self):
     """Check appending an existing TabDataLine object to a TabFile

     The TabFile should grow by one line and append() should hand back
     the very same TabDataLine object that was supplied.
     """
     tabfile = TabFile('test', self.fp)
     self.assertEqual(len(tabfile), 3)
     supplied = TabDataLine('chr1\t10000\t20000\t+')
     returned = tabfile.append(tabdataline=supplied)
     self.assertEqual(len(tabfile), 4)
     # Identity (not just equality) is what is being checked here
     same_object = returned is supplied
     self.assertTrue(same_object)
コード例 #17
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_append_empty_line(self):
     """Check appending a line to a TabFile without supplying any data

     The TabFile should grow by one line and every item in the line
     returned by append() should render as an empty string.
     """
     tabfile = TabFile('test', self.fp)
     self.assertEqual(len(tabfile), 3)
     blank = tabfile.append()
     self.assertEqual(len(tabfile), 4)
     # Every item of the new line should be blank
     for idx in range(len(blank)):
         item_text = str(blank[idx])
         self.assertTrue(item_text == '')
コード例 #18
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_load_data_setting_explicit_header(self):
     """Check that explicitly supplied column names are used as the header

     The first line of the input is flagged as a header, but the names
     passed via 'column_names' are the ones expected in the header.
     """
     names = ('CHROM', 'START', 'STOP', 'VALUES')
     tabfile = TabFile('test', self.fp, first_line_is_header=True,
                       column_names=names)
     self.assertEqual(len(tabfile), 3, "Input has 3 lines of data")
     header = tabfile.header()
     self.assertEqual(header, ['CHROM', 'START', 'STOP', 'VALUES'],
                      "Wrong header")
     self.assertEqual(str(tabfile[0]), "chr1\t1\t234\t4.6",
                      "Incorrect string representation")
     # Items should be retrievable using the explicit names
     third_line = tabfile[2]
     self.assertEqual(third_line['CHROM'], 'chr2', "Incorrect data")
     self.assertEqual(tabfile.nColumns(), 4)
コード例 #19
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_insert_line_with_data(self):
     """Check inserting a line built from a list of items

     The TabFile should grow by one line and the line returned by
     insert() should hold the supplied items in order.
     """
     items = ['chr1', 678, 901, 6.1]
     tabfile = TabFile('test', self.fp)
     self.assertEqual(len(tabfile), 3)
     inserted = tabfile.insert(2, data=items)
     self.assertEqual(len(tabfile), 4)
     # Compare the new line with the input, item by item
     for idx, expected in enumerate(items):
         self.assertTrue(inserted[idx] == expected)
コード例 #20
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_apply_operation_to_column(self):
     """Divide values in a column by 10

     The transformed values come from floating point division, so they
     are compared with assertAlmostEqual rather than for exact equality:
     binary floating point cannot represent most decimal fractions
     exactly (for example 4.6/10 evaluates to 0.45999999999999996
     rather than 0.46).
     """
     tabfile = TabFile('test',self.fp,first_line_is_header=True)
     # Check number of columns and header items
     self.assertEqual(tabfile.nColumns(),4)
     self.assertEqual(tabfile.header(),['chr','start','end','data'])
     # Divide data column by 10
     tabfile.transformColumn('data',lambda x: x/10)
     results = [0.46,0.57,0.68]
     for i in range(len(tabfile)):
         # Tolerate floating point rounding in the divided values
         self.assertAlmostEqual(tabfile[i]['data'],results[i])
コード例 #21
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_get_index_for_line_number(self):
     """Look up line numbers from a TabFile

     Checks that indexByLineNumber returns the index of the data line
     with the given line number, and raises IndexError for line numbers
     that don't correspond to a data line.
     """
     tabfile = TabFile('test',self.fp)
     # Look for an existing line: line number 2 should be the first
     # data line, and that line should report the same line number
     index = tabfile.indexByLineNumber(2)
     self.assertEqual(index,0)
     self.assertEqual(tabfile[index].lineno(),2)
     # Look for the first line in the file (the commented header)
     self.assertRaises(IndexError,tabfile.indexByLineNumber,1)
     # Look for a negative line number
     self.assertRaises(IndexError,tabfile.indexByLineNumber,-12)
     # Look for a generally non-existent line number
     self.assertRaises(IndexError,tabfile.indexByLineNumber,99)
コード例 #22
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_change_delimiter_for_write(self):
     """Check that write() can override the delimiter for a single call

     The TabFile is loaded as comma-delimited. An explicit delimiter
     passed to write() should apply to that call only; a later write
     without one should use commas again.
     """
     tabfile = TabFile('test', self.fp, delimiter=',')
     # First write: ask for tabs explicitly
     tabbed = cStringIO.StringIO()
     tabfile.write(fp=tabbed, delimiter='\t')
     tabbed_output = tabbed.getvalue()
     self.assertEqual(tabbed_output, self.data)
     tabbed.close()
     # Second write: no delimiter given, so commas are expected
     comma_separated = cStringIO.StringIO()
     tabfile.write(fp=comma_separated)
     comma_output = comma_separated.getvalue()
     self.assertEqual(comma_output, self.data.replace('\t', ','))
     comma_separated.close()
コード例 #23
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_lookup(self):
     """Look up data from a TabFile

     Checks that lookup() returns only the lines holding the requested
     value in the named column, and an empty result when nothing
     matches.
     """
     tabfile = TabFile('test',self.fp,first_line_is_header=True)
     # Look for lines with 'chr1' in the chr column
     matching = tabfile.lookup('chr','chr1')
     self.assertEqual(len(matching),2)
     for m in matching:
         self.assertEqual(m['chr'],'chr1',"Lookup returned bad match: '%s'" % m)
     # The two matches should be different lines
     self.assertNotEqual(matching[0],matching[1])
     # Look for lines with 'chr2' in the chr column
     matching = tabfile.lookup('chr','chr2')
     self.assertEqual(len(matching),1)
     # Report the line actually being checked on failure (not the
     # leftover loop variable from the 'chr1' lookup)
     self.assertEqual(matching[0]['chr'],'chr2',
                      "Lookup returned bad match: '%s'" % matching[0])
     # Look for lines with 'bananas' in the chr column
     self.assertEqual(len(tabfile.lookup('chr','bananas')),0)
コード例 #24
0
ファイル: IlluminaData.py プロジェクト: fw1121/genomics
    def _load_data(self, fp):
        """Internal: populate with data from external file

        Reads the input line by line. Lines of the form '[NAME]...'
        switch the current section; every other non-blank line is
        handled according to the section it falls in:

        - 'Header' and 'Settings' lines are passed to
          '_set_param_value' along with the '_header' or '_settings'
          store respectively
        - 'Reads' lines have trailing commas removed and any
          non-empty value is appended to '_reads'
        - 'Data' lines are stored in a comma-delimited TabFile, with
          the first line of the section supplying the column names

        Once everything is read, surrounding whitespace is stripped
        from the string items in the data section.

        Arguments
          fp: file-like object opened for reading which contains
             sample sheet data

        Raises
          IlluminaDataError: if content is found before any section
            heading, or inside a section that isn't recognised.

        """
        section = None
        # Count lines from 1 so the number can be reported directly
        for lineno, line in enumerate(fp, 1):
            line = line.rstrip()
            logging.debug(line)
            if not line:
                # Skip blank lines
                continue
            if line.startswith('['):
                # New section
                try:
                    closing_bracket = line.index(']')
                except ValueError:
                    # No closing bracket: report it, then fall through
                    # so the line is treated as content of the current
                    # section
                    logging.error("Bad line (#%d): %s", lineno, line)
                else:
                    section = line[1:closing_bracket]
                    continue
            if section == 'Header':
                # Header lines are comma-separated PARAM,VALUE lines
                self._set_param_value(line, self._header)
            elif section == 'Reads':
                # Read lines are one value per line
                value = line.rstrip(',')
                if value:
                    self._reads.append(value)
            elif section == 'Settings':
                # Settings lines are comma-separated PARAM,VALUE lines
                self._set_param_value(line, self._settings)
            elif section == 'Data':
                # Store data in TabFile object
                if self._data is None:
                    # Initialise TabFile using this first line
                    # to set the header
                    self._data = TabFile.TabFile(column_names=line.split(','),
                                                 delimiter=',')
                else:
                    self._data.append(tabdata=line)
            elif section is None:
                raise IlluminaDataError("Not a valid IEM sample sheet?")
            else:
                raise IlluminaDataError(
                    "Unrecognised section '%s': not a valid IEM sample sheet?"
                    % section)
        # Clean up data items: remove surrounding whitespace
        if self._data is not None:
            column_names = self._data.header()
            for line in self._data:
                for item in column_names:
                    try:
                        line[item] = line[item].strip()
                    except AttributeError:
                        # Items without a strip() method (i.e. not
                        # strings) are left untouched
                        pass
コード例 #25
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_set_column_to_constant_value(self):
     """Check that transformColumn can fill a column with a constant

     A new 'strand' column is appended and then every value in it is
     replaced with '+' by a transform that ignores its input.
     """
     original_header = ['chr', 'start', 'end', 'data']
     tabfile = TabFile('test', self.fp, first_line_is_header=True)
     # Starting point: four named columns
     self.assertEqual(tabfile.nColumns(), 4)
     self.assertEqual(tabfile.header(), original_header)
     # Appending should add the new name to the end of the header
     tabfile.appendColumn('strand')
     self.assertEqual(tabfile.nColumns(), 5)
     self.assertEqual(tabfile.header(),
                      ['chr', 'start', 'end', 'data', 'strand'])

     def always_plus(value):
         # Constant transform: the existing value is discarded
         return '+'

     tabfile.transformColumn('strand', always_plus)
     for data_line in tabfile:
         self.assertEqual(data_line['strand'], '+')
コード例 #26
0
def annotate_feature_data(gff_lookup,feature_data_file,out_file):
    """Annotate feature data with gene information

    Reads in 'feature data' from a tab-delimited input file with feature
    IDs in the first column; outputs these data with data about the
    parent gene appended to each line.

    The first line of the input file is treated as a header. The output
    is written with the header line included.

    Arguments:
      gff_lookup         populated GFFAnnotationLookup instance
      feature_data_file  input data file with feature IDs in first column
      out_file           name of output file
    """
    # Output column names paired with the annotation attribute that
    # supplies the value for that column (in output order)
    annotation_columns = (
        ('exon_parent', 'parent_feature_name'),
        ('feature_type_exon_parent', 'parent_feature_type'),
        ('gene_ID', 'parent_feature_parent'),
        ('gene_name', 'parent_gene_name'),
        ('chr', 'chr'),
        ('start', 'start'),
        ('end', 'end'),
        ('strand', 'strand'),
        ('gene_length', 'gene_length'),
        ('locus', 'gene_locus'),
        ('description', 'description'),
    )

    # Read the feature data into a TabFile
    # (single-argument print(...) behaves identically on Python 2 and 3)
    print("Reading in data from %s" % feature_data_file)
    feature_data = TabFile.TabFile(filen=feature_data_file,
                                   first_line_is_header=True)

    # Append columns for annotation
    print("Appending columns for annotation")
    for colname, _ in annotation_columns:
        feature_data.appendColumn(colname)

    # Fill in the annotation columns for each feature
    for line in feature_data:
        feature_ID = line[0]
        annotation = gff_lookup.getAnnotation(feature_ID)
        for colname, attribute in annotation_columns:
            line[colname] = getattr(annotation, attribute)

    # Output
    print("Writing output file %s" % out_file)
    feature_data.write(out_file,include_header=True,no_hash=True)
コード例 #27
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_append_column(self):
     """Check appending a new named column to a TabFile

     The header should gain the new name as its last entry, and the
     new column should be empty in existing lines.
     """
     tabfile = TabFile('test', self.fp, first_line_is_header=True)
     n_before = len(tabfile.header())
     self.assertEqual(n_before, 4)
     tabfile.appendColumn('new')
     n_after = len(tabfile.header())
     self.assertEqual(n_after, 5)
     # New name goes on the end of the header
     self.assertEqual(tabfile.header()[4], 'new')
     # Existing lines get an empty value in the new column
     first_line = tabfile[0]
     self.assertEqual(first_line['new'], '')
コード例 #28
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_compute_and_overwrite_existing_column_integer_index(self):
     """Check computeColumn overwriting a column given by integer index

     Column 3 ('data') is recomputed as 'end' minus 'start'; the header
     and number of columns should be left as they were.
     """
     expected_header = ['chr', 'start', 'end', 'data']
     tabfile = TabFile('test', self.fp, first_line_is_header=True)
     # Starting point: four named columns
     self.assertEqual(tabfile.nColumns(), 4)
     self.assertEqual(tabfile.header(), expected_header)

     def interval_length(line):
         # New value for the column: difference of 'end' and 'start'
         return line['end'] - line['start']

     tabfile.computeColumn(3, interval_length)
     # Overwriting must not add or rename columns
     self.assertEqual(tabfile.nColumns(), 4)
     self.assertEqual(tabfile.header(), ['chr', 'start', 'end', 'data'])
     expected_values = [233, 323, 4444]
     for idx in range(len(tabfile)):
         computed = tabfile[idx]['data']
         self.assertEqual(computed, expected_values[idx])
コード例 #29
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_compute_midpoint(self):
     """Check computeColumn creating a new 'midpoint' column

     The new column holds the mean of the 'start' and 'end' values and
     should be added to the end of the header.
     """
     tabfile = TabFile('test', self.fp, first_line_is_header=True)
     # Starting point: four named columns
     self.assertEqual(tabfile.nColumns(), 4)
     self.assertEqual(tabfile.header(), ['chr', 'start', 'end', 'data'])

     def midpoint(line):
         # Halfway between 'start' and 'end' (float division)
         return (line['end'] + line['start'])/2.0

     tabfile.computeColumn('midpoint', midpoint)
     # A fifth column should now exist with the new name
     self.assertEqual(tabfile.nColumns(), 5)
     self.assertEqual(tabfile.header(),
                      ['chr', 'start', 'end', 'data', 'midpoint'])
     expected_values = [117.5, 728.5, 3456]
     for idx in range(len(tabfile)):
         computed = tabfile[idx]['midpoint']
         self.assertEqual(computed, expected_values[idx])
コード例 #30
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_reorder_columns(self):
     """Check that reorderColumns returns a TabFile with columns rearranged

     The returned TabFile should have the requested header order and
     each line's items should be rearranged to match.
     """
     source = TabFile('test', self.fp, first_line_is_header=True)
     # Starting point: four named columns
     self.assertEqual(source.nColumns(), 4)
     self.assertEqual(source.header(), ['chr', 'start', 'end', 'data'])
     # Move 'data' up to be the second column
     new_order = ['chr', 'data', 'start', 'end']
     reordered = source.reorderColumns(new_order)
     self.assertEqual(reordered.nColumns(), 4)
     self.assertEqual(reordered.header(), new_order)
     # Line contents should follow the new column order
     expected_lines = ["chr1\t4.6\t1\t234",
                       "chr1\t5.7\t567\t890",
                       "chr2\t6.8\t1234\t5678"]
     for idx, expected in enumerate(expected_lines):
         self.assertEqual(str(reordered[idx]), expected)
コード例 #31
0
ファイル: test_TabFile.py プロジェクト: gwmei/genomics
 def test_reorder_columns_empty_cells(self):
     """Check reorderColumns when some lines start with an empty cell

     The 'chr' value is blanked in the first and third lines before
     reordering; the empty leading cells should survive in the output.
     """
     source = TabFile('test', self.fp, first_line_is_header=True)
     # Starting point: four named columns
     self.assertEqual(source.nColumns(), 4)
     self.assertEqual(source.header(), ['chr', 'start', 'end', 'data'])
     # Blank out the leading cell of the first and third lines
     for idx in (0, 2):
         source[idx]['chr'] = ''
     # Move 'data' up to be the second column
     new_order = ['chr', 'data', 'start', 'end']
     reordered = source.reorderColumns(new_order)
     self.assertEqual(reordered.nColumns(), 4)
     self.assertEqual(reordered.header(), new_order)
     # Blanked cells should still be present as empty leading fields
     expected_lines = ["\t4.6\t1\t234",
                       "chr1\t5.7\t567\t890",
                       "\t6.8\t1234\t5678"]
     for idx, expected in enumerate(expected_lines):
         self.assertEqual(str(reordered[idx]), expected)
コード例 #32
0
def annotate_htseq_count_data(gff_lookup,htseq_files,out_file):
    """Annotate count data from htseq-count output with gene information

    Reads in data from one or more htseq-count output files and combines
    into a single tab-delimited output file where the counts for each
    feature have been appended to data about the parent gene.

    Also creates an output 'stats' file which combines the summary data
    from the tail of each htseq-count file. The stats file is written
    alongside 'out_file', with '_stats' inserted before the file
    extension (e.g. 'counts.txt' gives 'counts_stats.txt').

    The feature IDs and the table entries are taken from the first
    file in 'htseq_files', so the list must contain at least one file.

    Arguments:
      gff_lookup:  populated GFFAnnotationLookup instance
      htseq_files: list of output files from htseq-count to use as input
      out_file:    name of output file
    """
    # Output column names paired with the annotation attribute that
    # supplies the value for that column (in output order)
    annotation_columns = (
        ('exon_parent', 'parent_feature_name'),
        ('feature_type_exon_parent', 'parent_feature_type'),
        ('gene_ID', 'parent_feature_parent'),
        ('gene_name', 'parent_gene_name'),
        ('chr', 'chr'),
        ('start', 'start'),
        ('end', 'end'),
        ('strand', 'strand'),
        ('gene_length', 'gene_length'),
        ('locus', 'gene_locus'),
        ('description', 'description'),
    )

    # Output files
    annotated_counts_out_file = out_file
    out_dir = os.path.dirname(annotated_counts_out_file)
    out_root, out_ext = os.path.splitext(
        os.path.basename(annotated_counts_out_file))
    tables_out_file = os.path.join(out_dir, out_root + "_stats" + out_ext)

    # Process the HTSeq-count files
    # (single-argument print(...) behaves identically on Python 2 and 3)
    print("Processing HTSeq-count files")
    htseq_data = {}
    for htseqfile in htseq_files:
        print("\t%s" % htseqfile)
        htseq_data[htseqfile] = HTSeqCountFile(htseqfile)

    # Create a TabFile for output: annotation columns first, then one
    # column of counts per input file
    print("Building annotated count file for output")
    annotated_counts = TabFile.TabFile(
        column_names=[colname for colname, _ in annotation_columns])
    for htseqfile in htseq_files:
        annotated_counts.appendColumn(htseqfile)

    # Combine feature counts and parent feature data
    for feature_ID in htseq_data[htseq_files[0]].feature_IDs():
        # Get annotation data
        annotation = gff_lookup.getAnnotation(feature_ID)
        # Build the data line
        data = [getattr(annotation, attribute)
                for _, attribute in annotation_columns]
        # Add the counts from each file
        for htseqfile in htseq_files:
            data.append(htseq_data[htseqfile].count(feature_ID))
        # Add to the tabfile
        annotated_counts.append(data=data)

    # Write the file
    print("Writing output file %s" % annotated_counts_out_file)
    annotated_counts.write(annotated_counts_out_file,include_header=True,no_hash=True)

    # Make second file for the trailing table data
    print("Building trailing tables data file for output")
    table_counts = TabFile.TabFile(column_names=['count'])
    for htseqfile in htseq_files:
        table_counts.appendColumn(htseqfile)
    for name in htseq_data[htseq_files[0]].table():
        # Build the data line: entry name followed by its value from
        # each input file
        data = [name]
        for htseqfile in htseq_files:
            data.append(htseq_data[htseqfile].table()[name])
        table_counts.append(data=data)
    print("Writing output file %s" % tables_out_file)
    table_counts.write(tables_out_file,include_header=True,no_hash=True)