Beispiel #1
0
    def testParseHmmOutput(self):
        actual_hmmsearch = list(
            hmmer_utils.parse_hmmer_output(_HMMER_TBLOUT, 'PF00131.19'))
        print(actual_hmmsearch)
        expected_hmmsearch = [
            hmmer_utils.HMMEROutput(
                sequence_name='MT4_CANLF/1-62',
                predicted_label='PF00131.19',
                true_label='PF00131.19',
                score=60.9,
                domain_evalue=1.3e-15,
            ),
            hmmer_utils.HMMEROutput(
                sequence_name='E4X7F8_OIKDI/453-561',
                predicted_label='PF00131.19',
                true_label='PF05033.15',
                score=13.9,
                domain_evalue=0.6,
            ),
        ]

        self.assertEqual(actual_hmmsearch, expected_hmmsearch)
        # Test that for the output file with no hits, the sentinel value is written.
        actual_no_output = list(
            hmmer_utils.parse_hmmer_output(_HMMER_TBLOUT_NO_OUTPUT,
                                           'PF00131.19'))
        print(actual_no_output)
        expected_no_output = [
            hmmer_utils.HMMEROutput(
                sequence_name='no_sequence/0-0',
                predicted_label='PF00131.19',
                true_label='PF00000.0',
                score=hmmer_utils.NO_SEQUENCE_MATCH_SCORE_SENTINEL,
                domain_evalue=hmmer_utils.
                NO_SEQUENCE_MATCH_DOMAIN_EVALUE_SENTINEL,
            )
        ]

        self.assertEqual(actual_no_output, expected_no_output)
Beispiel #2
0
  def testFormatAsCsvHmmerOutput(self):
    hmmer_output = hmmer_utils.HMMEROutput(
        sequence_name='MT4_CANLF/1-62',
        predicted_label='PF00131.19',
        true_label='PF12345.6',
        score=60.9,
        domain_evalue=1.3e-15,
    )

    # order should match hmmer_utils.HMMER_OUTPUT_CSV_COLUMN_HEADERS.
    # (That is, util.PREDICTION_FILE_COLUMN_NAMES + [DATAFRAME_SCORE_NAME_KEY].)
    expected = 'MT4_CANLF/1-62,PF12345.6,PF00131.19,60.9,1.3e-15'
    actual = hmmer_output.format_as_csv()

    self.assertEqual(actual, expected)
Beispiel #3
0
class HMMerUtilsTest(parameterized.TestCase):

  def testGetFamilyNameFromUnderscores(self):
    # Sequence name contains underscores.
    actual = hmmer_utils.get_family_name_from('V2R_HUMAN/54-325_PF00001.20')
    expected = 'PF00001.20'
    self.assertEqual(actual, expected)

  def testGetFamilyNameFromNoUnderscores(self):
    # Sequence name has no underscores.
    actual = hmmer_utils.get_family_name_from('Q12345_alanine')
    expected = 'alanine'
    self.assertEqual(actual, expected)

  def testGetSequenceNameFrom(self):
    actual = hmmer_utils.get_sequence_name_from('V2R_HUMAN/54-325_PF00001.20')
    expected = 'V2R_HUMAN/54-325'
    self.assertEqual(actual, expected)

  def testFormatAsCsvHmmerOutput(self):
    hmmer_output = hmmer_utils.HMMEROutput(
        sequence_name='MT4_CANLF/1-62',
        predicted_label='PF00131.19',
        true_label='PF12345.6',
        score=60.9,
        domain_evalue=1.3e-15,
    )

    # order should match hmmer_utils.HMMER_OUTPUT_CSV_COLUMN_HEADERS.
    # (That is, util.PREDICTION_FILE_COLUMN_NAMES + [DATAFRAME_SCORE_NAME_KEY].)
    expected = 'MT4_CANLF/1-62,PF12345.6,PF00131.19,60.9,1.3e-15'
    actual = hmmer_output.format_as_csv()

    self.assertEqual(actual, expected)

  def testParseHmmOutput(self):
    actual_hmmsearch = list(
        hmmer_utils.parse_hmmer_output(_HMMER_TBLOUT, 'PF00131.19'))
    print(actual_hmmsearch)
    expected_hmmsearch = [
        hmmer_utils.HMMEROutput(
            sequence_name='MT4_CANLF/1-62',
            predicted_label='PF00131.19',
            true_label='PF00131.19',
            score=60.9,
            domain_evalue=1.3e-15,
        ),
        hmmer_utils.HMMEROutput(
            sequence_name='E4X7F8_OIKDI/453-561',
            predicted_label='PF00131.19',
            true_label='PF05033.15',
            score=13.9,
            domain_evalue=0.6,
        ),
    ]

    self.assertEqual(actual_hmmsearch, expected_hmmsearch)
    # Test that for the output file with no hits, the sentinel value is written.
    actual_no_output = list(
        hmmer_utils.parse_hmmer_output(_HMMER_TBLOUT_NO_OUTPUT, 'PF00131.19'))
    print(actual_no_output)
    expected_no_output = [
        hmmer_utils.HMMEROutput(
            sequence_name='no_sequence/0-0',
            predicted_label='PF00131.19',
            true_label='PF00000.0',
            score=hmmer_utils.NO_SEQUENCE_MATCH_SCORE_SENTINEL,
            domain_evalue=hmmer_utils.NO_SEQUENCE_MATCH_DOMAIN_EVALUE_SENTINEL,
        )
    ]

    self.assertEqual(actual_no_output, expected_no_output)

  def testFilterFastaFileBySequenceName(self):
    fasta_file_contents = ('>A0A0F7V1V9_TOXGV/243-280_PF10417.9\n'
                           'MIREVEKNGGKQVCPANWRRGEKMMHASFEGVKNYLGQ\n'
                           '>Q1QPP6_NITHX/169-202_PF10417.9\n'
                           'ALQATMSGQKLAPANWQPGETLLLPADEKTQKDT\n'
                           '>Q2K9A0_RHIEC/165-202_PF10417.9\n'
                           'SIQLTAKHQVATPANWNQGEDVIITAAVSNDDAIARFG\n')
    input_fasta_file_name = test_util.tmpfile('input_fasta')
    with tf.io.gfile.GFile(input_fasta_file_name, 'w') as input_fasta_file:
      input_fasta_file.write(fasta_file_contents)

    actual = list(
        hmmer_utils.filter_fasta_file_by_sequence_name(
            input_fasta_file_name, ['Q2K9A0_RHIEC/165-202']))
    expected = [('>Q2K9A0_RHIEC/165-202_PF10417.9\n'
                 'SIQLTAKHQVATPANWNQGEDVIITAAVSNDDAIARFG\n')]
    self.assertEqual(actual, expected)

  def testAllSequenceNamesFromFastaFile(self):
    fasta_file_contents = ('>A0A0F7V1V9_TOXGV/243-280_PF10417.9\n'
                           'MIREVEKNGGKQVCPANWRRGEKMMHASFEGVKNYLGQ\n'
                           '>Q1QPP6_NITHX/169-202_PF10417.9\n'
                           'ALQATMSGQKLAPANWQPGETLLLPADEKTQKDT\n'
                           '>Q2K9A0_RHIEC/165-202_PF10417.9\n'
                           'SIQLTAKHQVATPANWNQGEDVIITAAVSNDDAIARFG\n')
    input_fasta_file_name = test_util.tmpfile('input_fasta')
    with tf.io.gfile.GFile(input_fasta_file_name, 'w') as input_fasta_file:
      input_fasta_file.write(fasta_file_contents)

    actual = hmmer_utils.all_sequence_names_from_fasta_file(
        input_fasta_file_name)
    expected = [
        'A0A0F7V1V9_TOXGV/243-280', 'Q1QPP6_NITHX/169-202',
        'Q2K9A0_RHIEC/165-202'
    ]

    np.testing.assert_array_equal(actual, expected)

  def testSequencesWithNoPrediction(self):
    hmmer_predictions = pd.DataFrame(
        [['A4YXG4_BRASO/106-134', 'PF00001.1', 'PF00001.1', 0.5]],
        columns=util.PREDICTION_FILE_COLUMN_NAMES)
    all_sequence_names = ['A_DIFFERENTSEQNAME/1-2', 'A4YXG4_BRASO/106-134']
    actual = hmmer_utils.sequences_with_no_prediction(
        all_sequence_names=all_sequence_names,
        hmmer_predictions=hmmer_predictions)
    expected = {'A_DIFFERENTSEQNAME/1-2'}

    self.assertEqual(actual, expected)

  def testYieldTopElByScoreForEachSequenceName(self):
    input_df = pd.DataFrame([
        ['SAME_SEQ_NAME', 'PF00264.20', 'PF000264.20', 10.1, 1e-3],
        ['SAME_SEQ_NAME', 'PF00264.20', 'PF00001.3', -65432.0, 100.],
    ],
                            columns=hmmer_utils.HMMER_OUTPUT_CSV_COLUMN_HEADERS)
    actual = pd.concat(
        list(
            hmmer_utils.yield_top_el_by_score_for_each_sequence_name(input_df)))
    expected = pd.DataFrame(
        [['SAME_SEQ_NAME', 'PF00264.20', 'PF000264.20', 10.1, 1e-3]],
        columns=hmmer_utils.HMMER_OUTPUT_CSV_COLUMN_HEADERS)

    self.assertLen(actual, 1)
    self.assertCountEqual(
        actual.to_dict('records'), expected.to_dict('records'))

  # pylint: disable=line-too-long
  # Disable line-too-long because phmmer output strings are long, and we
  # want to paste them verbatim.
  @parameterized.named_parameters(
      dict(
          testcase_name=('multiple seq outputs, no repeats, same families, all '
                         'identifiers have predictions'),
          phmmer_output="""#                                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name                accession  query name                   accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#        ------------------- ----------         -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
OPSB_HUMAN/51-303_PF00001.20 -          OPSB_HUMAN/51-303_PF00001.20 -            2.5e-61  193.4  64.0   1.6e-22   66.2   0.9   8.7   8   1   0   8   8   8   8 -
OPS3_DROME/75-338_PF00001.20 -          OPS3_DROME/75-338_PF00001.20 -            2.1e-69  219.9  65.6   3.2e-21   62.0   0.3   9.8  10   1   0  10  10  10  10 -
#
# Program:         phmmer
# Version:         3.1b2 (February 2015)
# Pipeline mode:   SEARCH
# Query file:      /storage/hmm_train/PF00001.20.fasta
# Target file:     /storage/hmm_train/PF00001.20.fasta
# Option settings: phmmer -o /dev/null --tblout /dev/stdout -E 10 /storage/hmm_train/PF00001.20.fasta /storage/hmm_train/PF00001.20.fasta
# Date:            Wed Nov 21 09:26:50 2018
# [ok]
""",
          all_identifiers=[
              'OPSB_HUMAN/51-303_PF00001.20',
              'OPS3_DROME/75-338_PF00001.20',
          ],
          expected=[
              hmmer_utils.HMMEROutput(
                  sequence_name='OPSB_HUMAN/51-303',
                  true_label='PF00001.20',
                  predicted_label='PF00001.20',
                  score=193.4,
                  domain_evalue=2.5e-61,
              ),
              hmmer_utils.HMMEROutput(
                  sequence_name='OPS3_DROME/75-338',
                  true_label='PF00001.20',
                  predicted_label='PF00001.20',
                  score=219.9,
                  domain_evalue=2.1e-69,
              )
          ],
      ),
      dict(
          testcase_name=('one seq output, no repeats, same families, some '
                         'identifiers do not have predictions'),
          phmmer_output="""#                                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name                accession  query name                   accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#        ------------------- ----------         -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
OPSB_HUMAN/51-303_PF00001.20 -          OPSB_HUMAN/51-303_PF00001.20 -            2.5e-61  193.4  64.0   1.6e-22   66.2   0.9   8.7   8   1   0   8   8   8   8 -
#
# Program:         phmmer
# Version:         3.1b2 (February 2015)
# Pipeline mode:   SEARCH
# Query file:      /storage/hmm_train/PF00001.20.fasta
# Target file:     /storage/hmm_train/PF00001.20.fasta
# Option settings: phmmer -o /dev/null --tblout /dev/stdout -E 10 /storage/hmm_train/PF00001.20.fasta /storage/hmm_train/PF00001.20.fasta
# Date:            Wed Nov 21 09:26:50 2018
# [ok]
""",
          expected=[
              hmmer_utils.HMMEROutput(
                  sequence_name='OPSB_HUMAN/51-303',
                  true_label='PF00001.20',
                  predicted_label='PF00001.20',
                  score=193.4,
                  domain_evalue=2.5e-61,
              ),
              hmmer_utils.HMMEROutput(
                  sequence_name='THIS_ISNOTSEEN/1-111',
                  true_label='PF09876.5',
                  predicted_label='PF00000.0',
                  score=hmmer_utils.NO_SEQUENCE_MATCH_SCORE_SENTINEL,
                  domain_evalue=hmmer_utils.NO_SEQUENCE_MATCH_DOMAIN_EVALUE_SENTINEL,
              )
          ],
          all_identifiers=[
              'OPSB_HUMAN/51-303_PF00001.20',
              'THIS_ISNOTSEEN/1-111_PF09876.5',
          ]),
      dict(
          testcase_name=('one seq output, with repeats, all identifiers have '
                         'predictions'),
          phmmer_output="""#                                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name                accession  query name                   accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#        ------------------- ----------         -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
SSR1_HUMAN/75-323_PF00001.20  -          CXCR5_HUMAN/68-322_PF00001.20 -              1e-06   14.4   0.9     1e-06   14.4   0.9   2.0   2   0   0   2   2   2   2 -
CX3C1_RAT/49-294_PF00001.20   -          CXCR5_HUMAN/68-322_PF00001.20 -            1.3e-06   14.1   8.7   4.5e-06   12.3   1.7   2.6   3   0   0   3   3   3   2 -

#
# Program:         phmmer
# Version:         3.1b2 (February 2015)
# Pipeline mode:   SEARCH
# Query file:      /storage/hmm_train/PF00001.20.fasta
# Target file:     /storage/hmm_train/PF00001.20.fasta
# Option settings: phmmer -o /dev/null --tblout /dev/stdout -E 10 /storage/hmm_train/PF00001.20.fasta /storage/hmm_train/PF00001.20.fasta
# Date:            Wed Nov 21 09:26:50 2018
# [ok]
""",
          expected=[
              hmmer_utils.HMMEROutput(
                  sequence_name='CXCR5_HUMAN/68-322',
                  true_label='PF00001.20',
                  predicted_label='PF00001.20',
                  score=14.4,
                  domain_evalue=1e-6,
              ),
              hmmer_utils.HMMEROutput(
                  sequence_name='CXCR5_HUMAN/68-322',
                  true_label='PF00001.20',
                  predicted_label='PF00001.20',
                  score=14.1,
                  domain_evalue=1.3e-6,
              )
          ],
          all_identifiers=[
              'CXCR5_HUMAN/68-322_PF00001.20',
          ]),
      dict(
          testcase_name=('no hits for any input sequence'),
          phmmer_output="""
#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
#
# Program:         phmmer
# Version:         3.1b2 (February 2015)
# Pipeline mode:   SEARCH
# Query file:      -
# Target file:     all_trainseqs.fasta
# Option settings: phmmer -o /dev/null --tblout /dev/stdout -E 10.0 - all_trainseqs.fasta
# Date:            Mon Oct 29 12:45:57 2018
# [ok]
""",
          all_identifiers=[
              'CXCR5_HUMAN/68-322_PF00001.20',
          ],
          expected=[
              hmmer_utils.HMMEROutput(
                  sequence_name='CXCR5_HUMAN/68-322',
                  true_label='PF00001.20',
                  predicted_label=hmmer_utils.NO_SEQUENCE_MATCH_FAMILY_NAME_SENTINEL,
                  score=hmmer_utils.NO_SEQUENCE_MATCH_SCORE_SENTINEL,
                  domain_evalue=hmmer_utils.NO_SEQUENCE_MATCH_DOMAIN_EVALUE_SENTINEL,
              ),
          ],
      ),
  )
  def testParsePhmmer(self, phmmer_output, all_identifiers, expected):
    actual = hmmer_utils.parse_phmmer_output(phmmer_output, all_identifiers)
    self.assertEqual(actual, expected)