Example #1
0
 def test_default_x_test(self):
     (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file)
     correct = [str(x) for x in range(self.length) if (x % 2) == 1]
     val = False
     if np.array_equal(np.unique(x_test, return_counts=True),
                       np.unique(correct, return_counts=True)):
         val = True
     self.assertTrue(val)
Example #2
0
 def test_ova_x_train(self):
     (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                         one_vs_all='a')
     correct = [str(x) for x in range(self.length) if (x % 2) == 0]
     val = False
     if np.array_equal(np.unique(x_train, return_counts=True),
                       np.unique(correct, return_counts=True)):
         val = True
     self.assertTrue(val, msg='\n' + str(correct) + '\n' + str(x_train))
Example #3
0
 def test_default_y_train(self):
     (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file)
     correct = [
         self.classifications[x % self.classes] for x in range(self.length)
         if (x % 2) == 0
     ]
     val = False
     if np.array_equal(np.unique(y_train, return_counts=True),
                       np.unique(correct, return_counts=True)):
         val = True
     self.assertTrue(val)
Example #4
0
 def test_blacklist(self):
     bl = np.unique(
         np.random.randint(self.length, size=(int(0.1 * self.length))))
     (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                         blacklist=bl)
     count1 = 0
     count2 = 0
     for elem in bl:
         count2 += 1
         if elem not in x_train and elem not in x_test:
             count1 += 1
     self.assertEqual(count1, count2)
Example #5
0
 def test_removed_x_test(self):
     if self.classes <= 2:
         return unittest.skip('Too few classes')
     (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                         remove='a')
     correct = [str(x) for x in range(self.length) if (x % 2) == 1]
     correct = [x for x in correct if int(x) % self.classes != 0]
     val = False
     if np.array_equal(np.unique(x_test, return_counts=True),
                       np.unique(correct, return_counts=True)):
         val = True
     self.assertTrue(val)
Example #6
0
def get_roary_from_list(kwargs=None,
                        roary_sheet=constants.ROARY,
                        gene_header='Gene',
                        valid_header='Valid',
                        valid_features_table=constants.ROARY_VALID):
    """
    Gets the Roary data from roary_sheet for the genomes specified by kwargs,
    uses utils.parse_metadata. Does initial feature selection by removing
    features who are not labeled as valid in valid_features_table.

    Args:
        kwargs (dict):              The arguments to pass to parse_metadata.
        roary_sheet (str):          File containing Roary data.
        gene_header (str):          Header for the column that contains the
                                    gene names.
        valid_header (str):         Header for the column that contains T/F
                                    values determining if a gene is valid.
        valid_features_table (str): csv table containing a list of valid and
                                    invalid genes.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}

    (x_train, y_train, x_test, y_test) = parse_metadata(**kwargs)

    test_files = [str(x) for x in x_test]

    roary_data = pd.read_csv(roary_sheet)
    valid_features = pd.read_csv(valid_features_table)
    features = list(valid_features[valid_header])
    roary_data = roary_data[roary_data[gene_header].isin(features)]

    valid_cols = [x_train.index(x) for x in x_train if x in list(roary_data)]
    x_train = [x_train[x] for x in valid_cols]
    y_train = [y_train[x] for x in valid_cols]

    valid_cols = [x_test.index(x) for x in x_test if x in list(roary_data)]
    x_test = [x_test[x] for x in valid_cols]
    if list(y_test):
        y_test = [y_test[x] for x in valid_cols]

    x_train = roary_data[x_train].T.values
    x_test = roary_data[x_test].T.values

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, features, test_files, le)
Example #7
0
 def test_ova_y_test(self):
     (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                         one_vs_all='a')
     correct = [
         self.classifications[x % self.classes] for x in range(self.length)
         if (x % 2) == 1
     ]
     correct = [x if x == 'a' else 'Other' for x in correct]
     val = False
     if np.array_equal(np.unique(y_test, return_counts=True),
                       np.unique(correct, return_counts=True)):
         val = True
     self.assertTrue(val)
Example #8
0
def get_omnilog_data(metadata_kwargs=None,
                     omnilog_sheet=constants.OMNILOG_DATA,
                     validate=True):
    """
    Gets the omnilog data contained in omnilog_sheet for the genomes specified
    by kwargs. Uses utils.parse_metadata

    Args:
        kwargs (dict):       The arguments to pass to parse_metadata.
        omnilog_sheet (str): File containing omnilog data.
        validate (bool):     If True y_test is created, if False y_test is an
                             empty ndarray.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder

    """
    metadata_kwargs = metadata_kwargs or {}
    metadata_kwargs['validate'] = validate

    (x_train, y_train, x_test, y_test) = parse_metadata(**metadata_kwargs)

    test_files = [str(x) for x in x_test]

    omnilog_data = pd.read_csv(omnilog_sheet, index_col=0)
    valid_cols = [x_train.index(x) for x in x_train if x in list(omnilog_data)]
    x_train = [x_train[x] for x in valid_cols]
    y_train = [y_train[x] for x in valid_cols]

    valid_cols = [x_test.index(x) for x in x_test if x in list(omnilog_data)]
    x_test = [x_test[x] for x in valid_cols]
    if validate:
        y_test = [y_test[x] for x in valid_cols]

    feature_names = omnilog_data.index

    output_data = []
    x_train = omnilog_data[x_train].T.values
    x_test = omnilog_data[x_test].T.values

    imputer = Imputer()
    x_train = imputer.fit_transform(x_train)
    x_test = imputer.transform(x_test)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return output_data, feature_names, test_files, le
Example #9
0
 def test_default_test(self):
     (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file)
     count1 = 0
     count2 = 0
     correct_x = [str(x) for x in range(self.length) if (x % 2) == 1]
     correct_y = [
         self.classifications[x % self.classes] for x in range(self.length)
         if (x % 2) == 1
     ]
     for elem in x_test:
         index = correct_x.index(elem)
         if y_test[count2] == correct_y[index]:
             count1 += 1
         count2 += 1
     self.assertEqual(count1, count2)
Example #10
0
def get_genome_regions(kwargs=None,
                       table=constants.GENOME_REGION_TABLE,
                       sep=None,
                       validate=True):
    """
    Gets genome region presence absence data from a binary table output by
    Panseq for the genomes specified by kwargs. Uses utils.parse_metadata

    Args:
        kwargs (dict):      The arguments to pass to parse_metadata.
        table (str):        binary_table.txt output from panseq.
        sep (str or None):  The separator used in table.
        validate (bool):    If True y_test is created, if False y_test is
                            an empty ndarray.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}
    kwargs['validate'] = validate

    (train_label, y_train, test_label, y_test) = parse_metadata(**kwargs)

    x_train = []
    x_test = []
    if sep is None:
        data = pd.read_csv(table, sep=sep, engine='python', index_col=0)
    else:
        data = pd.read_csv(table, sep=sep, index_col=0)

    for header in train_label:
        x_train.append(data[header].tolist())

    for header in test_label:
        x_test.append(data[header].tolist())

    x_train = np.asarray(x_train)
    x_test = np.asarray(x_test)

    feature_names = np.asarray(data.index)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_label, le)
Example #11
0
 def test_ova_test(self):
     (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                         one_vs_all='a')
     correct_x = [str(x) for x in range(self.length) if (x % 2) == 1]
     correct_y = [
         self.classifications[x % self.classes] for x in range(self.length)
         if (x % 2) == 1
     ]
     correct_y = [x if x == 'a' else 'Other' for x in correct_y]
     count1 = 0
     count2 = 0
     for elem in x_test:
         index = correct_x.index(elem)
         if y_test[count2] == correct_y[index]:
             count1 += 1
         count2 += 1
     self.assertEqual(count1, count2)
Example #12
0
def get_roary_data(kwargs=None, roary_sheet=constants.ROARY, validate=True):
    """
    Get the Roary data from roary_sheet for the genomes specified by kwargs,
    uses utils.parse_metadata.

    Args:
        kwargs (dict):      The arguments to pass to parse_metadata.
        roary_sheet (str):  File containing Roary data.
        validate (bool):    If True y_test is created, if False y_test is an
                            empty ndarray.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}
    kwargs['validate'] = validate

    (x_train, y_train, x_test, y_test) = parse_metadata(**kwargs)

    test_files = [str(x) for x in x_test]

    roary_data = pd.read_csv(roary_sheet, index_col=0)

    feature_names = roary_data.index

    valid_cols = [x_train.index(x) for x in x_train if x in list(roary_data)]
    x_train = [x_train[x] for x in valid_cols]
    y_train = [y_train[x] for x in valid_cols]

    valid_cols = [x_test.index(x) for x in x_test if x in list(roary_data)]
    x_test = [x_test[x] for x in valid_cols]
    y_test = [y_test[x] for x in valid_cols]

    x_train = roary_data[x_train].T.values
    x_test = roary_data[x_test].T.values

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
Example #13
0
 def test_removed_test(self):
     if self.classes <= 2:
         return unittest.skip('Too few classes')
     (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                         remove='a')
     correct_x = [str(x) for x in range(self.length) if (x % 2) == 1]
     correct_x = [x for x in correct_x if int(x) % self.classes != 0]
     correct_y = [
         self.classifications[x % self.classes] for x in range(self.length)
         if (x % 2) == 1
     ]
     correct_y = [x for x in correct_y if x != 'a']
     count1 = 0
     count2 = 0
     for elem in x_test:
         index = correct_x.index(elem)
         if y_test[count2] == correct_y[index]:
             count1 += 1
         count2 += 1
     self.assertEqual(count1, count2)
Example #14
0
def get_filtered_roary_data(kwargs=None,
                            roary_sheet=constants.ROARY,
                            limit=10,
                            validate=True):
    """
    Gets the Roary data from roary_sheet for the genomes specified by kwargs,
    uses utils.parse_metadata. Does initial feature selection by removing
    features whose in proportion between classes is less than limit, based on
    the feature selection done by Lupolova et. al.

    Args:
        kwargs (dict):      The arguments to pass to parse_metadata.
        roary_sheet (str):  File containing Roary data.
        limit (int):        Value used to determine which features are removed
        validate (bool):    If True y_test is created, if False y_test is an
                            empty ndarray.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}
    kwargs['validate'] = validate

    (x_train, y_train, x_test, y_test) = parse_metadata(**kwargs)

    test_files = [str(x) for x in x_test]

    roary_data = pd.read_csv(roary_sheet, index_col=0)

    class_labels = np.unique(y_train)
    classes = []
    for c in class_labels:
        class_members = [x for x in x_train if y_train[x_train.index(x)] == c]
        classes.append(roary_data[class_members].mean(axis=1) * 100)

    proportions = pd.concat(classes, axis=1)
    diffs = np.diff(proportions.values, axis=1)
    diffs = np.absolute(diffs.mean(axis=1))
    idx = list(proportions.index)
    col = ['Diff']
    avg_diff = pd.DataFrame(diffs, index=idx, columns=col)
    invalid = list(avg_diff[avg_diff['Diff'] < limit].index)
    roary_data = roary_data.drop(invalid)

    feature_names = roary_data.index

    valid_cols = [x_train.index(x) for x in x_train if x in list(roary_data)]
    x_train = [x_train[x] for x in valid_cols]
    y_train = [y_train[x] for x in valid_cols]

    valid_cols = [x_test.index(x) for x in x_test if x in list(roary_data)]
    x_test = [x_test[x] for x in valid_cols]
    if validate:
        y_test = [y_test[x] for x in valid_cols]

    x_train = roary_data[x_train].T.values
    x_test = roary_data[x_test].T.values

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
Example #15
0
def get_kmer(metadata_kwargs=None,
             kmer_kwargs=None,
             recount=False,
             database=constants.DEFAULT_DB,
             validate=True,
             complete_count=True):
    """
    Get kmer data for genomes specified in kwargs, uses kmer_counter and
    utils.parse_metadata

    Args:
        kwargs (dict):   The arguments to pass to parse_metadata
        database (str):  lmdb database to store kmer counts
        recount (bool):  If True the kmers are recounted
        k (int):         Size of kmer to be counted. Ignored if recount is
                         false
        L (int):         kmer cutoff value. Ignored if recount is false
        validate (bool): If True y_test is created, if False y_test is
                         an empty ndarray.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    if complete_count:
        counter = complete_kmer_counter
    else:
        counter = kmer_counter

    metadata_kwargs = metadata_kwargs or {}
    metadata_kwargs['validate'] = validate
    kmer_kwargs = kmer_kwargs or {}

    if 'name' in kmer_kwargs:
        name = kmer_kwargs['name']
    else:
        name = constants.DEFAULT_NAME
    if 'output_db' in kmer_kwargs:
        output_db = kmer_kwargs['output_db']
    else:
        output_db = database

    (x_train, y_train, x_test, y_test) = parse_metadata(**metadata_kwargs)

    test_files = [str(x) for x in x_test]
    all_files = x_train + x_test

    if recount:
        counter.count_kmers(all_files, database, **kmer_kwargs, force=True)
    else:
        try:
            temp = counter.get_counts(x_train, output_db, name)
        except KmerCounterError as e:
            msg = 'Warning: get_counts failed, attempting a recount'
            logging.exception(msg)
            counter.count_kmers(all_files, database, **kmer_kwargs)

    x_train = counter.get_counts(x_train, output_db, name)
    x_test = counter.get_counts(x_test, output_db, name)

    feature_names = counter.get_kmer_names(output_db, name)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
Example #16
0
def get_genome_custom_filtered(input_table=constants.GENOME_REGION_TABLE,
                               filter_table=constants.PREDICTIVE_RESULTS,
                               sep=None,
                               col='Ratio',
                               cutoff=0.25,
                               absolute=True,
                               greater=True,
                               kwargs=None):
    """
    Gets genome region presence absence data from input_table, but performs
    initial feature selection using the values in col in filter_table. Uses
    utils.parse_metadata

    Args:
        input_table (str):  A binary_table output by panseq
        filter_table (str): A csv table to filter input_table by.
        sep (str):          The delimiter used in both tables.
        col (str):          Column name for the decision column in filter_table
        cutoff (float):     What the values in col are compared to,
        absolute (bool):    If true the absolute value of values in col is used
        greater (bool):     If true values in "col" must be greater than cutoff
        kwargs (dict):      Arguments to be passed to parse_metadata.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}

    labels = parse_metadata(**kwargs)
    train_label = labels[0]
    y_train = labels[1]
    test_label = labels[2]
    y_test = labels[3]

    if sep is None:
        input_data = pd.read_csv(input_table,
                                 sep=sep,
                                 engine='python',
                                 index_col=0)
        filter_data = pd.read_csv(filter_table,
                                  sep=sep,
                                  engine='python',
                                  index_col=0)
    else:
        input_data = pd.read_csv(input_table, sep=sep, index_col=0)
        filter_data = pd.read_csv(filter_table, sep=sep, index_col=0)

    if absolute and greater:
        data = input_data.loc[filter_data.loc[
            abs(filter_data[col]) > cutoff].index]
    elif absolute and not greater:
        data = input_data.loc[filter_data.loc[
            abs(filter_data[col]) < cutoff].index]
    elif not absolute and greater:
        data = input_data.loc[filter_data.loc[filter_data[col] > cutoff].index]
    elif not absolute and not greater:
        data = input_data.loc[filter_data.loc[filter_data[col] < cutoff].index]

    x_train = []
    x_test = []

    for header in train_label:
        x_train.append(data[header].tolist())

    for header in test_label:
        x_test.append(data[header].tolist())

    x_train = np.asarray(x_train)
    x_test = np.asarray(x_test)

    feature_names = np.asarray(data.index)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_label, le)
Example #17
0
def get_genome_prefiltered(input_table=constants.GENOME_REGION_TABLE,
                           filter_table=constants.PREDICTIVE_RESULTS,
                           sep=None,
                           count=50,
                           kwargs=None):
    """
    Gets genome region presence absence from input_table for the genomes
    specified by kwargs. Does initial feature selection by using only the
    features in the top count rows of filter_table. Uses utils.parse_metadata

    Args:
        input_table (str):  A binary_table output by panseq
        filter_table (str): A table containing all the same rows as
                            input_table, but different columns.
        sep (str or None):  The delimiter used in input_table and filter_table
        count (int):        How many of the top rows to keep.
        kwargs (dict):      Arguments to be passed to parse_metadata.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}

    labels = parse_metadata(**kwargs)
    train_label = labels[0]
    y_train = labels[1]
    test_label = labels[2]
    y_test = labels[3]

    if sep is None:
        input_data = pd.read_csv(input_table,
                                 sep=sep,
                                 engine='python',
                                 index_col=0)
        validation_data = pd.read_csv(filter_table,
                                      sep=sep,
                                      engine='python',
                                      index_col=0)
    else:
        input_data = pd.read_csv(input_table, sep=sep, index_col=0)
        validation_data = pd.read_csv(validation_data, sep=sep, index_col=0)

    validation_data = validation_data.head(count)
    input_data = input_data.loc[validation_data.index]

    x_train = []
    x_test = []

    for header in train_label:
        x_train.append(input_data[header].tolist())

    for header in test_label:
        x_test.append(input_data[header].tolist())

    x_train = np.asarray(x_train)
    x_test = np.asarray(x_test)

    feature_names = np.asarray(input_data.index)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_label, le)
Example #18
0
 def setUp(self):
     self.default = parse_metadata()
     self.empty = parse_metadata(**{})