Example #1
0
def filter_feature_file(feature_file, lineage_file, filtered_features_file):
    """
    Filter a feature file so that each lineage is represented by exactly one member.

    IDs are visited in file order and the first ID seen for each distinct lineage is kept; any later
    ID with an already-seen lineage is dropped. The surviving IDs and features are written to
    `filtered_features_file` with a header naming both input files.
    :param feature_file: A file containing a mapping from ID to features
    :param lineage_file: A file containing a mapping from ID to lineage
    :param filtered_features_file: The file that the filtered IDs and features are saved to
    :return: None.
    """
    ids, features = fileIO.read_feature_file(feature_file)
    lineage_dict = fileIO.read_lineage_file(lineage_file)

    # Kept as a list rather than a set: membership is tested with ==, which also works if the
    # lineage values turn out to be unhashable (e.g. lists) -- their type is not visible here.
    used_lineages = []
    which = []
    for index, identifier in enumerate(ids):
        lineage = lineage_dict[identifier]
        if lineage not in used_lineages:
            which.append(index)
            used_lineages.append(lineage)

    # Indexing with a list of positions assumes ids and features are array-like objects that
    # support it (presumably numpy arrays -- confirm against fileIO.read_feature_file).
    filtered_ids = ids[which]
    filtered_features = features[which]

    header = "# Filtered k-mer count file"
    header += "\n# Filtered from: %s" % feature_file
    header += "\n# Filtered by: %s" % lineage_file
    fileIO.save_counts(filtered_features,
                       filtered_ids,
                       filtered_features_file,
                       header=header)
Example #2
0
def load_counts(kmer_length,
                location=None,
                counts_file=None,
                identifier='fna',
                normalize=False,
                symbols=DNA):
    """
    Load IDs and k-mer counts from a pre-counted file, a single fasta file, or a directory of fasta files.

    The sources are tried in this order:
      1. `counts_file`, if it names an existing file, is loaded directly.
      2. `location`, if it names an existing file, is counted with count_file.
      3. `location`, if it names an existing directory, is counted with count_directory.
    When counts are freshly computed (cases 2 and 3) and `counts_file` was given, the new counts are
    saved to `counts_file`.
    :param kmer_length: The k-mer length to count
    :param location: The directory containing fasta files OR fasta file
    :param counts_file: The filename of a pre-counted k-mer count file to load from, or to save new counts to
    :param identifier: Passed on to count_directory when `location` is a directory
    :param normalize: Passed on to whichever loader/counter is used
    :param symbols: Symbols to use for counting
    :return: A tuple containing a list of headers, and list of k-mer counts as a numpy array, in that order
    :raises ValueError: If neither `counts_file` nor `location` refers to something that exists
    """
    if counts_file and os.path.isfile(counts_file):
        logger.info("Loading %d-mers from %s..." %
                    (kmer_length, os.path.basename(counts_file)))
        # NOTE(review): other callers in this project use fileIO.read_feature_file -- confirm that
        # fileIO.load_feature_file exists and is the intended reader.
        ids, counts = fileIO.load_feature_file(counts_file,
                                               normalize=normalize)
    elif location and os.path.isfile(location):
        logger.info("Counting %d-mers in %s..." %
                    (kmer_length, os.path.basename(location)))
        ids, counts = count_file(location,
                                 kmer_length,
                                 normalize=normalize,
                                 symbols=symbols)
        if counts_file:
            # Save to the requested path (counts_file), not the count_file function.
            fileIO.save_counts(counts, ids, counts_file)
    elif location and os.path.isdir(location):
        logger.info("Counting %d-mers in %s..." %
                    (kmer_length, os.path.basename(location)))
        ids, counts = count_directory(location,
                                      kmer_length,
                                      normalize=normalize,
                                      identifier=identifier,
                                      symbols=symbols)
        if counts_file:
            fileIO.save_counts(counts, ids, counts_file)
    else:
        # Fail with a clear message instead of an UnboundLocalError at the return below.
        raise ValueError(
            "No existing counts file, fasta file or directory to load %d-mers from "
            "(location=%r, counts_file=%r)" % (kmer_length, location, counts_file))
    logger.info("Data loaded.")
    return ids, counts
Example #3
0
    def load_data(self):
        """
        Load the positive/negative reference data and the input data into this object.

        For each of the positive and negative reference sets, features are read (normalized) from the
        features file when it exists, otherwise counted from the corresponding fasta file when that
        exists. Input features are read from `self.features_file` when it exists; otherwise they are
        counted from `self.fasta_file` and saved next to it as "<base>_features.csv". The input
        features are then normalized with kmer.normalize_counts. If neither input source exists,
        an error is logged and the process exits with status 1.
        :return: None. This object is modified in place
        """
        # Loading reference data
        if self.positive_features_file and os.path.exists(self.positive_features_file):
            logger.info("Reading positive features from: %s" % os.path.basename(self.positive_features_file))
            self.positive_ids, self.positive_data = fileIO.read_feature_file(self.positive_features_file,
                                                                             normalize=True)
        elif self.positive_fasta and os.path.exists(self.positive_fasta):
            logger.info("Counting positive k-mers from: %s" % os.path.basename(self.positive_fasta))
            self.positive_ids, self.positive_data = kmer.count(self.positive_fasta)

        if self.negative_features_file and os.path.exists(self.negative_features_file):
            logger.info("Reading negative features from: %s" % os.path.basename(self.negative_features_file))
            self.negative_ids, self.negative_data = fileIO.read_feature_file(self.negative_features_file,
                                                                             normalize=True)
        elif self.negative_fasta and os.path.exists(self.negative_fasta):
            logger.info("Counting negative k-mers from: %s" % os.path.basename(self.negative_fasta))
            self.negative_ids, self.negative_data = kmer.count(self.negative_fasta)

        self.find_input_files()
        # Loading input data
        if self.features_file is not None and os.path.exists(self.features_file):
            logger.info("Reading features from: %s..." % os.path.basename(self.features_file))
            # Read un-normalized here; normalization happens once for both branches below.
            self.data_ids, self.data_points = fileIO.read_feature_file(self.features_file)
        elif self.fasta_file is not None and os.path.exists(self.fasta_file):
            logger.info("Calculating features of: %s" % os.path.basename(self.fasta_file))
            self.data_ids, self.data_points = kmer.count_file(self.fasta_file, self.kmer_length, normalize=False)
            self.features_file = "{base}_features.csv".format(base=os.path.splitext(self.fasta_file)[0])
            logger.info("Saving features to {file}...".format(file=self.features_file))
            fileIO.save_counts(self.data_points, self.data_ids, self.features_file)
        else:
            logger.error("No input fasta file or features file. Exiting...")
            # Non-zero status so that callers/shells can detect the failure.
            exit(1)

        self.data_points = kmer.normalize_counts(self.data_points)

        # NOTE(review): `args` is not a parameter of this method; it is presumably a module-level
        # parsed-arguments object -- confirm, or move this flag onto the instance.
        if args.length_requirement:
            self.screen_by_length()
Example #4
0
    # Configure logging from the parsed arguments: args.debug is checked before args.verbose,
    # and with neither set only warnings and above are shown (using a shorter message format).
    if args.debug:
        logger.setLevel(logging.DEBUG)
        logging.basicConfig(
            format='[%(asctime)s][%(levelname)s][%(funcName)s] - %(message)s')
    elif args.verbose:
        logger.setLevel(logging.INFO)
        logging.basicConfig(
            format='[%(asctime)s][%(levelname)s][%(funcName)s] - %(message)s')
    else:
        logger.setLevel(logging.WARNING)
        logging.basicConfig(format='[log][%(levelname)s] - %(message)s')

    logger.info("Counting k-mers...")

    # NOTE(review): `input` is presumably assigned earlier in this function (shadowing the
    # builtin of the same name) -- confirm, since the assignment is not visible here.
    # A single file is counted directly; a directory is handed to count_directory, which
    # additionally receives the file identifier and the sample argument.
    if input and os.path.isfile(input):
        ids, kmers = count_file(input, kmer_length, symbols=symbols)
    elif input and os.path.isdir(input):
        ids, kmers = count_directory(input,
                                     kmer_length,
                                     symbols=symbols,
                                     identifier=file_identifier,
                                     sample=sample)
    else:
        # Neither an existing file nor an existing directory: report it and exit with status 1.
        logger.error("%s was not an acceptable file or directory" % input)
        exit(1)

    # Write the counts and their IDs to the output; the parsed arguments are passed along too.
    fileIO.save_counts(kmers, ids, output, args=args)

    logger.info("K-mer counting complete.")
Example #5
0
    def load_data(self):
        """
        Load all the data necessary for plotting into memory.

        Steps, in order:
          1. Create `self.output_directory` if it is set and does not exist yet (exits with
             status 1 if it cannot be created).
          2. Load features from `self.features_file` if it exists; otherwise count 4-mers from
             `self.fasta_file` if it exists and save them to the file named by
             `self.get_kmers_out_filename()`.
          3. Load t-SNE data from `self.tsne_file` when `self.do_tsne` is false and both the t-SNE
             file and the features file exist; otherwise run t-SNE on the features (optionally
             after PCA pre-processing) and save the result to `self.get_tsne_filename()`.
          4. Load lineages via `self.get_lineages()`.
        :return: None
        """

        if self.output_directory and not os.path.isdir(self.output_directory):
            try:
                os.mkdir(self.output_directory)
            except OSError:
                # os.mkdir reports failures as OSError; anything else (e.g. KeyboardInterrupt)
                # should not be mistaken for a directory-creation problem.
                logger.error("Could not create: %s" % self.output_directory)
                logger.error(
                    "Resolve this by creating this directory yourself and re-running"
                )
                exit(1)

        if self.features_file and os.path.exists(self.features_file):
            # Loading features
            logger.info("Loading features from: %s ..." %
                        os.path.basename(self.features_file))
            self.id_list, self.features = fileIO.read_feature_file(
                self.features_file, normalize=True)
            logger.info("Loaded features.")
        elif self.fasta_file and os.path.exists(self.fasta_file):
            # Calculating Features
            logger.info("No feature file provided, calculating features...")
            self.id_list, self.features = kmer.count_file(self.fasta_file,
                                                          4,
                                                          normalize=True)
            self.features_outfile = self.get_kmers_out_filename()
            logger.info("Calculated features. Saving features to: %s" %
                        os.path.basename(self.features_outfile))
            # NOTE(review): `args` is not a parameter of this method; it is presumably a
            # module-level parsed-arguments object -- confirm.
            fileIO.save_counts(self.features,
                               self.id_list,
                               self.features_outfile,
                               args=args)
            logger.info("Saved features.")

        # The truthiness checks keep os.path.isfile from being called with None (a TypeError),
        # which is the expected state of features_file after the fasta branch above.
        if (not self.do_tsne
                and self.tsne_file and os.path.isfile(self.tsne_file)
                and self.features_file and os.path.isfile(self.features_file)):
            # Loading t-SNE data
            logger.info("Loading t-SNE data from: %s ... " %
                        os.path.basename(self.tsne_file))
            self.id_list, self.tsne_data, _ = fileIO.read_tsne_file(
                self.tsne_file)
            logger.info("Loaded t-SNE data.")
        else:
            # Doing t-SNE
            logger.info("Performing t-SNE...")
            tsne_input = self.features
            if self.PCA_preprocess:
                logger.info("Pre-processing with PCA...")
                tsne_input = PCA(
                    n_components=self.pca_preprocess_red).fit_transform(
                        self.features)
            # The same t-SNE settings are used whether or not PCA pre-processing was applied.
            self.tsne_data = TSNE(
                perplexity=self.perplexity,
                verbose=True,
                random_state=self.tsne_seed,
                init=self.tsne_init,
                early_exaggeration=self.early_exaggeration,
                learning_rate=self.tsne_learning_rate).fit_transform(
                    tsne_input)
            logger.info("t-SNE complete.")
            self.tsne_file = self.get_tsne_filename()
            fileIO.save_tsne_data(self.tsne_file, self.tsne_data, self.id_list)
            logger.info("Saved t-SNE to: %s" %
                        os.path.basename(self.tsne_file))

        logger.info("Loading lineages from: %s ..." %
                    os.path.basename(self.lineage_file))
        self.lineages = self.get_lineages()