def filter_feature_file(feature_file, lineage_file, filtered_features_file):
    """
    This function takes a file with features and filters it such that only one member of each
    lineage is chosen to represent the whole lineage
    :param feature_file: A file containing a mapping from ID to features
    :param lineage_file: A file containing a mapping from ID to lineage
    :param filtered_features_file: The filename to save the filtered features to
    :return: None
    """
    ids, features = fileIO.read_feature_file(feature_file)
    lineage_dict = fileIO.read_lineage_file(lineage_file)

    # Keep the index of the first member seen for each lineage
    used_lineages = []
    which = []
    for i in range(len(ids)):
        if lineage_dict[ids[i]] not in used_lineages:
            which.append(i)
            used_lineages.append(lineage_dict[ids[i]])

    filtered_ids = ids[which]
    filtered_features = features[which]

    header = "# Filtered k-mer count file"
    header += "\n# Filtered from: %s" % feature_file
    header += "\n# Filtered by: %s" % lineage_file
    fileIO.save_counts(filtered_features, filtered_ids, filtered_features_file, header=header)
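# A self-contained sketch of the selection idea above (hedged: the IDs, lineages, and
# feature matrix are made up for illustration, not data from this project): keep the
# first index seen for each lineage, then fancy-index the numpy arrays with those indices.
import numpy as np

example_ids = np.array(["A1", "A2", "B1", "B2"])
example_features = np.random.rand(4, 256)                # toy 4-mer feature matrix
example_lineage = {"A1": "lineage_A", "A2": "lineage_A",
                   "B1": "lineage_B", "B2": "lineage_B"}

seen, which = set(), []
for i, seq_id in enumerate(example_ids):
    if example_lineage[seq_id] not in seen:              # first member of each lineage wins
        which.append(i)
        seen.add(example_lineage[seq_id])

filtered_ids = example_ids[which]                        # -> ["A1", "B1"]
filtered_features = example_features[which]              # -> rows 0 and 2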
def load_counts(kmer_length, location=None, counts_file=None, identifier='fna', normalize=False, symbols=DNA):
    """
    This function loads headers and k-mer counts from either a directory containing fasta files,
    a single fasta file, or a pre-counted k-mer count file. It will first check for a pre-counted
    file and will then check whether "location" is a directory or a file to decide how to count it
    :param kmer_length: The k-mer length to count
    :param location: The directory containing fasta files OR a single fasta file
    :param counts_file: The filename of a pre-counted k-mer count file (loaded if present, otherwise written)
    :param identifier: The file extension that identifies fasta files within a directory
    :param normalize: If True, normalize the k-mer counts
    :param symbols: Symbols to use for counting
    :return: A tuple containing a list of headers and the k-mer counts as a numpy array, in that order
    """
    if counts_file and os.path.isfile(counts_file):
        logger.info("Loading %d-mers from %s..." % (kmer_length, os.path.basename(counts_file)))
        ids, counts = fileIO.load_feature_file(counts_file, normalize=normalize)
    elif location and os.path.isfile(location):
        logger.info("Counting %d-mers in %s..." % (kmer_length, os.path.basename(location)))
        ids, counts = count_file(location, kmer_length, normalize=normalize, symbols=symbols)
        if counts_file:
            fileIO.save_counts(counts, ids, counts_file)
    elif location and os.path.isdir(location):
        logger.info("Counting %d-mers in %s..." % (kmer_length, os.path.basename(location)))
        ids, counts = count_directory(location, kmer_length, normalize=normalize, identifier=identifier, symbols=symbols)
        if counts_file:
            fileIO.save_counts(counts, ids, counts_file)
    logger.info("Data loaded.")
    return ids, counts
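# A hedged usage sketch of load_counts. The module name "kmer" and the paths below are
# assumptions for illustration only. It shows the caching behaviour: if counts_file
# already exists it is loaded directly; otherwise the fasta input is counted and, because
# a counts_file name was given, the counts are saved for reuse on the next run.
import kmer

ids, counts = kmer.load_counts(4, location="genomes/", counts_file="genomes_4mers.csv",
                               identifier="fna", normalize=True)
print(len(ids), counts.shape)    # one count vector per sequence; 4 ** 4 = 256 columns for DNA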
def load_data(self):
    """
    This function loads the relevant data into a phamer_scorer object
    :return: None. The scorer object is modified in place
    """
    # Loading reference data
    if self.positive_features_file and os.path.exists(self.positive_features_file):
        logger.info("Reading positive features from: %s" % os.path.basename(self.positive_features_file))
        self.positive_ids, self.positive_data = fileIO.read_feature_file(self.positive_features_file, normalize=True)
    elif self.positive_fasta and os.path.exists(self.positive_fasta):
        logger.info("Counting positive k-mers from: %s" % os.path.basename(self.positive_fasta))
        self.positive_ids, self.positive_data = kmer.count(self.positive_fasta)

    if self.negative_features_file and os.path.exists(self.negative_features_file):
        logger.info("Reading negative features from: %s" % os.path.basename(self.negative_features_file))
        self.negative_ids, self.negative_data = fileIO.read_feature_file(self.negative_features_file, normalize=True)
    elif self.negative_fasta and os.path.exists(self.negative_fasta):
        logger.info("Counting negative k-mers from: %s" % os.path.basename(self.negative_fasta))
        self.negative_ids, self.negative_data = kmer.count(self.negative_fasta)

    self.find_input_files()

    # Loading input data
    if self.features_file is not None and os.path.exists(self.features_file):
        logger.info("Reading features from: %s..." % os.path.basename(self.features_file))
        self.data_ids, self.data_points = fileIO.read_feature_file(self.features_file)
    elif self.fasta_file is not None and os.path.exists(self.fasta_file):
        logger.info("Calculating features of: %s" % os.path.basename(self.fasta_file))
        self.data_ids, self.data_points = kmer.count_file(self.fasta_file, self.kmer_length, normalize=False)
        self.features_file = "{base}_features.csv".format(base=os.path.splitext(self.fasta_file)[0])
        logger.info("Saving features to {file}...".format(file=self.features_file))
        fileIO.save_counts(self.data_points, self.data_ids, self.features_file)
    else:
        logger.error("No input fasta file or features file. Exiting...")
        exit()

    self.data_points = kmer.normalize_counts(self.data_points)

    if self.length_requirement:
        self.screen_by_length()
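# A hypothetical driver for load_data; the phamer_scorer constructor call and the
# attribute names below mirror what the method reads, but they are assumptions made
# for illustration, not a confirmed API.
scorer = phamer_scorer()
scorer.positive_fasta = "phage_genomes.fasta"       # reference positives (placeholder path)
scorer.negative_fasta = "bacteria_genomes.fasta"    # reference negatives (placeholder path)
scorer.fasta_file = "contigs.fasta"                 # sequences to score (placeholder path)
scorer.kmer_length = 4
scorer.length_requirement = 5000                    # minimum contig length, if screening is wanted
scorer.load_data()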
if args.debug:
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(format='[%(asctime)s][%(levelname)s][%(funcName)s] - %(message)s')
elif args.verbose:
    logger.setLevel(logging.INFO)
    logging.basicConfig(format='[%(asctime)s][%(levelname)s][%(funcName)s] - %(message)s')
else:
    logger.setLevel(logging.WARNING)
    logging.basicConfig(format='[log][%(levelname)s] - %(message)s')

logger.info("Counting k-mers...")
if input and os.path.isfile(input):
    ids, kmers = count_file(input, kmer_length, symbols=symbols)
elif input and os.path.isdir(input):
    ids, kmers = count_directory(input, kmer_length, symbols=symbols, identifier=file_identifier, sample=sample)
else:
    logger.error("%s was not an acceptable file or directory" % input)
    exit(1)

fileIO.save_counts(kmers, ids, output, args=args)
logger.info("K-mer counting complete.")
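# For context, a hedged sketch of the argument parsing this script block appears to expect.
# The flag names are illustrative guesses, not the script's documented interface; they only
# show where args.debug, args.verbose, input, output, kmer_length, file_identifier, sample,
# and symbols could come from.
import argparse
import logging
import os

parser = argparse.ArgumentParser(description="Count k-mers in a fasta file or directory")
parser.add_argument("-i", "--input", help="fasta file or directory of fasta files")
parser.add_argument("-o", "--output", help="output k-mer count file")
parser.add_argument("-k", "--kmer_length", type=int, default=4)
parser.add_argument("-id", "--file_identifier", default="fna")
parser.add_argument("-s", "--sample", type=int, default=None, help="number of files to sample")
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("-d", "--debug", action="store_true")
args = parser.parse_args()

input, output = args.input, args.output
kmer_length, file_identifier, sample = args.kmer_length, args.file_identifier, args.sample
symbols = 'ATGC'                          # assumed DNA alphabet
logger = logging.getLogger(__name__)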
def load_data(self):
    """
    This function loads all the data necessary for plotting into memory. This function will also
    make the necessary directories for where the outputs should be placed
    :return: None
    """
    if self.output_directory and not os.path.isdir(self.output_directory):
        try:
            os.mkdir(self.output_directory)
        except OSError:
            logger.error("Could not create: %s" % self.output_directory)
            logger.error("Resolve this by creating this directory yourself and re-running")
            exit(1)

    if self.features_file and os.path.exists(self.features_file):
        # Loading features
        logger.info("Loading features from: %s ..." % os.path.basename(self.features_file))
        self.id_list, self.features = fileIO.read_feature_file(self.features_file, normalize=True)
        logger.info("Loaded features.")
    elif self.fasta_file and os.path.exists(self.fasta_file):
        # Calculating features
        logger.info("No feature file provided, calculating features...")
        self.id_list, self.features = kmer.count_file(self.fasta_file, 4, normalize=True)
        self.features_outfile = self.get_kmers_out_filename()
        logger.info("Calculated features. Saving features to: %s" % os.path.basename(self.features_outfile))
        fileIO.save_counts(self.features, self.id_list, self.features_outfile, args=args)
        logger.info("Saved features.")

    if not self.do_tsne and os.path.isfile(self.tsne_file) and os.path.isfile(self.features_file):
        # Loading t-SNE data
        logger.info("Loading t-SNE data from: %s ... " % os.path.basename(self.tsne_file))
        self.id_list, self.tsne_data, _ = fileIO.read_tsne_file(self.tsne_file)
        logger.info("Loaded t-SNE data.")
    else:
        # Doing t-SNE
        logger.info("Performing t-SNE...")
        if self.PCA_preprocess:
            logger.info("Pre-processing with PCA...")
            pca_data = PCA(n_components=self.pca_preprocess_red).fit_transform(self.features)
            self.tsne_data = TSNE(perplexity=self.perplexity, verbose=True, random_state=self.tsne_seed,
                                  init=self.tsne_init, early_exaggeration=self.early_exaggeration,
                                  learning_rate=self.tsne_learning_rate).fit_transform(pca_data)
        else:
            self.tsne_data = TSNE(perplexity=self.perplexity, verbose=True, random_state=self.tsne_seed,
                                  init=self.tsne_init, early_exaggeration=self.early_exaggeration,
                                  learning_rate=self.tsne_learning_rate).fit_transform(self.features)
        logger.info("t-SNE complete.")
        self.tsne_file = self.get_tsne_filename()
        fileIO.save_tsne_data(self.tsne_file, self.tsne_data, self.id_list)
        logger.info("Saved t-SNE to: %s" % os.path.basename(self.tsne_file))

    logger.info("Loading lineages from: %s ..." % os.path.basename(self.lineage_file))
    self.lineages = self.get_lineages()
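# The PCA-then-t-SNE step above follows the standard scikit-learn pattern; a standalone,
# hedged sketch with made-up data and illustrative hyperparameters (50 PCA components,
# default-ish t-SNE settings) looks like this:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

toy_features = np.random.rand(500, 256)          # 500 sequences x 256 4-mer features (toy data)

pca_data = PCA(n_components=50).fit_transform(toy_features)
tsne_data = TSNE(perplexity=30, random_state=0, init="pca",
                 early_exaggeration=12.0, learning_rate=200.0).fit_transform(pca_data)
print(tsne_data.shape)                           # (500, 2) embedding ready for plotting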