def __init__(self, query_path, db_path,
             params=None,
             algorithm="blastn",
             version="plus",
             out_path=None,
             executable=None):
    """Set up a BLAST search of one query FASTA file against a database.

    Parameters:
        query_path: Path of the query sequences; wrapped in a `FASTA` object.
        db_path:    Path of the database; wrapped in a `FilePath` object.
        params:     Optional dictionary of extra command line parameters.
                    It is copied, so the caller's dictionary is never
                    modified when thread defaults are added below.
        algorithm:  Name of the algorithm, stored as is (default "blastn").
        version:    Either "plus" or "legacy" (default "plus"). Only these
                    two values trigger a default thread-count parameter.
        out_path:   Where results go. If None, the query's `prefix_path`
                    plus '.blastout' is used. If it ends with '/', it is
                    treated as a directory and the query's `prefix` plus
                    '.blastout' is appended. Otherwise it is used as is.
        executable: Optional path of the executable; wrapped in `FilePath`.
    """
    # Save attributes #
    self.path = query_path
    self.query = FASTA(query_path)
    self.db = FilePath(db_path)
    self.version = version
    self.algorithm = algorithm
    # Copy so that adding defaults below does not leak into the caller's dict #
    self.params = dict(params) if params else {}
    self.executable = FilePath(executable)
    # Output #
    if out_path is None:
        self.out_path = self.query.prefix_path + '.blastout'
    elif out_path.endswith('/'):
        self.out_path = out_path + self.query.prefix + '.blastout'
    else:
        self.out_path = out_path
    self.out_path = FilePath(self.out_path)
    # Defaults: use every core unless the caller already chose a thread count #
    self.cpus = multiprocessing.cpu_count()
    if self.version == 'plus':
        if '-num_threads' not in self.params:
            self.params['-num_threads'] = self.cpus
    if self.version == 'legacy':
        if '-a' not in self.params:
            self.params['-a'] = self.cpus
def only_top_sequences(self):
    """Make a new fasta file where only the top N sequences are included
    (in terms of their abundance).

    Skipped if no abundance info is given or if `self.N` is None: in both
    cases `self.renamed_fasta` is returned unchanged.

    If N is larger than the number of sequences in the input file, a
    `UserWarning` is issued and N is lowered to that number.

    Returns the `FASTA` object for 'top_seqs.fasta' in the output directory.
    """
    # Nothing to select on #
    if not self.abundances:
        return self.renamed_fasta
    if self.N is None:
        return self.renamed_fasta
    # Parse it #
    N = int(self.N)
    # Create file #
    only_top_fasta = FASTA(self.out_dir + 'top_seqs.fasta')
    # Print status #
    print("Using: " + self.renamed_fasta)
    print("--> STEP 1B: Get the top %i sequences (in terms of their abundances)." % N)
    # Check the user inputted value #
    if N > self.input_file.count:
        msg = "You asked for the top %i sequences"
        msg += ", but your input file only contains %i sequences!"
        # Format with the parsed integer: self.N may still be a string,
        # which '%i' would reject with a TypeError.
        msg = msg % (N, self.input_file.count)
        warnings.warn(msg, UserWarning)
        N = self.input_file.count
    # Do it: rank rows by their summed abundance and keep the N largest #
    totals = self.df_abundances.sum(axis=1)
    ids = totals.sort_values(ascending=False).index[0:N]
    # Translate the original names to the renamed ones #
    ids = {self.orig_names_to_renamed[x] for x in ids}
    self.renamed_fasta.extract_sequences(only_top_fasta, ids)
    self.timer.print_elapsed()
    return only_top_fasta
def __init__(self, input_file,
             seq_type='nucl',
             search_algo='blast',
             search_db='nt',
             normalization='flat',
             proportional=True,
             backtracking=False,
             restrict=None,
             num_threads=None,
             out_dir=None,
             min_identity=0.97,
             e_value=0.0001,
             max_targets=10,
             min_coverage=0.97,
             abundances=None,
             N=None):
    """Validate and store every option of the pipeline.

    The input FASTA file must exist, and so must the abundance file when
    one is given. `normalization` must be one of 'flat', 'ui' or 'upui'.
    `restrict`, when given, must start with 'ENVO:' and be one of the
    values of `self.serial_to_concept`. Any violation raises `Exception`.

    The number of threads defaults to the CPU count capped at 32 and is
    never larger than the number of sequences in the input file. The
    output directory defaults to the directory of the input file and is
    created if it does not exist yet.
    """
    # The sequences to process #
    self.input_file = FASTA(input_file)
    self.input_file.must_exist()
    # The optional abundance file #
    self.abundances = FilePath(abundances)
    if self.abundances:
        self.abundances.must_exist()
    # Plain options #
    self.N = N
    self.seq_type = seq_type
    self.backtracking = bool(backtracking)
    self.proportional = bool(proportional)
    # Normalization must be a known scheme #
    options = ('flat', 'ui', 'upui')
    if normalization not in options:
        raise Exception('Normalization has to be one of %s' % (','.join(options)))
    self.normalization = normalization
    # Restrict must look like, and be, an ENVO term #
    if restrict:
        if restrict[:5] != 'ENVO:':
            raise Exception("The '--restrict' parameter must be an ENVO term, not '%s'." % restrict)
        if restrict not in self.serial_to_concept.values():
            raise Exception("The '--restrict' parameter must be a known ENVO term.")
    self.restrict = restrict
    # What to search with, and against what #
    self.search_algo = search_algo
    self.search_db = search_db
    # Threads: never more than there are sequences #
    if num_threads is None:
        wanted_threads = min(multiprocessing.cpu_count(), 32)
    else:
        wanted_threads = int(num_threads)
    self.num_threads = min(wanted_threads, self.input_file.count)
    # Thresholds used when filtering hits #
    self.min_identity = float(min_identity)
    self.e_value = float(e_value)
    self.max_targets = int(max_targets)
    self.min_coverage = float(min_coverage)
    # Time the pipeline execution #
    self.timer = Timer()
    # All outputs live in one directory, always ending with a slash #
    self.out_dir = self.input_file.directory if out_dir is None else out_dir
    if not self.out_dir.endswith('/'):
        self.out_dir += '/'
    if not os.path.exists(self.out_dir):
        os.makedirs(self.out_dir)
    # The object that can make the outputs for the user #
    self.outputs = OutputGenerator(self)
def renamed_fasta(self):
    """Return the FASTA file 'renamed.fasta' located in the output directory.

    If that file does not exist yet, it is produced by renaming the
    sequences of the input FASTA file according to the mapping
    `self.orig_names_to_renamed` (the original documentation describes
    the new names as "C1", "C2", "C3" etc.). If it already exists it is
    returned directly, without redoing the work.
    """
    target = FASTA(self.out_dir + 'renamed.fasta')
    # Only do the renaming when no earlier result is present #
    if not target.exists:
        print("--> STEP 1: Parse the input FASTA file.")
        self.input_file.rename_sequences(target, self.orig_names_to_renamed)
        self.timer.print_elapsed()
    return target
def __init__(self, query_path, db_path,
             params=None,
             out_path=None,
             executable=None):
    """Set up a VSEARCH search of one query FASTA file against a database.

    Parameters:
        query_path: Path of the query sequences; wrapped in a `FASTA` object.
        db_path:    The database, stored as given.
        params:     Optional dictionary of extra parameters; an empty
                    dictionary is used when none (or an empty one) is given.
        out_path:   Where results go. If None, the query's `prefix_path`
                    plus '.vsearchout' is used. If it ends with '/', it is
                    treated as a directory and the query's `prefix` plus
                    '.vsearchout' is appended. Otherwise it is used as is.
        executable: Optional path of the executable; wrapped in `FilePath`.
    """
    # Inputs #
    self.query = FASTA(query_path)
    self.db = db_path
    self.params = params or {}
    self.executable = FilePath(executable)
    # Work out where the results are written #
    if out_path is None:
        destination = self.query.prefix_path + '.vsearchout'
    elif out_path.endswith('/'):
        destination = out_path + self.query.prefix + '.vsearchout'
    else:
        destination = out_path
    self.out_path = FilePath(destination)
def __init__(self, path, num_parts=None, part_size=None, base_dir=None):
    """Describe how a FASTA file is divided into numbered parts.

    Parameters:
        path:      Path of the file to split.
        num_parts: The number of parts wanted.
        part_size: The target size of one part, compared directly against
                   `self.count_bytes` (so presumably in bytes; parsing of
                   human readable sizes is disabled). When given, it takes
                   precedence over `num_parts`.
        base_dir:  Directory holding the parts. Defaults to the path of
                   the file with '.parts/' appended.

    Each part is a `FASTA` object at '<base_dir>NNN/part.fasta' where NNN
    is a zero padded number starting at 001.
    """
    # Basic #
    self.path = path
    # Directory #
    if base_dir is None:
        self.base_dir = DirectoryPath(path + '.parts/')
    else:
        self.base_dir = DirectoryPath(base_dir)
    # Num parts #
    if num_parts is not None:
        self.num_parts = num_parts
    # Evaluate size #
    if part_size is not None:
        self.bytes_target = part_size  # humanfriendly.parse_size(part_size)
        # Force float division: under Python 2 an int / int division
        # truncates before ceil() is applied, losing the last partial part.
        ratio = self.count_bytes / float(self.bytes_target)
        self.num_parts = int(math.ceil(ratio))
    # Make parts #
    self.make_name = lambda i: self.base_dir + "%03d/part.fasta" % i
    self.parts = [FASTA(self.make_name(i))
                  for i in range(1, self.num_parts + 1)]
    # Give a number to each part (starts at 0, unlike the directory names) #
    for i, part in enumerate(self.parts):
        part.num = i
""" ======================== Generate fake abundances ======================== """ # Modules # import os, inspect, numpy, pandas, names from seqenv.fasta import FASTA # Constants # current_script = inspect.getframeinfo(inspect.currentframe()).filename current_dir = os.path.dirname(os.path.abspath(current_script)) + '/' fasta = FASTA(current_dir + "../examples/samples/community.fasta") ################################################################################ def data(): """Create some fake data in a dataframe""" x_size = len(fasta) y_size = 10 numpy.random.seed(0) M = numpy.random.randint(0, 1000, (x_size, y_size)) df = pandas.DataFrame( M, index=[seq.id for seq in fasta], columns=[names.get_first_name() for j in range(y_size)]) return df df = data()