def get_by_directory(directory):
    """ Rename the record ids in every file of a directory using the file name

    Each regular file directly inside ``directory`` is parsed, every record id
    gets ``_<file name without extension>_<record number>`` appended, and the
    records are written under the same file name into ``<directory>/tmp``.
    The input files themselves are not modified.

    :param directory: (str) Name of directory with files whose ids to rename
    """
    # Output directory, created if it does not exist yet
    new_location = os.path.join(directory, "tmp")
    os.makedirs(new_location, exist_ok=True)
    for file in os.listdir(directory):
        fullpath_old = os.path.join(directory, file)
        # listdir returns bare names; testing the bare name would resolve it
        # against the current working directory instead of ``directory``
        if not os.path.isfile(fullpath_old):
            continue
        # Split file name from extension and detect the sequence format
        data_type = BioOps.get_type(file)
        _file, ext = os.path.splitext(file)
        records_in_file = BioOps.parse_large(fullpath_old, data_type)
        # Append file name and a running number so ids stay distinguishable
        for num, record in enumerate(records_in_file):
            record.id += "_" + _file + "_" + str(num)
        # Write renamed version of file to the tmp directory
        fullpath_new = os.path.join(new_location, _file + ext)
        with open(fullpath_new, "w") as out_handle:
            SeqIO.write(records_in_file, out_handle, data_type)
def get_seq_from_file(file_name, seq_id):
    """ Locate sequence by ID/description (regex supported) and save to file

    An exact id match is tried first. If there is none, ``seq_id`` is used as
    a regular expression and every record whose id or description matches is
    selected, in the order the records were parsed. The selection is written
    next to the input file as ``<seq_id>.<file name without extension>``.

    :param file_name: (str) User-passed name of file
    :param seq_id: (str) User-passed sequence id to get from file
    :raises SequenceIdNotFoundError: If neither an id nor a description matches
    """
    file_type = BioOps.get_type(file_name)
    data = {
        record.id: record
        for record in BioOps.parse_large(file_name, file_type)
    }
    # seq_num stays None for an exact match; it selects the message printed below
    seq_num = None
    if seq_id in data:
        value_to_return = [data[seq_id]]
    else:
        # One pass over the records, so a record whose id and description
        # both match is selected and counted only once
        value_to_return = [
            record for _id, record in data.items()
            if re.search(seq_id, _id) or re.search(seq_id, record.description)
        ]
        if not value_to_return:
            print("Could not locate %s in %s" % (seq_id, file_name))
            raise SequenceIdNotFoundError(seq_id)
        seq_num = len(value_to_return)
    file_stem = os.path.splitext(file_name)[0]
    # Prefix only the base name so a directory in file_name stays intact
    out_file = os.path.join(
        os.path.dirname(file_stem),
        "{}.{}".format(seq_id, os.path.basename(file_stem)))
    # Write found sequence(s) through a single handle that is always closed
    with open(out_file, "w") as out_handle:
        SeqIO.write(value_to_return, out_handle, file_type)
    if seq_num is None:
        print(" Sequence {} copied from file {} to {}".format(
            seq_id, file_stem, out_file))
    else:
        print("{} sequence(s) copied from file {} to {}".format(
            seq_num, file_stem, out_file))
def reverse_complement_file(file_name):
    """ Reverse complements sequences in file and writes them to a new file

    Every record id gets ``.revComp`` appended. Output goes to
    ``tmp.<file name>`` in the same directory as the input file.

    :param file_name: (str) Name of file
    """
    data_type = BioOps.get_type(file_name)
    # reverse_complement() returns a new record instead of changing the record
    # in place, so the returned records are the ones that must be written
    reversed_records = [
        record.reverse_complement(
            id=record.id + ".revComp", name=True, description=True)
        for record in BioOps.parse_large(file_name, data_type)
    ]
    # Prefix only the base name so a directory in file_name stays intact
    directory, base_name = os.path.split(file_name)
    SeqIO.write(reversed_records,
                os.path.join(directory, "tmp." + base_name), data_type)
def rewrite_ids_in_fastx(image_mapper, file_name, remove_description=True):
    """ Replace fastx record ids with their mapped values and write a new file

    The output file name is ``file_name`` plus the suffix that
    ``IndexExtensions.match`` holds for the detected format.

    :param image_mapper: (IndexMapper) Mapped index file, queried with ``get``
    :param file_name: (str) Name of fastx file
    :param remove_description: (bool) Blank out each record's description
    """
    fastx_format = BioOps.get_type(file_name)
    parsed = BioOps.parse_large(file_name, fastx_format)
    for entry in parsed:
        # Look up the replacement id for the current one
        entry.id = image_mapper.get(entry.id)
        if remove_description:
            entry.description = ""
    output_name = file_name + IndexExtensions.match[fastx_format]
    SeqIO.write(parsed, output_name, fastx_format)
def get_by_file(file_name):
    """ Replace each sequence id in a file with ``<file_name>_<number>``

    Records are numbered from 0 in the order they are parsed, and the result
    is written to ``<file_name>.tmp``.

    :param file_name: (str) Name of file whose sequence ids to rename
    """
    # Destination for the renamed records
    renamed_path = file_name + ".tmp"
    seq_format = BioOps.get_type(file_name)
    parsed_records = BioOps.parse_large(file_name, seq_format)
    # The running number keeps ids unique within the file
    for position, entry in enumerate(parsed_records):
        entry.id = file_name + "_" + str(position)
    with open(renamed_path, "w") as handle:
        SeqIO.write(parsed_records, handle, seq_format)
def get_seqs_from_file(file_name, number):
    """ Retrieve a number of sequences from a file by position

    The selected records are written to ``selected.<file name>`` in the same
    directory as the input file.

    :param file_name: (str) Name of file to parse
    :param number: (str) User-passed number, list of numbers, or range to get
    """
    indices_to_get = _translate_number_string_to_interval(number)
    data_type = BioOps.get_type(file_name)
    data = BioOps.parse_large(file_name, data_type)
    # Pick the records at the user-passed indices (list, range, or value)
    to_return = [data[index] for index in indices_to_get]
    # Prefix only the base name so a directory in file_name stays intact
    directory, base_name = os.path.split(file_name)
    out_file = os.path.join(directory, "selected.{}".format(base_name))
    # Write found sequences through a single handle that is always closed
    with open(out_file, "w") as out_handle:
        SeqIO.write(to_return, out_handle, data_type)
    print(" {} sequences copied from file {} to {}".format(
        len(to_return), file_name, out_file))
def _collect_fasta_metadata(lines):
    """ Tally header and sequence characters for each FASTA record

    :param lines: (Iterable[str]) Decoded lines without trailing newlines
    :return: (Tuple[int, Dict[str, Tuple[Counter, Counter]]]) Number of
        records read and per-record counters keyed by header text
    """
    metadata = {}
    num_records = 0
    header = None
    sequence_parts = []
    for line in lines:
        if line.startswith(">"):
            # A new header closes the record collected so far
            if header is not None:
                metadata[header] = (Counter(header),
                                    Counter("".join(sequence_parts)))
            header = line.strip(">")
            sequence_parts = []
            num_records += 1
        elif header is not None:
            sequence_parts.append(line)
    if header is not None:
        metadata[header] = (Counter(header), Counter("".join(sequence_parts)))
    return num_records, metadata


def _collect_fastq_metadata(lines):
    """ Tally header, sequence and quality characters for each FASTQ record

    :param lines: (Iterable[str]) Decoded lines without trailing newlines
    :return: (Tuple[int, Dict[str, Tuple[Counter, Counter, Counter]]]) Number
        of records read and per-record counters keyed by header text
    """
    lines = iter(lines)
    metadata = {}
    num_records = 0
    for line in lines:
        # Skip anything (e.g. blank lines) that precedes the next "@" header
        if not line.startswith("@"):
            continue
        header = line.strip("@")
        # Sequence lines run until the "+" separator line
        sequence_parts = []
        for line in lines:
            if line.startswith("+"):
                break
            sequence_parts.append(line)
        sequence = "".join(sequence_parts)
        # Quality is read by length, not by marker: "@" is a legal quality
        # character, so a quality line may look like a header line
        quality_parts = []
        remaining = len(sequence)
        while remaining > 0:
            line = next(lines, None)
            if line is None:
                break
            quality_parts.append(line)
            remaining -= len(line)
        metadata[header] = (Counter(header), Counter(sequence),
                            Counter("".join(quality_parts)))
        num_records += 1
    return num_records, metadata


def summarize_file(file_name, view):
    """ Outputs character count summary of file

    Prints the general summary first, then the short ("s") or long ("l")
    per-character summary depending on the corrected view value. Files whose
    detected type is neither "fasta" nor "fastq" are reported with 0 records.

    :param view: (str) User-passed view value
    :param file_name: (str) Name of file for which to gather character data
    """
    # Corrected view name and data type
    data_type = BioOps.get_type(file_name)
    view = _view_corrector(view)
    record_metadata = {}
    num_records = 0
    with open(file_name, "rb") as handle:
        # Decode once here so both parsers work on str rather than bytes
        lines = (raw_line.decode().rstrip("\r\n") for raw_line in handle)
        if data_type == "fasta":
            num_records, record_metadata = _collect_fasta_metadata(lines)
        elif data_type == "fastq":
            num_records, record_metadata = _collect_fastq_metadata(lines)
    # Output general summary
    _summary_all(num_records, file_name)
    # Output based on user-passed value
    if view == "s":
        _summary_short(record_metadata)
    elif view == "l":
        _summary_long(record_metadata)
def remove_ambiguity_from_file(file_name):
    """ Removes ambiguous nucleotide codes from a data file, in place

    Every character in R, Y, W, S, M, K, H, B, V, D, N (either case) is
    dropped from each sequence and the file is overwritten with the result.

    :param file_name: (str) Name of file to edit
    """
    ambiguous_codes = frozenset("RYWSMKHBVDN")
    data = []
    data_type = BioOps.get_type(file_name)
    records_in_file = BioOps.parse_large(file_name, data_type)
    for record in records_in_file:
        # Build new sequence without the ambiguous characters
        new_seq = "".join(
            val for val in record.seq if val.upper() not in ambiguous_codes)
        # Try DNA sequence type first
        try:
            record.seq = Seq(new_seq, IUPAC.unambiguous_dna)
        # Fall back to RNA otherwise
        # NOTE(review): unclear which error Seq() raises for a non-DNA
        # sequence, if any - confirm this fallback can actually trigger
        except Exception:
            record.seq = Seq(new_seq, IUPAC.unambiguous_rna)
        data.append(record)
    SeqIO.write(data, file_name, data_type)
def make_idx_for_directory_file_names(directory_name):
    """ Make index file for filenames in a directory

    Writes one tab-separated line per directory entry: the entry name, then a
    generated name with the detected data type as its extension. The index
    file is named after the directory plus ``IndexExtensions.IDX_FILE`` and is
    resolved against the current working directory.

    :param directory_name: (str) Name of directory with files to rename
    """
    # normpath drops a trailing separator, so "reads" and "reads/" name the
    # same index file; dirname() alone is empty when no separator is present
    index_path = os.path.join(
        os.getcwd(),
        os.path.normpath(directory_name) + IndexExtensions.IDX_FILE)
    with open(index_path, "wb") as index_file:
        # Sorted so the index lines come out in a stable order
        for _file in sorted(os.listdir(directory_name)):
            data_type = BioOps.get_type(_file)
            index_file.write(
                ("%s\t%s\n" % (_file, IndexCreator._line_edit(16) + "." +
                               data_type)).encode())
def _summary_long(record_metadata):
    """ Print a per-record summary, one record after another in sorted id order

    :param record_metadata: (Dict[str, Tuple[Counter, Counter]]) Dict with file
        data; a third Counter, when present, holds quality characters
    """
    for record_id in sorted(record_metadata):
        counters = record_metadata[record_id]
        print("Id: %s" % record_id)
        # Header and sequence counts share the same output layout
        for title, position in (("Values found in header:", 0),
                                ("Values found in sequences:", 1)):
            print(title)
            for symbol, count in counters[position].items():
                print("# %s: %i" % (symbol, count))
        if len(counters) != 3:
            continue
        # Quality characters are shown as their phred value
        print("Values found in quality scores:")
        for symbol, count in counters[2].items():
            phred = BioOps.calculate_phred([symbol])[0]
            print("# %s: %i" % (phred, count))
def _summary_short(record_metadata):
    """ Print a short summary with character counts totalled over all records

    :param record_metadata: (Dict[str, Tuple[Counter, Counter]]) Dict with file
        data; a third Counter, when present, holds quality characters
    """
    all_header_metadata = Counter()
    all_sequence_metadata = Counter()
    all_quality_metadata = Counter()
    for met_tuple in record_metadata.values():
        all_header_metadata += met_tuple[0]
        all_sequence_metadata += met_tuple[1]
        if len(met_tuple) == 3:
            all_quality_metadata += met_tuple[2]
    print("Values found in headers:")
    for k, v in all_header_metadata.items():
        print("# %s: %i" % (k, v))
    print("Values found in sequences:")
    for k, v in all_sequence_metadata.items():
        print("# %s: %i" % (k, v))
    if all_quality_metadata:
        print("Values found in quality scores:")
        # Report the quality totals here, not the sequence totals
        for k, v in all_quality_metadata.items():
            print("# %s: %i" % (BioOps.calculate_phred([k])[0], v))