def create_sample_data(input_seqs, sample_size):
    """
    Takes a sample of size 'sample_size' from an input file containing
    sequences and their associated expression levels, and writes them to
    a separate file.

    The format of the first 2 lines of the resulting output file will be
    of the format:

    "
    number_of_seqs_in_file\t<###>
    length_of_each_sequence\t<$$$>
    "

    where '<###>' is the number of sequences in the file, and '<$$$>' is
    the length to which every sequence in the file is padded.

    Args:
    -----
        input_seqs (str) -- the absolute path of the input file
        containing sequence and expression level data to sample.

        sample_size (int) -- the number of samples to take from the
        input file.

    Returns:
    -----
        sample_seqs (str) -- the absolute path of the output file
        containing the sample of sequence and expression level data.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Input sequences file path must be passed as a string.'
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    assert isinstance(sample_size, int), 'Number of sequences to sample must be passed as an integer.'
    assert sample_size < get_seq_count(input_seqs), 'Sample size must be smaller than the number of sequences in the input file.'
    # Functionality
    # Define output file path: insert '<size>_from_' before the file name.
    index = input_seqs.rfind('/') + 1
    insert = str(sample_size) + '_from_'
    sample_seqs = input_seqs[:index] + insert + input_seqs[index:]
    # Pull sequences to create sample data
    with smart_open(input_seqs, 'r') as inf:
        inf.readline()
        inf.readline()  # skip the first 2 info lines
        all_lines = inf.readlines()
    # BUG FIX: the original code drew 50 samples in a loop, keeping only the
    # final one (and rewriting the output file each iteration). A single
    # random.sample call is equivalent and 50x cheaper.
    lines = random.sample(all_lines, sample_size)
    with smart_open(sample_seqs, 'w') as g:
        for line in lines:
            g.write(line)
    # Write number and length of sequence info to top of resulting file
    write_num_and_len_of_seqs_to_file(sample_seqs)
    return sample_seqs
def write_num_and_len_of_seqs_to_file(input_seqs): """ Prepends the number of sequences and the length of the sequences in an input file to the first 2 lines of the file. Assumes sequences have been processed so that all sequences have been padded to the same length. The first 2 lines of the input file will be in the following format after writing the info to the file: " number_of_seqs_in_file\t<###>\n length_of_each_sequence\t<$$$>\n " where '<###>' is the number of sequences in the file, and '<$$$>'is the length to which every sequence in the file is padded. Args: ----- input_seqs (str) -- the absolute path of the processed input sequences to extract information from. Returns: ----- None """ # Assertions assert isinstance(input_seqs, str), 'Absolute pathname must be passed\ as a string.' assert os.path.exists(input_seqs), 'Input file does not exist.' # Functionality num_seqs = get_seq_count(input_seqs) with smart_open(input_seqs, 'r') as f: line = check_valid_line(f.readline()) if line == 'skip_line': raise AssertionError('First line is not valid.') seq, _ = separate_seq_and_el_data(line) len_seqs = len(seq) # assumes all sequences padded to same length with smart_open(input_seqs, 'r+') as f: contents = f.read() with smart_open(input_seqs, 'w+') as f: line_to_append = 'number_of_seqs_in_file\t' + str(num_seqs) + '\n' line_to_append += 'length_of_each_sequence\t' + str(len_seqs) + '\n' if input_seqs.endswith('.gz'): line_to_append = line_to_append.encode() f.write(line_to_append + contents) return
def get_max_min_mode_length_of_seqs(input_seqs):
    """
    Scans a file of tab-separated sequence/expression-level pairs and
    reports three summary statistics of the sequence lengths.

    Args:
    -----
        input_seqs (str) -- the absolute path of the file containing the
        input sequences and their expression levels, tab separated.

    Returns:
    -----
        max_length (int) -- length of the longest sequence in the file.

        min_length (int) -- length of the shortest sequence in the file.

        modal_length (int) -- the most common sequence length in the
        file.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Path name for input file must be passed as a string.'
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    handle = smart_open(input_seqs, 'r')
    lengths = []
    for raw_line in handle:
        cleaned = check_valid_line(raw_line)
        if cleaned == 'skip_line':
            continue
        sequence, _ = separate_seq_and_el_data(cleaned)
        lengths.append(len(sequence))
    handle.close()
    # Modal length: the length value with the highest frequency.
    modal_length = max(set(lengths), key=lengths.count)
    return max(lengths), min(lengths), modal_length
def sort_by_exp_level(input_seqs):
    """
    Loads a file of sequences tab-separated with their expression
    levels and returns the rows sorted by expression level, highest
    first.

    Args:
    -----
        input_seqs (str) -- the absolute path of the input file
        containing sequences to be sorted by expression level.

    Returns:
    -----
        sorted_df (pandas.DataFrame) -- a data frame where rows are
        sorted in descending order based on expression level.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Path name for input file must be passed as a string.'
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    # Detect the optional 2-line info header so it can be skipped.
    with smart_open(input_seqs, 'r') as handle:
        first, _ = separate_seq_and_el_data(check_valid_line(handle.readline()))
        second, _ = separate_seq_and_el_data(check_valid_line(handle.readline()))
    has_header = (first == 'number_of_seqs_in_file'
                  and second == 'length_of_each_sequence')
    skip = 2 if has_header else 0
    # Load, sort descending by expression level, and renumber the rows.
    frame = pd.read_csv(input_seqs, sep='\t', names=['seq', 'el'],
                        skiprows=skip)
    sorted_df = frame.sort_values('el', ascending=False)
    sorted_df = sorted_df.reset_index().drop(columns='index')
    return sorted_df
def get_num_and_len_of_seqs_from_file(input_seqs):
    """
    Returns the number of sequences and length of sequences in an input
    file. Assumes sequences have been processed so that all sequences
    have been padded to the same length, and that the file containing
    the processed sequences has the first 2 lines in the following
    format:

    "
    number_of_seqs_in_file\t<###>
    length_of_each_sequence\t<$$$>
    "

    where '<###>' is the number of sequences in the file, and '<$$$>' is
    the length to which every sequence in the file is padded.

    Args:
    -----
        input_seqs (str) -- the absolute path of the processed input
        sequences to extract information from.

    Returns:
    -----
        num_seqs -- the number of sequences in input_seqs (numeric, as
        parsed from the header line).

        len_seqs -- the length of all the padded sequences in the input
        file (numeric, as parsed from the header line).

    Raises:
    -----
        ValueError -- if either header value is not a whole number.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Absolute pathname must be passed as a string.'
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    with smart_open(input_seqs, 'r') as f:
        # Parse first line of file containing info about num of seqs in file
        first_line = check_valid_line(f.readline())
        assert first_line != 'skip_line', 'Invalid first line of file. Must be of the form: "number_of_seqs_in_file\t<###>" where <###> is the number of sequences in the file.'
        token, num_seqs = separate_seq_and_el_data(first_line)
        # NOTE(review): num_seqs appears to come back as a numeric (likely
        # float) value — the '% 1' check rejects non-integral values.
        if num_seqs % 1 != 0:
            raise ValueError('Number of sequences on first line must be an integer.')
        assert token == 'number_of_seqs_in_file', 'First line of the input file must be of the form: "number_of_seqs_in_file\t<###>" where <###> is the number of sequences in the file.'
        # Parse 2nd line of file containing info about length of seqs in file
        second_line = check_valid_line(f.readline())
        assert second_line != 'skip_line', 'Invalid second line of file. Must be of the form: "length_of_each_sequence\t<###>" where <###> is the length of every sequence in the file.'
        token, len_seqs = separate_seq_and_el_data(second_line)
        if len_seqs % 1 != 0:
            raise ValueError('Sequence length on second line must be an integer.')
        assert token == 'length_of_each_sequence', 'Second line of the input file must be of the form: "length_of_each_sequence\t<###>" where <###> is the length of every sequence in the file. Assumes homogeneity and/or padding of sequences.'
    return num_seqs, len_seqs
def check_oligonucleotide_flanks(seq_infile, scaffold_type):
    """
    Checks that all the oligonucleotide sequences in an input file
    consist of the same sequences that flank the variable 80-mer
    sequence. i.e. all pTpA sequences in the input file should be of the
    form:

    TGCATTTTTTTCACATC-(variable region)-GGTTACGGCTGTT

    whereas the input sequences measured in the Abf1TATA scaffold will
    be of the form:

    TCACGCAGTATAGTTC-(variable region)-GGTTTATTGTTTATAAAAA

    (NOTE(review): an earlier docstring stated the pTpA 3' flank as
    'GTTACGGCTGTT' — the code checks 'GGTTACGGCTGTT'; confirm against
    the published library design.)

    These flanking sequences are for in-lab sequencing purposes only, so
    can be discarded when the 80-mer variable sequences are inserted
    into a scaffold sequence.

    Args:
    -----
        seq_infile (str) -- the absolute path of the input file
        containing all of the oligonucleotide sequences to be checked,
        and their expression level values (tab separated).

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA) in
        which the expression levels for the sequences in the input file
        were measured.

    Returns:
    -----
        incorrect_lines (list) -- line numbers of sequences that contain
        incorrect flank sequences.
    """
    # Assertions
    assert isinstance(seq_infile, str), 'Absolute pathname must be passed as a string.'
    assert isinstance(scaffold_type, str), 'Scaffold type must be passed as a string.'
    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', 'Scaffold type must be specified as either pTpA or Abf1TATA.'
    # Functionality
    if scaffold_type == 'pTpA':
        flank_A = 'TGCATTTTTTTCACATC'
        flank_B = 'GGTTACGGCTGTT'
    else:  # 'Abf1TATA' — guaranteed by the assertion above
        flank_A = 'TCACGCAGTATAGTTC'
        flank_B = 'GGTTTATTGTTTATAAAAA'
    incorrect_lines = []
    # BUG FIX: the original left the input file handle open; a context
    # manager guarantees it is closed even if a helper raises.
    with smart_open(seq_infile, 'r') as infile:
        line_number = 0
        for line in infile:
            line_number += 1
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, _ = separate_seq_and_el_data(line)
            if not (seq.startswith(flank_A) and seq.endswith(flank_B)):
                incorrect_lines.append(line_number)
    return incorrect_lines
def pull_homogeneous_seqs(input_seqs, scaffold_type=None):
    """
    Pulls all sequences of the modal length (i.e. 110 bp for pTpA-type
    sequences and 115 bp for Abf1TATA-type) from an input file and
    writes them into an output file.

    Args:
    -----
        input_seqs (str) -- the absolute pathname of the input file
        containing all of the raw oligonucleotide sequences and their
        expression levels, tab separated.

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA, for
        which the modal lengths are known to be 110 and 115
        respectively) in which the expression levels were measured. If
        None, the modal length is computed from the file itself.
        Default: None.

    Returns:
    -----
        absolute_path (str) -- the absolute pathname of the output file
        containing the sequences of modal length.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Input file pathname must be a'
                                         'string.')
    assert os.path.isfile(input_seqs), 'Input file does not exist!'
    assert isinstance(scaffold_type, (str, type(None))), 'Scaffold type must be passed as a string.'
    if isinstance(scaffold_type, str):
        assert scaffold_type in ('pTpA', 'Abf1TATA'), 'Scaff type must be specified as either pTpA or Abf1TATA, or else unspecified (in which case it takes value of None).'
    # Functionality
    # Build a unique, time-stamped output path.
    stamp = get_time_stamp()
    if scaffold_type is None:
        rel_path = ('example/other_scaffolds/' + stamp
                    + '_homogeneous_seqs.txt')
    else:
        rel_path = ('example/' + scaffold_type + '_data/' + stamp + '_'
                    + scaffold_type + '_homogeneous_seqs.txt')
    absolute_path = os.path.join(ROOT_DIR, rel_path)
    # Modal length is known a priori for the two published scaffolds;
    # otherwise compute it by scanning the file.
    if scaffold_type == 'pTpA':
        modal_length = 110
    elif scaffold_type == 'Abf1TATA':
        modal_length = 115
    else:
        _, _, modal_length = get_max_min_mode_length_of_seqs(input_seqs)
    # Copy every modal-length sequence (with its EL) to the output file.
    source = smart_open(input_seqs, 'r')
    sink = smart_open(absolute_path, 'w')
    for raw_line in source:
        cleaned = check_valid_line(raw_line)
        if cleaned == 'skip_line':
            continue
        seq, level = separate_seq_and_el_data(cleaned)
        if len(seq) == modal_length:
            sink.write(seq + '\t' + str(level) + '\n')
    source.close()
    sink.close()
    return absolute_path
def encode_sequences_with_method(input_seqs, method='One-Hot',
                                 scale_els=True, model_type='1DCNN',
                                 binarized_els=False):
    """
    A wrapper function that encodes all of the sequences in an input
    file according to the specified method, and returns them in a numpy
    array, as well as returning the associated expression levels in a
    separate numpy array.

    Args:
    -----
        input_seqs (str) -- absolute path of the file containing all of
        the input sequences to be encoded, tab-separated with their
        associated expression levels. The first line of the file must be
        of the format: "number_of_seqs_in_file\t<###>" where <###> is
        the number of sequences in the file. The second line must be of
        the format: "length_of_each_sequence\t<###>" where <###> is the
        length of every sequence in the file. Assumes homogeneity and/or
        padding of sequences.

        method (str) -- the method by which the sequence should be
        encoded. Must choose from: 'One-Hot'. Default: 'One-Hot'

        scale_els (bool) -- if True (default), scales all of the
        expression levels in the output array exp_levels to between -1
        and 1, corresponding to the min and max values respectively.

        model_type (str) -- the type of model being used. Controls the
        shape of the returned encoded-sequence array. Must be one of:
        '1DCNN', '1DLOCCON', or 'LSTM'.

        binarized_els (bool) -- if True, casts the returned expression
        levels from float to int (for use when the ELs in the input file
        have already been binarized to 0/1). Default: False.

    Returns:
    -----
        encoded_seqs (numpy.ndarray) -- all the sequences in the input
        file, encoded with the specified method. For an input file of
        10000 sequences of length 257 and base-vector length 5
        (A,T,G,C,N), shapes are: '1DCNN'/'1DLOCCON' ===> (10000, 257, 5)
        and 'LSTM' ===> (10000, 1, 1285) where 1285 = 257 * 5.

        exp_levels (numpy.ndarray) -- the expression levels associated
        with the sequences, scaled to [-1, 1] if 'scale_els=True'.

        abs_max_el (float) -- the maximum absolute expression level
        value in the input file; None if 'scale_els=False'.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'TypeError: Input file path must be passed as a string.'
    # BUG FIX: removed duplicated word in the assertion message below
    # ('must be a a string' -> 'must be a string').
    assert isinstance(method, str), 'TypeError: Specified method must be a string.'
    assert method in METHODS, 'Must specify one the method of encoding the sequence. Choose one of: %s' % (METHODS)
    assert isinstance(scale_els, bool), 'scale_els argument must be passed as a bool.'
    assert isinstance(model_type, str), 'model_type argument must be passed as a string.'
    assert model_type in MODELS, 'Must specify model_type as one of the following: %s' % (MODELS)
    # Functionality
    # Open input file
    infile = smart_open(input_seqs, 'r')
    # Preallocate output arrays for speed; base vectors have length 5
    # (corresponding to bases A, T, G, C, N).
    num_seqs, len_seq = organize.get_num_and_len_of_seqs_from_file(input_seqs)
    encoded_seqs = np.zeros((int(num_seqs), int(len_seq), 5)).astype(int)
    exp_levels = np.zeros(int(num_seqs))
    # Encode sequences. Starting at -3 and incrementing before the check
    # means the 2 header lines map to negative indices and are skipped,
    # and the first data line lands at index 0.
    line_number = -3
    for line in infile:
        line_number += 1
        if line_number < 0:
            continue  # skip first 2 lines of the file
        line = check_valid_line(line)
        if line == 'skip_line':
            continue  # skip line if not a valid line
        seq, exp_level = separate_seq_and_el_data(line)
        # Encode with One-Hot method
        if method == 'One-Hot':
            try:
                encoded_seq = one_hot_encode_sequence(seq)
            except Exception:
                raise AssertionError('Error on line %s' % (line_number))
        else:
            # Placeholder for future encoding methods (e.g. embeddings);
            # unreachable today because METHODS constrains 'method'.
            pass
        # Assign encoded sequence and expression level to output arrays
        encoded_seqs[line_number] = encoded_seq
        exp_levels[line_number] = exp_level
    # Close the input file
    infile.close()
    # Reshape as input for an LSTM model.
    # BUG FIX: removed a redundant intermediate reshape(num_seqs, -1)
    # whose result was immediately overwritten by the reshape below.
    if model_type == 'LSTM':
        encoded_seqs = encoded_seqs.reshape(int(num_seqs), 1,
                                            (int(len_seq) * 5))
    # Scale expression level values to between -1 and 1
    if scale_els:
        abs_max_el = abs(max(exp_levels, key=abs))  # the absolute max value
        # numpy allows easy division of all elements at once
        exp_levels = exp_levels / abs_max_el
    else:
        # If no scaling required
        abs_max_el = None
    # If expression levels are binarized, convert them from float ---> int
    if binarized_els:
        exp_levels = exp_levels.astype(int)
    return encoded_seqs, exp_levels, abs_max_el
def remove_flanks_from_all_seqs(input_seqs, scaffold_type='pTpA'):
    """
    Strips the constant flanking regions from every sequence in an input
    file of sequences and their expression levels (tab separated),
    writing the results to a new file.

    Example input file:
    GSE104878_20160609_average_promoter_ELs_per_seq_pTpA_ALL.
    shuffled.txt.gz from
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE104878

    Args:
    -----
        input_seqs (str) -- the absolute pathname of the file containing
        all of the input sequences and their expression levels (tab
        separated).

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA) that
        the input sequences had their expression levels measured in.

    Returns:
    -----
        absolute_path (str) -- the absolute path for the output file
        containing all of the sequences with their flanks removed, along
        with their expression levels (tab separated).
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Input file pathname must be passed as a string.'
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    assert isinstance(scaffold_type, str), 'Scaffold type must be passed as a string.'
    assert scaffold_type in ('pTpA', 'Abf1TATA'), 'Input scaffold type must be either pTpA or Abf1TATA.'
    # Every sequence must carry identical flanks before we strip them.
    incorrect = organize.check_oligonucleotide_flanks(input_seqs,
                                                      scaffold_type)
    assert len(incorrect) == 0, 'Not all sequences in input file have same flanking sequences. Error on line %s' % str(incorrect)
    # Functionality
    # Unique, time-stamped output location.
    stamp = get_time_stamp()
    rel_path = ('example/' + scaffold_type + '_data/' + stamp + '_'
                + scaffold_type + '_seqs_flanks_removed.txt')
    absolute_path = os.path.join(ROOT_DIR, rel_path)
    # Strip flanks line by line, writing as we go.
    source = smart_open(input_seqs, 'r')
    sink = smart_open(absolute_path, 'w')
    for raw_line in source:
        cleaned = check_valid_line(raw_line)
        if cleaned == 'skip_line':
            continue
        seq, level = separate_seq_and_el_data(cleaned)
        stripped = remove_flanks_from_seq(seq, scaffold_type)
        sink.write(stripped + '\t' + str(level) + '\n')
    source.close()
    sink.close()
    return absolute_path
def pad_sequences(input_seqs, pad_front=False, extra_padding=0):
    """
    Pads sequences in an input file to the length of the longest
    sequence in the file, plus any extra padding if specified. Pads the
    sequences at either the front or the back, with 'P' characters.

    (BUG FIX: the docstring previously said padding used 'N' characters,
    but the code pads with 'P' — 'N' appears to be reserved for unknown
    bases in the one-hot base vector (A,T,G,C,N); confirm the intended
    pad character against the encoder.)

    Args:
    -----
        input_seqs (str) -- the absolute path of the input file
        containing the sequences to be padded and their associated
        expression levels, tab separated.

        pad_front (bool) -- If True, will add padding to the front of
        the sequences. If False (default) pads sequences at the end
        (i.e. the RHS of the sequences).

        extra_padding (int) -- The number of extra pad characters to add
        onto the front/back of the sequence.

    Returns:
    -----
        absolute_path (str) -- the absolute path of the output file
        containing all of the padded sequences and their associated
        expression levels, tab separated.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Pathname of input file must be passed as a string.'
    assert os.path.exists(input_seqs), 'File does not exist.'
    assert isinstance(pad_front, bool), 'The pad_front variable must be passed as a bool.'
    assert isinstance(extra_padding, int), 'The amount of extra padding must be passed as an integer.'
    assert extra_padding >= 0, 'The amount of extra padding must be passed as a non-negative integer.'
    # Functionality
    # Determine target length before creating the output file, so a
    # failure here does not leave an empty output file behind.
    max_length, _, _ = organize.get_max_min_mode_length_of_seqs(input_seqs)
    pad_length = max_length + extra_padding
    # Define the output file path
    absolute_path = input_seqs.replace('.txt', '_padded.txt')
    # BUG FIX: the output handle is now managed by a context manager so
    # it is closed even if an exception is raised mid-processing.
    with smart_open(absolute_path, 'w') as outfile:
        with smart_open(input_seqs) as f:
            for line in f:
                line = check_valid_line(line)
                if line == 'skip_line':
                    continue
                seq, exp_level = separate_seq_and_el_data(line)
                difference = pad_length - len(seq)
                if difference == 0:  # No need for padding
                    padded_seq = seq
                else:  # Need to pad
                    padding_seq = 'P' * difference
                    if pad_front:
                        padded_seq = padding_seq + seq
                    else:  # pad the end of the sequence
                        padded_seq = seq + padding_seq
                outfile.write(padded_seq + '\t' + str(exp_level) + '\n')
    return absolute_path
def insert_all_seq_into_one_scaffold(input_seqs, scaffold_type='pTpA'):
    """
    Takes an input file containing N sequences and inserts them into a
    single scaffold sequence, outputting the N unique promoter sequences
    to an output file along with their expression levels (tab
    separated).

    Args:
    -----
        input_seqs (str) -- the absolute path for the input file
        containing all the oligonucleotide sequences to be inserted into
        the single scaffold sequence. All sequences must be of the same
        length as the scaffold variable region.

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA) that
        the input sequences had their expression levels measured in.
        Default: 'pTpA'.

    Returns:
    -----
        absolute_path (str) -- the absolute path for the output file
        containing all of the complete promoter sequences (where each
        input sequence has been inserted into the scaffold sequence).
    """
    # Assertions
    assert isinstance(input_seqs, str), 'TypeError: pathname for input file must be a string.'
    assert isinstance(scaffold_type, str), 'Scaffold type must be passed as a string.'
    assert scaffold_type in ('pTpA', 'Abf1TATA'), 'Scaffold type must either be passed as "pTpA" or "Abf1TATA".'
    # Functionality
    # Unique, time-stamped output location.
    stamp = get_time_stamp()
    out_rel = ('example/' + scaffold_type + '_data/' + stamp + '_'
               + scaffold_type + '_seqs_inserted_into_scaffold.txt')
    absolute_path = os.path.join(ROOT_DIR, out_rel)
    # Location of the single scaffold sequence for this scaffold type.
    scaff_rel = ('example/' + scaffold_type + '_data/' + scaffold_type
                 + '_scaffold.txt')
    scaff_abs = os.path.join(ROOT_DIR, scaff_rel)
    with smart_open(input_seqs, 'r') as source, \
            smart_open(absolute_path, 'w') as sink:
        # The scaffold file holds one sequence on its first line.
        with smart_open(scaff_abs, 'r') as scaff_handle:
            scaffold = scaff_handle.readline().replace('\n', '')
        # Insert each input sequence and write the complete promoter.
        for raw_line in source:
            cleaned = check_valid_line(raw_line)
            if cleaned == 'skip_line':
                continue
            seq, level = separate_seq_and_el_data(cleaned)
            promoter = insert_seq_into_scaffold(seq, scaffold)
            sink.write(promoter + '\t' + str(level) + '\n')
    return absolute_path
def process_raw_data(input_seqs, scaffold_type=None, percentile=None,
                     binarize_els=True, homogeneous=False, deflank=True,
                     insert_into_scaffold=True, extra_padding=0,
                     pad_front=False, report_loss=True, report_times=True,
                     remove_files=True, create_sample_of_size=None):
    """
    A wrapper function that:
    Takes raw data as retrieved from Carl de Boer's publication at
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE104878, and
    processes the sequences according to the custom arguments, pads them
    to same length, and writes them to an output file along with their
    expression levels (tab separated). The start of the file contains
    header lines specifying the number of sequences in the file and the
    lengths of the padded sequences.

    Args:
    -----
        input_seqs (str) -- the absolute pathname of the file that
        contains all of the input sequences and their expression levels
        (tab separated).

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA) that
        the input sequences had their expression levels measured in.

        percentile (float) -- the proportion of the raw input data to
        extract from the sequences with the highest and lowest
        expression levels. i.e. if 'percentile=0.1' then the top 10 % of
        sequences with highest expression levels, and the bottom 10 % of
        sequences with lowest expression levels will be extracted from
        the raw input data. The resulting data file will contain ~ 20 %
        of the data in the raw input data.

        binarize_els (bool) -- if (and only if) a 'percentile' value is
        passed, this argument determines whether the expression level
        values (ELs) will be binarized or not. If True (default),
        sequences with ELs in the top percentile will have their ELs
        binarized to 1, and sequences with ELs in the bottom percentile
        will have their ELs binarized to 0.

        homogeneous (bool) -- if True, only sequences of modal length
        will be processed. If False, all sequences will be processed
        regardless of length. Default: False.

        deflank (bool) -- if True, removes the constant flanking regions
        of the input sequences. Default: True.

        insert_into_scaffold (bool) -- if True inserts the input
        sequences into the appropriate scaffold. If False, the sequences
        are encoded as they are. Default: True.

        extra_padding (int) -- the number of 'P' characters greater than
        the maximum sequence length to pad each sequence to. Default: 0.

        pad_front (bool) -- whether to pad out the front (left hand
        side) or end (right hand side) of the sequences. If True, pads
        the front. Default: False (will pad the end).

        report_loss (bool) -- if True, reports the number of lines of
        data lost at each step in the process. Default: True.
        (NOTE(review): an earlier docstring said Default: False, which
        disagrees with the signature.)

        report_times (bool) -- if True, reports the time each step in
        the cleaning process takes. Default: True. (NOTE(review): an
        earlier docstring said Default: False, which disagrees with the
        signature.)

        remove_files (bool) -- if True, will remove intermediate files
        created in the process of processing raw data. Default: True.
        (NOTE(review): an earlier docstring said Default: False, which
        disagrees with the signature.)

        create_sample_of_size (int) -- if a number is passed, a sample
        of this size will be taken pseudo-randomly from the file
        containing processed data, and written to a separate file.

    Returns:
    -----
        processed_data (str) -- the absolute path for the file
        containing processed sequences along with their expression
        levels.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Input file path name must be '
                                         'passed as a string.')
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    assert isinstance(scaffold_type, str), ('Scaffold type must be passed as '
                                            'a string if specified.')
    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', 'Scaffold type must be specified as either "pTpA" or "Abf1TATA".'
    assert isinstance(percentile, (float, type(None))), ('The "percentile" '
                                                         'argument must be '
                                                         'passed as a float.')
    if percentile is not None:
        assert percentile < 0.5, '"percentile" must be less that 0.5'
    assert isinstance(homogeneous, bool), ('The homogeneous argument must be '
                                           'passed as a bool.')
    assert isinstance(deflank, bool), ('The deflank argument must be passed '
                                       'as a bool.')
    assert isinstance(insert_into_scaffold, bool), ('insert_into_scaffold '
                                                    'argument must be passed '
                                                    'as a bool.')
    assert isinstance(extra_padding, int), ('The number of extra vectors to '
                                            'pad each sequence by should be '
                                            'passed as an integer.')
    assert extra_padding >= 0, ('extra_padding must be passed as a non-'
                                'negative integer.')
    assert isinstance(pad_front, bool), ('The pad_front argument must be '
                                         'passed as a bool.')
    assert isinstance(report_loss, bool), ('The report_loss argument must be '
                                           'passed as a bool.')
    assert isinstance(report_times, bool), ('The report_times argument must '
                                            'be passed as a bool.')
    assert isinstance(remove_files, bool), ('The remove_files argument must '
                                            'be passed as a bool.')
    if create_sample_of_size is not None:
        assert isinstance(create_sample_of_size, int), ('Sample size must be '
                                                        'passed as an int')
    # Functionality
    print('Starting processing of raw data...')
    raw_data = input_seqs  # remember the original path to detect no-op runs
    # Define final output file path; descriptive suffixes are appended
    # to 'processed_data' as each processing step runs.
    time_stamp = get_time_stamp()
    relative_path = 'example/processed_data/' + time_stamp
    processed_data = os.path.join(ROOT_DIR, relative_path)
    # Create log file to write reports to
    if report_loss or report_times:
        report = smart_open(processed_data + '_process_report' + '.txt', 'w')
    # Initialize custom operations if specified (i.e loss + timing reports)
    if report_loss:
        loss_report = {}
        loss_report['Raw Data'] = get_seq_count(input_seqs)
    if report_times:
        t_init = t.time()
        t0 = t_init
    if remove_files:
        created_files = []  # keep track of the intermediate files created.
    # Pull out the top and bottom percentiles of data
    if percentile is not None:
        print('Pulling out the top and bottom percentiles...')
        df = organize.sort_by_exp_level(input_seqs)
        df = organize.discard_mid_data(df, percentile=percentile)
        processed_data += '_percentiles'
        if binarize_els:
            print('Binarizing expression levels...')
            df = organize.binarize_data(df)
            processed_data += '_els_binarized'
        # 'input_seqs' is rebound to each intermediate file in turn.
        input_seqs = organize.write_df_to_file(df)
        if report_loss:
            loss_report['Percentile Seqs'] = get_seq_count(input_seqs)
        if report_times:
            t1 = t.time()
            text = '\tFile created in %s s' % (t1 - t0)
            print(text)
            report.write('Top & bottom percentiles pulled...\n' + text + '\n')
            t0 = t1
        if remove_files:
            created_files.append(input_seqs)
    # Create new file of only homogeneous (same length) seqs
    if homogeneous:
        print('Pulling homogeneous sequences from input file...')
        input_seqs = organize.pull_homogeneous_seqs(input_seqs,
                                                    scaffold_type)
        processed_data += '_homogeneous'
        if report_loss:
            loss_report['Homogeneous Seqs'] = get_seq_count(input_seqs)
        if report_times:
            t1 = t.time()
            text = '\tFile created in %s s' % (t1 - t0)
            print(text)
            report.write('Homogeneous sequences pulled...\n' + text + '\n')
            t0 = t1
        if remove_files:
            created_files.append(input_seqs)
    # Remove all of the flanking regions from the input sequences
    if deflank:
        print('Removing flank regions from sequences...')
        input_seqs = build.remove_flanks_from_all_seqs(input_seqs,
                                                       scaffold_type)
        processed_data += '_deflanked'
        if report_loss:
            loss_report['Deflanked Seqs'] = get_seq_count(input_seqs)
        if report_times:
            t1 = t.time()
            text = '\tFile created in %s s' % (t1 - t0)
            print(text)
            report.write('Sequences deflanked...\n' + text + '\n')
            t0 = t1
        if remove_files:
            created_files.append(input_seqs)
    processed_data += '_sequences'
    # Insert sequences into appropriate scaffold
    if insert_into_scaffold:
        print('Inserting sequences into %s scaffold...'
              % (scaffold_type))
        input_seqs = build.insert_all_seq_into_one_scaffold(
            input_seqs, scaffold_type)
        processed_data += '_inserted_into_%s_scaffold' % (scaffold_type)
        if report_loss:
            loss_report['Scaffold-Inserted Seqs'] = get_seq_count(input_seqs)
        if report_times:
            t1 = t.time()
            text = '\tFile created in %s s' % (t1 - t0)
            print(text)
            report.write('Seqs inserted into ' + scaffold_type +
                         'scaffold...\n')
            report.write(text + '\n')
            t0 = t1
        if remove_files:
            created_files.append(input_seqs)
    # Pad sequences (skipped only when already homogeneous and no extra
    # padding was requested).
    if homogeneous and extra_padding == 0:
        pass
    else:
        print('Padding sequences...')
        input_seqs = build.pad_sequences(input_seqs, pad_front=pad_front,
                                         extra_padding=extra_padding)
        if not homogeneous:  # then they will have been padded
            processed_data += '_padded_at'
            if pad_front:
                processed_data += '_front'
            else:
                processed_data += '_back'
        if extra_padding != 0:
            processed_data += '_%s_extra' % (extra_padding)
        if report_loss:
            loss_report['Padded Seqs'] = get_seq_count(input_seqs)
        if report_times:
            t1 = t.time()
            text = '\tFile created in %s s' % (t1 - t0)
            print(text)
            report.write('Padded sequences...\n')
            report.write(text + '\n')
            t0 = t1
    # Remove intermediate files created in the process
    if remove_files:
        created_files.append(input_seqs)
    # Rename the final output file to reflect how data has been cleaned.
    processed_data += '_with_exp_levels.txt'
    # Report end of process and print final output file locations.
    if input_seqs != raw_data:  # i.e. if data has been processed in some way
        os.rename(input_seqs, processed_data)
        # Report end of process and print absolute path of processed data.
        text = ('\nRaw data successfully processed.\nLocation: %s\n'
                % (processed_data))
        print(text)
        if report_loss or report_times:
            report.write(text)
    else:  # If no processing was performed.
        text = '\nNo processing performed.\n'
        text += 'Change processing specifications and try again.'
        print(text)
        # NOTE(review): these report.write calls are not guarded by
        # 'report_loss or report_times' — if both flags are False,
        # 'report' is undefined here and this raises NameError. Confirm
        # and guard if that combination is a supported call.
        report.write(text + '\n')
        text = 'Raw data remains unchanged.'
        print(text)
        report.write(text + '\n')
        text = 'Location : %s' % (raw_data)
        print(text)
        if report_loss or report_times:
            report.write(text + '\n')
    # Write the number of seqs and length of seqs to the start of file
    organize.write_num_and_len_of_seqs_to_file(processed_data)
    # Report loss
    if report_loss:
        report.write('\nLine counts at each step of the process:\n')
        for category in loss_report.keys():
            curr_count = loss_report[category]
            if category == 'Raw Data':
                report.write('\t%s : %s\n' % (category, curr_count))
                prev_count = curr_count
            else:
                report.write('\t%s : %s (%s lines lost since last step)\n'
                             % (category, curr_count,
                                (prev_count - curr_count)))
                prev_count = curr_count
    # Remove intermediate files
    if remove_files:
        print('\nRemoving intermediate files...')
        organize.remove_file_list(created_files)
        print('Files successfully removed.')
    print('Process complete.')
    # Report total time taken
    if report_times:
        t_final = t.time()
        text = '\nTotal processing time : %s s' % (t_final - t_init)
        print(text)
        report.write(text)
        print('Please find the process report in the same directory as the'
              ' output file for reports of data losses and timings.')
    if report_times or report_loss:
        report.close()
    # Create sample data
    if create_sample_of_size is not None:
        size = create_sample_of_size
        print('\n\nCreating sample of size %s ...' % str(size))
        sample_seqs = organize.create_sample_data(processed_data, size)
        print('\nSample data successfully created.')
        print('\nLocation: %s \n' % (sample_seqs))
    return processed_data