Exemple #1
0
def create_sample_data(input_seqs, sample_size):
    """
    Takes a random sample of size 'sample_size' (without replacement)
    from an input file containing sequences and their associated
    expression levels, and writes the sampled lines to a separate
    file located next to the input file. The output file name is the
    input file name prefixed with '<sample_size>_from_'. The first
    2 lines of the resulting output file will be of the format:
    "
    number_of_seqs_in_file\t<###>
    length_of_each_sequence\t<$$$>
    "
    where '<###>' is the number of sequences in the file, and
    '<$$$>'is the length to which every sequence in the file is
    padded.

    The first 2 lines of the input file are treated as info lines
    and are never sampled.

    Args:
    -----
        input_seqs (str) -- the absolute path of the input file
        containing sequence and expression level data to sample.

        sample_size (int) -- the number of samples to take from
        the input file.

    Returns:
    -----
        sample_data (str) -- the absolute path of the output file
        containing the sample of sequence and expression level
        data.

    Raises:
    -----
        AssertionError -- if an argument has the wrong type, the
        input file does not exist, or 'sample_size' is not smaller
        than the number of sequences reported by get_seq_count.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Input sequences file path must be\
    passed as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    assert isinstance(sample_size, int), 'Number of sequences to sample must\
    be passed as an integer.'

    assert sample_size < get_seq_count(input_seqs), 'Sample size must be\
    smaller than the number of sequences in the input file.'

    # Functionality
    # Define output file path: same directory, file name gets a
    # '<sample_size>_from_' prefix.
    index = input_seqs.rfind('/') + 1
    insert = str(sample_size) + '_from_'
    sample_seqs = input_seqs[:index] + insert + input_seqs[index:]
    # Read every data line (everything after the 2 info lines).
    with smart_open(input_seqs, 'r') as inf:
        inf.readline()
        inf.readline()  # skip the first 2 info lines
        all_lines = inf.readlines()
    # One draw without replacement is already a uniform random sample;
    # repeating the draw and discarding the earlier results adds nothing.
    lines = random.sample(all_lines, sample_size)
    with smart_open(sample_seqs, 'w') as g:
        for line in lines:
            g.write(line)
    # Write number and length of sequence info to top of resulting file
    write_num_and_len_of_seqs_to_file(sample_seqs)

    return sample_seqs
Exemple #2
0
def write_num_and_len_of_seqs_to_file(input_seqs):
    """
    Rewrites an input file in place so that it starts with two
    info lines giving the number of sequences in the file and the
    length of the sequences. Assumes sequences have been processed
    so that all sequences have been padded to the same length (the
    length is taken from the first line only). After the call, the
    first 2 lines of the file are:
    "
    number_of_seqs_in_file\t<###>
    length_of_each_sequence\t<$$$>
    "
    where '<###>' is the number of sequences in the file, and
    '<$$$>' is the length to which every sequence in the file is
    padded. The previous contents follow unchanged.

    Args:
    -----
        input_seqs (str) -- the absolute path of the processed
        input sequences to extract information from.

    Returns:
    -----
        None

    Raises:
    -----
        AssertionError -- if the path is not a string, the file does
        not exist, or the first line of the file is not valid.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Absolute pathname must be passed\
    as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    seq_total = get_seq_count(input_seqs)
    # The first line alone determines the reported sequence length.
    with smart_open(input_seqs, 'r') as handle:
        first_line = check_valid_line(handle.readline())
        if first_line == 'skip_line':
            raise AssertionError('First line is not valid.')
        first_seq, _ = separate_seq_and_el_data(first_line)
        seq_length = len(first_seq)
    # Keep the current contents in memory so they can be written back
    # after the header.
    with smart_open(input_seqs, 'r+') as handle:
        body = handle.read()
    with smart_open(input_seqs, 'w+') as handle:
        header = ''.join([
            'number_of_seqs_in_file\t', str(seq_total), '\n',
            'length_of_each_sequence\t', str(seq_length), '\n',
        ])
        # Paths ending in '.gz' get a bytes header so that it can be
        # concatenated with the contents read above.
        if input_seqs.endswith('.gz'):
            header = header.encode()
        handle.write(header + body)

    return
Exemple #3
0
def get_max_min_mode_length_of_seqs(input_seqs):
    """
    Returns the maximum, minimum, and modal length of the sequences
    in a file containing input sequences. Lines that check_valid_line
    reports as 'skip_line' are ignored.

    Args:
    -----
        input_seqs (str) -- the absolute path of the file
        containing the input sequences and their expression levels,
        tab separated.

    Returns:
    -----
        max_length (int) -- the length of the longest sequence in
        the input file.

        min_length (int) -- the length of the shortest sequence in
        the input file.

        modal_length (int) -- the most common sequence length of
        the sequences in the input file.

    Raises:
    -----
        AssertionError -- if the path is not a string or the file
        does not exist.

        ValueError -- if the file contains no valid sequence lines.
    """
    # Local import so the function does not depend on the file's
    # top-level import block.
    from collections import Counter

    # Assertions
    assert isinstance(input_seqs, str), 'Path name for input file must be \
    passed as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    seq_lengths = []
    # The context manager guarantees the file is closed even if a
    # helper raises part-way through the file.
    with smart_open(input_seqs, 'r') as infile:
        for line in infile:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, _ = separate_seq_and_el_data(line)
            seq_lengths.append(len(seq))
    if not seq_lengths:
        raise ValueError('No valid sequences found in input file.')
    max_length = max(seq_lengths)
    min_length = min(seq_lengths)
    # Count every length once up front instead of rescanning the whole
    # list for each distinct length. Iterating the same set as before
    # keeps tie-breaking between equally common lengths unchanged.
    length_counts = Counter(seq_lengths)
    modal_length = max(set(seq_lengths), key=length_counts.__getitem__)

    return max_length, min_length, modal_length
Exemple #4
0
def sort_by_exp_level(input_seqs):
    """
    Loads a file of sequences (tab separated from their expression
    levels) into a data frame and orders the rows by expression
    level, highest first. If the first two lines of the file are the
    'number_of_seqs_in_file' / 'length_of_each_sequence' info lines,
    they are excluded from the data frame.

    Args:
    -----
        input_seqs (str) -- the absolute path of the input file
        containing sequences to be sorted by expression level.

    Returns:
    -----
        sorted_df (pandas.DataFrame) -- a data frame with columns
        'seq' and 'el' whose rows are sorted in descending order
        based on expression level, with a fresh 0..n-1 index.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Path name for input file must be \
    passed as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    # Tokens that mark the two optional info lines at the top of a file.
    header_tokens = ('number_of_seqs_in_file', 'length_of_each_sequence')
    with smart_open(input_seqs, 'r') as handle:
        first_token, _ = separate_seq_and_el_data(
            check_valid_line(handle.readline()))
        second_token, _ = separate_seq_and_el_data(
            check_valid_line(handle.readline()))
    # Skip the info lines only when both are present, in order.
    rows_to_skip = 2 if (first_token, second_token) == header_tokens else 0
    # Import data into a pandas data frame
    frame = pd.read_csv(input_seqs, sep='\t', names=['seq', 'el'],
                        skiprows=rows_to_skip)
    # Sort on expression level and renumber the rows from zero.
    sorted_df = frame.sort_values('el', ascending=False)
    sorted_df = sorted_df.reset_index(drop=True)

    return sorted_df
Exemple #5
0
def get_num_and_len_of_seqs_from_file(input_seqs):
    """
    Reads the two info lines at the top of a processed sequence file
    and returns the values they carry. The file must start with:
    "
    number_of_seqs_in_file\t<###>
    length_of_each_sequence\t<$$$>
    "
    where '<###>' is the number of sequences in the file, and
    '<$$$>' is the length to which every sequence in the file is
    padded. Only these two lines are read; the sequences themselves
    are not inspected.

    Args:
    -----
        input_seqs (str) -- the absolute path of the processed
        input sequences to extract information from.

    Returns:
    -----
        num_seqs -- the number of sequences in input_seqs, exactly
        as parsed by separate_seq_and_el_data (whole-valued, but not
        converted to int here).

        len_seqs -- the length of the all the padded sequences in
        the input file, as parsed by separate_seq_and_el_data
        (whole-valued, but not converted to int here).

    Raises:
    -----
        AssertionError -- if the path is invalid, or either info line
        is invalid or carries the wrong label.

        ValueError -- if either value is not whole-valued.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Absolute pathname must be passed\
    as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    with smart_open(input_seqs, 'r') as handle:
        # --- Line 1: how many sequences the file holds ---
        count_line = check_valid_line(handle.readline())
        assert count_line != 'skip_line', 'Invalid first line of file. Must\
        be of the form: "number_of_seqs_in_file\t<###>" where <###> is the\
        number of sequences in the file.'

        count_label, num_seqs = separate_seq_and_el_data(count_line)
        # A non-zero remainder means the value has a fractional part.
        if num_seqs % 1:
            raise ValueError('Number of sequences on first line must be\
            an integer.')
        assert count_label == 'number_of_seqs_in_file', 'First line of the input\
        file must be of the form: "number_of_seqs_in_file\t<###>" where\
        <###> is the number of sequences in the file.'

        # --- Line 2: the shared (padded) sequence length ---
        length_line = check_valid_line(handle.readline())
        assert length_line != 'skip_line', 'Invalid second line of file.\
        Must be of the form: "length_of_each_sequence\t<###>" where <###> is\
        the length of every sequence in the file.'

        length_label, len_seqs = separate_seq_and_el_data(length_line)
        if len_seqs % 1:
            raise ValueError('Sequence length on second line must be an\
            integer.')
        assert length_label == 'length_of_each_sequence', 'Second line of the input\
        file must be of the form: "length_of_each_sequence\t<###>" where\
        <###> is the length of every sequence in the file. Assumes\
        homogeneity and/or padding of sequences.'

    return num_seqs, len_seqs
Exemple #6
0
def check_oligonucleotide_flanks(seq_infile, scaffold_type):
    """
    Checks that all the oligonucleotide sequences in an input file
    start and end with the flanking sequences expected for the given
    scaffold type. i.e. sequences measured in the pTpA scaffold
    should be of the form:
    TGCATTTTTTTCACATC-(variable region)-GGTTACGGCTGTT
    Whereas the input sequences measured in the Abf1TATA scaffold
    should be of the form:
    TCACGCAGTATAGTTC-(variable region)-GGTTTATTGTTTATAAAAA
    These flanking sequences are for in-lab sequencing purposes only,
    so can be discarded when the variable sequences are inserted
    into a scaffold sequence.

    Args:
    -----
        seq_infile (str) -- the absolute path of the input file
        containing all of the oligonucleotide sequences to be
        checked, and their expression level values (tab separated).

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA)
        in which the expression levels for the sequences in the
        input file were measured.

    Returns:
    -----
        incorrect_lines (list) -- a list of 1-based line numbers of
        the sequences that contain incorrect flank sequences. Lines
        skipped as invalid still count towards the numbering.
    """
    # Assertions
    assert isinstance(seq_infile, str), 'Absolute pathname must be passed \
    as a string.'

    assert isinstance(scaffold_type, str), 'Scaffold type must be passed as a\
    string.'

    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', 'Scaffold \
    type must be specified as either pTpA or Abf1TATA.'

    # Functionality
    if scaffold_type == 'pTpA':
        flank_A = 'TGCATTTTTTTCACATC'
        flank_B = 'GGTTACGGCTGTT'
    elif scaffold_type == 'Abf1TATA':
        flank_A = 'TCACGCAGTATAGTTC'
        flank_B = 'GGTTTATTGTTTATAAAAA'
    incorrect_lines = []
    # The context manager closes the file on every exit path; the
    # handle was previously never closed.
    with smart_open(seq_infile, 'r') as infile:
        for line_number, line in enumerate(infile, start=1):
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, _ = separate_seq_and_el_data(line)
            if not (seq.startswith(flank_A) and seq.endswith(flank_B)):
                incorrect_lines.append(line_number)

    return incorrect_lines
Exemple #7
0
def pull_homogeneous_seqs(input_seqs, scaffold_type=None):
    """
    Pulls all sequences of the modal length (i.e. 110 bp for pTpA-type
    sequences and 115 bp for Abf1TATA-type) from an input file and
    writes them into a new, time-stamped output file under the
    'example/' directory of ROOT_DIR.

    Args:
    -----
        input_seqs (str) -- the absolute pathname of the input file
        containing all of the raw oligonucleotide sequences and
        their expression levels, tab separated.

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA
        for which the modal length is known to be 110 and 115
        respectively) in which the expression levels for the
        sequences in the input file were measured. If None, the
        modal length is calculated from the input file. Default: None.

    Returns:
    -----
        absolute_path (str) -- the absolute pathname of the output
        file containing the sequences of modal length.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Input file pathname must be a '
                                         'string.')
    assert os.path.isfile(input_seqs), 'Input file does not exist!'
    assert isinstance(scaffold_type, (str, type(None))), 'Scaffold type must\
    be passed as a string.'

    if isinstance(scaffold_type, str):
        assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', 'Scaff\
        type must be specified as either pTpA or Abf1TATA, or else\
        unspecified (in which case it takes value of None).'

    # Functionality
    # Defining the path name of the output file.
    relative_path = 'example/'
    time_stamp = get_time_stamp()
    if scaffold_type is None:
        relative_path += ('other_scaffolds/' + time_stamp +
                          '_homogeneous_seqs.txt')
    else:
        relative_path += (scaffold_type + '_data/' + time_stamp + '_' +
                          scaffold_type + '_homogeneous_seqs.txt')
    absolute_path = os.path.join(ROOT_DIR, relative_path)
    # Retrieve modal length for sequences in input file. This is done
    # before the output file is opened so that a failure here does not
    # leave an empty output file behind.
    if scaffold_type == 'pTpA':
        modal_length = 110
    elif scaffold_type == 'Abf1TATA':
        modal_length = 115
    else:
        _, _, modal_length = get_max_min_mode_length_of_seqs(input_seqs)
    # Find seqs in input file w/ modal length and write them to output
    # file. Both handles are closed on every exit path.
    with smart_open(input_seqs, 'r') as infile, \
            smart_open(absolute_path, 'w') as output_seqs:
        for line in infile:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, exp_level = separate_seq_and_el_data(line)
            if len(seq) == modal_length:
                output_seqs.write(seq + '\t' + str(exp_level) + '\n')

    return absolute_path
Exemple #8
0
def encode_sequences_with_method(input_seqs,
                                 method='One-Hot',
                                 scale_els=True,
                                 model_type='1DCNN',
                                 binarized_els=False):
    """
    A wrapper function that encodes all of the sequences in an
    input file according to the specified method, and returns
    them in a numpy array, as well as returning the associated
    expression levels in a separate numpy array.

    Args:
    -----
        input_seqs (str) -- absolute path of the file containing
        all of the input sequences to be encoded, tab-separated
        with their associated expression levels. The first line of
        the file must be the number of sequences in the file, of
        the format: "number_of_seqs_in_file\t<###>" where <###> is
        the number of sequences in the file. The second line in the
        file must be the length to which all sequences are padded,
        of the format: "length_of_each_sequence\t<###>" where <###>
        is the length of every sequence in the file. Assumes
        homogeneity and/or padding of sequences.

        method (str) -- the method by which the sequence should be
        encoded. Must be a member of METHODS; only 'One-Hot' is
        implemented here. Default: 'One-Hot'

        scale_els (bool) -- if True (default), divides all of the
        expression levels by the largest absolute expression level,
        so that the returned values lie between -1 and 1.

        model_type (str) -- the type of model being used. Controls
        the shape of the returned array that contains the encoded
        sequences. Must be a member of MODELS. Only 'LSTM' changes
        the shape (see Returns). Default: '1DCNN'.

        binarized_els (bool) -- if True, the expression levels are
        converted to integers (after any scaling) before being
        returned. Default: False.

    Returns:
    -----
        encoded_seqs (numpy.ndarray) -- integer array holding every
        encoded sequence. Shape depends on 'model_type'. For
        example, for an input file containing 10000 sequences, each
        of length 257, with 5 channels per base, the shapes are:
        'LSTM'          ===> (10000, 1, 1285) where 1285=257*5
        any other model ===> (10000, 257, 5)

        exp_levels (numpy.ndarray) -- the expression levels
        associated with the sequences. Scaled to between -1 and 1
        if 'scale_els=True'; integers if 'binarized_els=True'.

        abs_max_el (float) -- the largest absolute expression level
        in the input file if 'scale_els=True', otherwise None.

    Raises:
    -----
        AssertionError -- on invalid arguments, or if a sequence
        cannot be one-hot encoded.

        NotImplementedError -- if 'method' is accepted by METHODS
        but has no encoder implemented in this function.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'TypeError: Input file path must be \
    passed as a string.'

    assert isinstance(method, str), 'TypeError: Specified method must be a \
    a string.'

    # The constants are wrapped in 1-tuples so that formatting the
    # message cannot fail if METHODS / MODELS are themselves tuples.
    assert method in METHODS, 'Must specify one the method of encoding the \
    sequence. Choose one of: %s' % (METHODS,)
    assert isinstance(scale_els, bool), 'scale_els argument must be passed\
    as a bool.'

    assert isinstance(model_type, str), 'model_type argument must be passed\
    as a string.'

    assert model_type in MODELS, 'Must specify model_type as one of the\
    following: %s' % (MODELS,)
    # Functionality
    # Initialize output arrays, preallocating dimensions for speed.
    num_seqs, len_seq = organize.get_num_and_len_of_seqs_from_file(input_seqs)
    num_seqs, len_seq = int(num_seqs), int(len_seq)
    encoded_seqs = np.zeros((num_seqs, len_seq, 5), dtype=int)
    exp_levels = np.zeros(num_seqs)
    # Encode sequences. The context manager closes the file even when
    # encoding fails part-way through.
    with smart_open(input_seqs, 'r') as infile:
        # Numbering starts at -2 so that the two info lines get negative
        # numbers and the first data line lands on index 0.
        # NOTE(review): data lines skipped as invalid still advance the
        # index, leaving an all-zero row at that position -- confirm
        # that this is intended.
        for line_number, line in enumerate(infile, start=-2):
            if line_number < 0:
                continue  # skip first 2 lines of the file
            line = check_valid_line(line)
            if line == 'skip_line':
                continue  # skip line if not a valid line
            seq, exp_level = separate_seq_and_el_data(line)
            # Encode with One-Hot method
            if method == 'One-Hot':
                try:
                    encoded_seq = one_hot_encode_sequence(seq)
                except Exception:
                    raise AssertionError('Error on line %s' % (line_number))
            else:
                # Fail loudly instead of falling through with an
                # undefined (or stale) encoded sequence.
                raise NotImplementedError(
                    'Encoding method %s is not implemented.' % method)
            # Assign encoded sequences and expression levels to outputs
            encoded_seqs[line_number] = encoded_seq
            exp_levels[line_number] = exp_level
    # Reshape array if needed as input to LSTM model: each sequence
    # becomes a single time step holding the flattened encoding.
    if model_type == 'LSTM':
        encoded_seqs = encoded_seqs.reshape(num_seqs, 1, len_seq * 5)
    # Scale expression level values to between -1 and 1
    if scale_els:
        abs_max_el = abs(max(exp_levels, key=abs))  # the absolute max value
        # numpy allows easy division of all elements at once
        # NOTE(review): if every expression level is 0 this divides by
        # zero -- confirm whether that case needs handling.
        exp_levels = exp_levels / abs_max_el
    else:  # If no scaling required
        abs_max_el = None
    # If expression levels are binarized, convert them from float ---> int
    if binarized_els:
        exp_levels = exp_levels.astype(int)

    return encoded_seqs, exp_levels, abs_max_el
Exemple #9
0
def remove_flanks_from_all_seqs(input_seqs, scaffold_type='pTpA'):
    """
    Removes all of the flanking sequences from an input file of
    sequences and their expression levels (tab separated), writing
    the result to a new, time-stamped file under the 'example/'
    directory of ROOT_DIR.
    Example input file:
    GSE104878_20160609_average_promoter_ELs_per_seq_pTpA_ALL.
    shuffled.txt.gz from
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE104878

    Args:
    -----
        input_seqs (str) -- the absolute pathname of the file
        containing all of the input sequences and their expression
        levels (tab separated).

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA)
        that the input sequences had their expression levels
        measured in. Default: 'pTpA'.

    Returns:
    -----
        out_abs_path (str) -- the absolute path for the output file
        containing all of the sequences with their flanks removed,
        along with their expression levels (tab separated).

    Raises:
    -----
        AssertionError -- on invalid arguments, or if any sequence
        in the input file fails the flank check.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Input file pathname must be \
    passed as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    assert isinstance(scaffold_type, str), 'Scaffold type must be passed \
    as a string.'

    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', 'Input \
    scaffold type must be either pTpA or Abf1TATA.'

    # Check that all of the flank sequences are the same in all
    # sequences in the input file.
    incorrect = organize.check_oligonucleotide_flanks(input_seqs,
                                                      scaffold_type)
    assert len(incorrect) == 0, 'Not all sequences in input file have same \
    flanking sequences. Error on line %s' % str(incorrect)
    # Functionality
    # Defining the pathname for the output file.
    time_stamp = get_time_stamp()  # Get unique time stamp for file naming
    relative_path = ('example/' + scaffold_type + '_data/' + time_stamp + '_' +
                     scaffold_type + '_seqs_flanks_removed.txt')
    absolute_path = os.path.join(ROOT_DIR, relative_path)
    # Remove flanks and write data to output file. Both handles are
    # closed on every exit path, including when a helper raises.
    with smart_open(input_seqs, 'r') as infile, \
            smart_open(absolute_path, 'w') as outfile:
        for line in infile:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, exp_level = separate_seq_and_el_data(line)
            deflanked_seq = remove_flanks_from_seq(seq, scaffold_type)
            outfile.write(deflanked_seq + '\t' + str(exp_level) + '\n')

    return absolute_path
Exemple #10
0
def pad_sequences(input_seqs, pad_front=False, extra_padding=0):
    """
    Pads sequences in an input file to the length of the longest
    sequence in the file, plus any extra padding if specified.
    Pads the sequences at either the front or the back, with 'P'
    characters, and writes them to a new file.
    NOTE(review): earlier documentation described the padding
    character as 'N' while the code writes 'P' -- confirm which
    symbol the downstream encoder expects.

    The output path is the input path with every '.txt' replaced by
    '_padded.txt'. If the input path contains no '.txt', '_padded'
    is inserted before the file extension instead, so that the
    output can never overwrite the input.

    Args:
    -----
        input_seqs (str) -- the absolute path of the input file
        containing the sequences to be padded and their associated
        expression levels, tab separated.

        pad_front (bool) -- If True, will add padding to the front
        of the sequences. If False (default) pads sequences at the
        end (i.e. the RHS of the sequences).

        extra_padding (int) -- The number of extra padding
        characters to add onto the front/back of every sequence,
        beyond the length of the longest sequence. Default: 0.

    Returns:
    -----
        absolute_path (str) -- the absolute path of the output file
        containing all of the padded sequences and their associated
        expression levels, tab separated.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Pathname of input file must be \
    passed as a string.'

    assert os.path.exists(input_seqs), 'File does not exist.'
    assert isinstance(pad_front, bool), 'The pad_front variable must be \
    passed as a bool.'

    assert isinstance(extra_padding, int), 'The amount of extra padding must \
    be passed as an integer.'

    assert extra_padding >= 0, 'The amount of extra padding must be passed as \
    a non-negative integer.'

    # Functionality
    # Define the output file path. Without the fallback, a path lacking
    # '.txt' would map onto itself and opening it for writing would
    # truncate the input before it had been read.
    if '.txt' in input_seqs:
        absolute_path = input_seqs.replace('.txt', '_padded.txt')
    else:
        root, ext = os.path.splitext(input_seqs)
        absolute_path = root + '_padded' + ext
    # Work out the target length before any output file is created.
    max_length, _, _ = organize.get_max_min_mode_length_of_seqs(input_seqs)
    pad_length = max_length + extra_padding
    # Retrieve input sequences, pad them, and write them to output file.
    # Both handles are closed on every exit path.
    with smart_open(input_seqs) as f, \
            smart_open(absolute_path, 'w') as outfile:
        for line in f:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, exp_level = separate_seq_and_el_data(line)
            # Yields an empty string when the sequence is already long
            # enough, so no separate "no padding needed" case is needed.
            padding_seq = 'P' * (pad_length - len(seq))
            if pad_front:
                padded_seq = padding_seq + seq
            else:  # pad the end of the sequence
                padded_seq = seq + padding_seq
            outfile.write(padded_seq + '\t' + str(exp_level) + '\n')

    return absolute_path
Exemple #11
0
def insert_all_seq_into_one_scaffold(input_seqs, scaffold_type='pTpA'):
    """
    Takes an input file containing N sequences and inserts each of
    them into a single scaffold sequence, outputting the N resulting
    promoter sequences to a new, time-stamped output file along with
    their expression levels (tab separated). The scaffold sequence
    is the first line of
    'example/<scaffold_type>_data/<scaffold_type>_scaffold.txt'
    under ROOT_DIR.

    Args:
    -----
        input_seqs (str) -- the absolute path for the input file
        containing all the oligonucleotide sequences to be inserted
        into the single scaffold sequence. All sequences must be of
        the same length as the scaffold variable region.

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA)
        that the input sequences had their expression levels
        measured in. Default: 'pTpA'.

    Returns:
    -----
        absolute_path (str) -- the absolute path for the output file
        containing all of the complete promoter sequences (where each
        input sequence has been inserted into the scaffold sequence).
    """
    # Assertions
    assert isinstance(input_seqs, str), 'TypeError: pathname for input file \
    must be a string.'

    assert isinstance(scaffold_type, str), 'Scaffold type must be passed as \
    a string.'

    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', 'Scaffold \
    type must either be passed as "pTpA" or "Abf1TATA".'

    # Functionality
    time_stamp = get_time_stamp()  # get time stamp for unique file naming
    relative_path = ('example/' + scaffold_type + '_data/' + time_stamp + '_' +
                     scaffold_type + '_seqs_inserted_into_scaffold.txt')
    absolute_path = os.path.join(ROOT_DIR, relative_path)
    # Retrieve the scaffold sequence. Only the first line is needed, so
    # the scaffold file is closed again straight away; reading it first
    # also avoids creating an output file when the scaffold is missing.
    scaff_directory = 'example/' + scaffold_type + '_data/'
    scaff_rel_path = scaff_directory + scaffold_type + '_scaffold.txt'
    scaff_abs_path = os.path.join(ROOT_DIR, scaff_rel_path)
    with smart_open(scaff_abs_path, 'r') as scaff_file:
        scaffold = scaff_file.readline().replace('\n', '')
    # Insert sequences into scaffold and write data to output file.
    # Both handles are closed on every exit path.
    with smart_open(input_seqs, 'r') as infile, \
            smart_open(absolute_path, 'w') as outfile:
        for line in infile:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, exp_level = separate_seq_and_el_data(line)
            complete_seq = insert_seq_into_scaffold(seq, scaffold)
            outfile.write(complete_seq + '\t' + str(exp_level) + '\n')

    return absolute_path
Exemple #12
0
def process_raw_data(input_seqs,
                     scaffold_type=None,
                     percentile=None,
                     binarize_els=True,
                     homogeneous=False,
                     deflank=True,
                     insert_into_scaffold=True,
                     extra_padding=0,
                     pad_front=False,
                     report_loss=True,
                     report_times=True,
                     remove_files=True,
                     create_sample_of_size=None):
    """
    A wrapper function that:
    Takes raw data as retrieved from Carl de Boer's publication
    at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE104878,
    and processes the sequences according to the custom arguments,
    pads them to same length, and writes them to an output file
    along with their expression levels (tab separated). Once the
    output file has been written, the number of sequences in the
    file and the length of the padded sequences are written to the
    start of the file.

    The optional steps run in a fixed order: percentile extraction
    (optionally binarized), homogeneous filtering, deflanking,
    scaffold insertion, padding. Each step reads the file produced
    by the previous step and writes a new intermediate file.

    Args:
    -----
        input_seqs (str) -- the absolute pathname of the file that
        contains all of the input sequences and their expression
        levels (tab separated).

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA)
        that the input sequences had their expression levels
        measured in. Although the signature default is None, the
        assertions below reject anything other than 'pTpA' or
        'Abf1TATA', so a value must always be supplied.

        percentile (float) -- the proportion of the raw input data
        to extract from the sequences with the highest and lowest
        expression levels. i.e if 'percentile=0.1' then the top
        10 % of sequences with highest expression levels, and the
        bottom 10 % of sequences with lowest expression levels will
        be extracted from the raw input data. The resulting data
        file will contain ~ 20 % of the data as the raw input data.
        Must be less than 0.5. Default: None (step is skipped).

        binarize_els (bool) -- if (and only if) a 'percentile'
        value is passed, this argument determines whether the
        expression level values (ELs) will be binarized or not. If
        True (default), sequences with ELs in the top percentile
        will have their ELs binarized to 1, and sequences with ELs
        in the bottom percentile will have their ELs binarized
        to 0.

        homogeneous (bool) -- if True, only sequences of modal
        length will be processed. If False, all sequences will be
        processed regardless of length. Default: False.

        deflank (bool) -- if True, removes the constant flanking
        regions of the input sequences. Default: True.

        insert_into_scaffold (bool) -- if True inserts the input
        sequences into the appropriate scaffold. If False, the
        sequences are encoded as they are. Default: True.

        extra_padding (int) -- the number of 'P' characters greater
        than the maximum sequence length to pad each sequence to.
        Default: 0.

        pad_front (bool) -- whether to pad out the front (left hand
        side) or end (right hand side) of the sequences. If True,
        pads the front. Default: False (will pad the end).

        report_loss (bool) -- if True, reports the number of lines
        of data lost at each step in the process. Default: True.

        report_times (bool) -- if True, reports the time each step
        in the cleaning process takes. Default: True.

        remove_files (bool) -- if True, will remove intermediate
        files created in the process of processing raw data.
        Default: True (i.e. intermediary files will be removed).

        create_sample_of_size (int) -- if a number is passed, a
        sample of this size will be taken by pseudo-random from
        the file containing processed data, and written to a
        separate file. Default: None (no sample is created).

    Returns:
    -----
        processed_data (str) -- the absolute path for the file
        containing processed sequences along with their expression
        levels.

    Raises:
    -----
        AssertionError -- if any argument fails the type / value
        checks performed at the top of the function.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Input file path name must be '
                                         'passed as a string.')
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    assert isinstance(scaffold_type, str), ('Scaffold type must be passed as '
                                            'a string if specified.')
    # Implicit string concatenation (rather than a backslash inside the
    # literal) keeps source indentation out of the runtime message.
    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', (
        'Scaffold type must be specified as either "pTpA" or "Abf1TATA".')
    assert isinstance(percentile, (float, type(None))), ('The "percentile" '
                                                         'argument must be '
                                                         'passed as a float.')
    if percentile is not None:
        assert percentile < 0.5, '"percentile" must be less than 0.5'
    assert isinstance(homogeneous, bool), ('The homogeneous argument must be '
                                           'passed as a bool.')
    assert isinstance(deflank, bool), ('The deflank argument must be passed '
                                       'as a bool.')
    assert isinstance(insert_into_scaffold, bool), ('insert_into_scaffold '
                                                    'argument must be passed '
                                                    'as a bool.')
    assert isinstance(extra_padding, int), ('The number of extra vectors to '
                                            'pad each sequence by should be '
                                            'passed as an integer.')
    assert extra_padding >= 0, ('extra_padding must be passed as a non-'
                                'negative integer.')
    assert isinstance(pad_front, bool), ('The pad_front argument must be '
                                         'passed as a bool.')
    assert isinstance(report_loss, bool), ('The report_loss argument must be '
                                           'passed as a bool.')
    assert isinstance(report_times, bool), ('The report_times argument must '
                                            'be passed as a bool.')
    assert isinstance(remove_files, bool), ('The remove_files argument must '
                                            'be passed as a bool.')
    if create_sample_of_size is not None:
        assert isinstance(create_sample_of_size, int), ('Sample size must be '
                                                        'passed as an int')
    # Functionality
    print('Starting processing of raw data...')
    raw_data = input_seqs
    # Define final output file path. Suffixes describing each processing
    # step performed are appended to this path as the steps run.
    time_stamp = get_time_stamp()
    relative_path = 'example/processed_data/' + time_stamp
    processed_data = os.path.join(ROOT_DIR, relative_path)
    # Create log file to write reports to. 'report' stays None when no
    # reporting was requested so later writes can be guarded on it.
    report = None
    if report_loss or report_times:
        report = smart_open(processed_data + '_process_report' + '.txt', 'w')
    # Initialize custom operations if specified (i.e loss + timing reports)
    if report_loss:
        loss_report = {}
        loss_report['Raw Data'] = get_seq_count(input_seqs)
    if report_times:
        t_init = t.time()
        t0 = t_init
    if remove_files:
        created_files = []  # keep track of the intermediate files created.
    # Pull out the top and bottom percentiles of data
    if percentile is not None:
        print('Pulling out the top and bottom percentiles...')
        df = organize.sort_by_exp_level(input_seqs)
        df = organize.discard_mid_data(df, percentile=percentile)
        processed_data += '_percentiles'
        if binarize_els:
            print('Binarizing expression levels...')
            df = organize.binarize_data(df)
            processed_data += '_els_binarized'
        input_seqs = organize.write_df_to_file(df)
        if report_loss:
            loss_report['Percentile Seqs'] = get_seq_count(input_seqs)
        if report_times:
            t1 = t.time()
            text = '\tFile created in %s s' % (t1 - t0)
            print(text)
            report.write('Top & bottom percentiles pulled...\n' + text + '\n')
            t0 = t1
        if remove_files:
            created_files.append(input_seqs)
    # Create new file of only homogeneous (same length) seqs
    if homogeneous:
        print('Pulling homogeneous sequences from input file...')
        input_seqs = organize.pull_homogeneous_seqs(input_seqs, scaffold_type)
        processed_data += '_homogeneous'
        if report_loss:
            loss_report['Homogeneous Seqs'] = get_seq_count(input_seqs)
        if report_times:
            t1 = t.time()
            text = '\tFile created in %s s' % (t1 - t0)
            print(text)
            report.write('Homogeneous sequences pulled...\n' + text + '\n')
            t0 = t1
        if remove_files:
            created_files.append(input_seqs)
    # Remove all of the flanking regions from the input sequences
    if deflank:
        print('Removing flank regions from sequences...')
        input_seqs = build.remove_flanks_from_all_seqs(input_seqs,
                                                       scaffold_type)
        processed_data += '_deflanked'
        if report_loss:
            loss_report['Deflanked Seqs'] = get_seq_count(input_seqs)
        if report_times:
            t1 = t.time()
            text = '\tFile created in %s s' % (t1 - t0)
            print(text)
            report.write('Sequences deflanked...\n' + text + '\n')
            t0 = t1
        if remove_files:
            created_files.append(input_seqs)
    processed_data += '_sequences'
    # Insert sequences into appropriate scaffold
    if insert_into_scaffold:
        print('Inserting sequences into %s scaffold...' % (scaffold_type))
        input_seqs = build.insert_all_seq_into_one_scaffold(
            input_seqs, scaffold_type)
        processed_data += '_inserted_into_%s_scaffold' % (scaffold_type)
        if report_loss:
            loss_report['Scaffold-Inserted Seqs'] = get_seq_count(input_seqs)
        if report_times:
            t1 = t.time()
            text = '\tFile created in %s s' % (t1 - t0)
            print(text)
            # Leading space keeps the scaffold name and the word
            # 'scaffold' apart in the report.
            report.write('Seqs inserted into ' + scaffold_type +
                         ' scaffold...\n')
            report.write(text + '\n')
            t0 = t1
        if remove_files:
            created_files.append(input_seqs)
    # Pad sequences. Padding is only skipped when the sequences are already
    # of one length (homogeneous) and no extra padding was requested.
    if homogeneous and extra_padding == 0:
        pass
    else:
        print('Padding sequences...')
        input_seqs = build.pad_sequences(input_seqs,
                                         pad_front=pad_front,
                                         extra_padding=extra_padding)
    if not homogeneous:  # then they will have been padded
        processed_data += '_padded_at'
        if pad_front:
            processed_data += '_front'
        else:
            processed_data += '_back'
    if extra_padding != 0:
        processed_data += '_%s_extra' % (extra_padding)
    if report_loss:
        loss_report['Padded Seqs'] = get_seq_count(input_seqs)
    if report_times:
        t1 = t.time()
        text = '\tFile created in %s s' % (t1 - t0)
        print(text)
        report.write('Padded sequences...\n')
        report.write(text + '\n')
        t0 = t1
    # Track the last file produced, but never queue the caller's raw input
    # file for removal if no step produced a new file.
    if remove_files and input_seqs != raw_data:
        created_files.append(input_seqs)
    # Rename the final output file to reflect how data has been cleaned.
    processed_data += '_with_exp_levels.txt'
    # Report end of process and print final output file locations.
    if input_seqs != raw_data:  # i.e. if data has been processed in some way
        os.rename(input_seqs, processed_data)
        # Report end of process and print absolute path of processed data.
        text = ('\nRaw data successfully processed.\nLocation: %s\n' %
                (processed_data))
        print(text)
        if report is not None:
            report.write(text)
    else:  # If no processing was performed.
        no_processing_messages = (
            '\nNo processing performed.\n'
            'Change processing specifications and try again.',
            'Raw data remains unchanged.',
            'Location : %s' % (raw_data),
        )
        for text in no_processing_messages:
            print(text)
            # The report file only exists when reporting was requested.
            if report is not None:
                report.write(text + '\n')
    # Write the number of seqs and length of seqs to the start of file
    organize.write_num_and_len_of_seqs_to_file(processed_data)
    # Report loss. Relies on the dict preserving insertion order so that
    # each count is compared against the step that ran just before it.
    if report_loss:
        report.write('\nLine counts at each step of the process:\n')
        for category, curr_count in loss_report.items():
            if category == 'Raw Data':
                report.write('\t%s : %s\n' % (category, curr_count))
            else:
                report.write('\t%s : %s (%s lines lost since last step)\n' %
                             (category, curr_count, (prev_count - curr_count)))
            prev_count = curr_count
    # Remove intermediate files
    if remove_files:
        print('\nRemoving intermediate files...')
        organize.remove_file_list(created_files)
        print('Files successfully removed.')
    print('Process complete.')
    # Report total time taken
    if report_times:
        t_final = t.time()
        text = '\nTotal processing time : %s s' % (t_final - t_init)
        print(text)
        report.write(text)
        print('Please find the process report in the same directory as the'
              ' output file for reports of data losses and timings.')
    if report is not None:
        report.close()
    # Create sample data
    if create_sample_of_size is not None:
        size = create_sample_of_size
        print('\n\nCreating sample of size %s ...' % str(size))
        sample_seqs = organize.create_sample_data(processed_data, size)
        print('\nSample data successfully created.')
        print('\nLocation: %s \n' % (sample_seqs))

    return processed_data