Esempio n. 1
0
def sort_by_exp_level(input_seqs):
    """
    Loads a tab separated file of sequences and expression levels
    into a data frame and orders the rows from the highest
    expression level down to the lowest.

    If the first two lines of the file carry the tokens
    'number_of_seqs_in_file' and 'length_of_each_sequence', those
    two lines are left out of the data frame.

    Args:
    -----
        input_seqs (str) -- the absolute path of the input file
        containing sequences to be sorted by expression level.

    Returns:
    -----
        sorted_df (pandas.DataFrame) -- a data frame with columns
        'seq' and 'el' whose rows are sorted in descending order of
        expression level and re-indexed after sorting.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Path name for input file must be \
    passed as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    header_tokens = ('number_of_seqs_in_file', 'length_of_each_sequence')
    # Peek at the first two lines to find out whether a header is present
    with smart_open(input_seqs, 'r') as f:
        first_token, _ = separate_seq_and_el_data(
            check_valid_line(f.readline()))
        second_token, _ = separate_seq_and_el_data(
            check_valid_line(f.readline()))
    has_header = (first_token, second_token) == header_tokens
    skip = 2 if has_header else 0
    # Load the data rows, leaving out the header lines when present
    df = pd.read_csv(input_seqs, sep='\t', names=['seq', 'el'], skiprows=skip)
    # Order by expression level, then throw away the pre-sort row labels
    sorted_df = (df.sort_values('el', ascending=False)
                   .reset_index()
                   .drop(columns='index'))

    return sorted_df
Esempio n. 2
0
def write_num_and_len_of_seqs_to_file(input_seqs):
    """
    Rewrites an input file of processed sequences so that it starts
    with two header lines, followed by the original contents:

        number_of_seqs_in_file<TAB><###>
        length_of_each_sequence<TAB><$$$>

    where '<###>' is the sequence count reported by get_seq_count
    and '<$$$>' is the length of the sequence on the first line of
    the file. All sequences are assumed to have been padded to that
    same length.

    Args:
    -----
        input_seqs (str) -- the absolute path of the processed
        input sequences to extract information from.

    Returns:
    -----
        None

    Raises:
    -----
        AssertionError -- if the path is not a string, the file does
        not exist, or the first line of the file is not valid.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Absolute pathname must be passed\
    as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    num_seqs = get_seq_count(input_seqs)
    # The first sequence stands in for all of them (uniform padding assumed)
    with smart_open(input_seqs, 'r') as f:
        first_line = check_valid_line(f.readline())
        if first_line == 'skip_line':
            raise AssertionError('First line is not valid.')
        first_seq, _ = separate_seq_and_el_data(first_line)
        len_seqs = len(first_seq)
    # Hold on to the current contents so they can be written back below
    with smart_open(input_seqs, 'r+') as f:
        contents = f.read()
    header = ''.join(['number_of_seqs_in_file\t', str(num_seqs), '\n',
                      'length_of_each_sequence\t', str(len_seqs), '\n'])
    # Gzipped files get the header as bytes
    if input_seqs.endswith('.gz'):
        header = header.encode()
    with smart_open(input_seqs, 'w+') as f:
        f.write(header + contents)

    return None
Esempio n. 3
0
def get_max_min_mode_length_of_seqs(input_seqs):
    """
    Returns the maximum, minimum, and modal length of the sequences
    in a file containing input sequences. Lines that
    check_valid_line reports as 'skip_line' are ignored.

    Args:
    -----
        input_seqs (str) -- the absolute path of the file
        containing the input sequences and their expression levels,
        tab separated.

    Returns:
    -----
        max_length (int) -- the length of the longest sequence in
        the input file.

        min_length (int) -- the length of the shortest sequence in
        the input file.

        modal_length (int) -- the most common sequence length of
        the sequences in the input file.

    Raises:
    -----
        AssertionError -- if the path is not a string or the file
        does not exist.

        ValueError -- if the file contains no valid sequence lines.
    """
    from collections import Counter

    # Assertions
    assert isinstance(input_seqs, str), 'Path name for input file must be \
    passed as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    seq_lengths = []
    # The context manager closes the file even if a helper raises mid-loop
    with smart_open(input_seqs, 'r') as infile:
        for line in infile:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, _ = separate_seq_and_el_data(line)
            seq_lengths.append(len(seq))
    if not seq_lengths:
        # max()/min() would raise a ValueError here anyway; say why instead
        raise ValueError('No valid sequences found in input file.')
    max_length = max(seq_lengths)
    min_length = min(seq_lengths)
    # Count every length once up front instead of calling list.count for
    # each distinct length. Iterating over the same set keeps the choice
    # between equally common lengths the same as before.
    length_counts = Counter(seq_lengths)
    modal_length = max(set(seq_lengths), key=length_counts.get)

    return max_length, min_length, modal_length
Esempio n. 4
0
def get_num_and_len_of_seqs_from_file(input_seqs):
    """
    Reads the two header lines at the top of a processed sequence
    file and returns the values they carry. The file is expected to
    start with:

        number_of_seqs_in_file<TAB><###>
        length_of_each_sequence<TAB><$$$>

    where '<###>' is the number of sequences in the file and
    '<$$$>' is the length to which every sequence in the file has
    been padded.

    Args:
    -----
        input_seqs (str) -- the absolute path of the processed
        input sequences to extract information from.

    Returns:
    -----
        num_seqs -- the number of sequences in input_seqs, as
        parsed from the first line by separate_seq_and_el_data.

        len_seqs -- the length of all the padded sequences in the
        input file, as parsed from the second line by
        separate_seq_and_el_data.

    Raises:
    -----
        AssertionError -- if the path is not a string, the file does
        not exist, or either header line is invalid or carries the
        wrong token.

        ValueError -- if either header value is not a whole number.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Absolute pathname must be passed\
    as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    with smart_open(input_seqs, 'r') as f:
        # Header line 1: how many sequences the file holds
        count_line = check_valid_line(f.readline())
        assert count_line != 'skip_line', 'Invalid first line of file. Must\
        be of the form: "number_of_seqs_in_file\t<###>" where <###> is the\
        number of sequences in the file.'

        count_token, num_seqs = separate_seq_and_el_data(count_line)
        # A non-zero remainder means the value is not a whole number
        if num_seqs % 1 != 0:
            raise ValueError('Number of sequences on first line must be\
            an integer.')
        assert count_token == 'number_of_seqs_in_file', 'First line of the input\
        file must be of the form: "number_of_seqs_in_file\t<###>" where\
        <###> is the number of sequences in the file.'

        # Header line 2: the common (padded) length of the sequences
        length_line = check_valid_line(f.readline())
        assert length_line != 'skip_line', 'Invalid second line of file.\
        Must be of the form: "length_of_each_sequence\t<###>" where <###> is\
        the length of every sequence in the file.'

        length_token, len_seqs = separate_seq_and_el_data(length_line)
        if len_seqs % 1 != 0:
            raise ValueError('Sequence length on second line must be an\
            integer.')
        assert length_token == 'length_of_each_sequence', 'Second line of the input\
        file must be of the form: "length_of_each_sequence\t<###>" where\
        <###> is the length of every sequence in the file. Assumes\
        homogeneity and/or padding of sequences.'

    return num_seqs, len_seqs
Esempio n. 5
0
def check_oligonucleotide_flanks(seq_infile, scaffold_type):
    """
    Checks that all the oligonucleotide sequences in an input file
    start and end with the flanking sequences expected for the
    given scaffold type. Sequences measured in the pTpA scaffold
    should be of the form:
    TGCATTTTTTTCACATC-(variable region)-GGTTACGGCTGTT
    Whereas the input sequences measured in the Abf1TATA scaffold
    should be of the form:
    TCACGCAGTATAGTTC-(variable region)-GGTTTATTGTTTATAAAAA
    These flanking sequences are for in-lab sequencing purposes only,
    so can be discarded when the variable sequences are inserted
    into a scaffold sequence.

    Args:
    -----
        seq_infile (str) -- the absolute path of the input file
        containing all of the oligonucleotide sequences to be
        checked, and their expression level values (tab separated).

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA)
        in which the expression levels for the sequences in the
        input file were measured.

    Returns:
    -----
        incorrect_lines (list) -- a list of 1-based line numbers of
        the sequences that contain incorrect flank sequences. Lines
        that check_valid_line reports as 'skip_line' are not checked
        but still count towards the line numbering.
    """
    # Assertions
    assert isinstance(seq_infile, str), 'Absolute pathname must be passed \
    as a string.'

    assert isinstance(scaffold_type, str), 'Scaffold type must be passed as a\
    string.'

    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', 'Scaffold \
    type must be specified as either pTpA or Abf1TATA.'

    # Functionality
    flanks = {
        'pTpA': ('TGCATTTTTTTCACATC', 'GGTTACGGCTGTT'),
        'Abf1TATA': ('TCACGCAGTATAGTTC', 'GGTTTATTGTTTATAAAAA'),
    }
    flank_A, flank_B = flanks[scaffold_type]
    incorrect_lines = []
    # The context manager makes sure the input file is always closed
    with smart_open(seq_infile, 'r') as infile:
        for line_number, line in enumerate(infile, start=1):
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, _ = separate_seq_and_el_data(line)
            if not (seq.startswith(flank_A) and seq.endswith(flank_B)):
                incorrect_lines.append(line_number)

    return incorrect_lines
Esempio n. 6
0
def pull_homogeneous_seqs(input_seqs, scaffold_type=None):
    """
    Pulls all sequences of the modal length (i.e. 110 bp for pTpA-type
    sequences and 115 bp for Abf1TATA-type) from an input file and
    writes them, with their expression levels, into a new time
    stamped output file under the 'example/' directory of ROOT_DIR.

    Args:
    -----
        input_seqs (str) -- the absolute pathname of the input file
        containing all of the raw oligonucleotide sequences and
        their expression levels, tab separated.

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA
        for which the modal length is known to be 110 and 115
        respectively) in which the expression levels for the
        sequences in the input file were measured. If None, the
        modal length is calculated from the input file.
        Default: None.

    Returns:
    -----
        absolute_path (str) -- the absolute pathname of the output
        file containing the sequences of modal length.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Input file pathname must be a '
                                         'string.')
    assert os.path.isfile(input_seqs), 'Input file does not exist!'
    assert isinstance(scaffold_type, (str, type(None))), 'Scaffold type must\
    be passed as a string.'

    if isinstance(scaffold_type, str):
        assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', 'Scaff\
        type must be specified as either pTpA or Abf1TATA, or else\
        unspecified (in which case it takes value of None).'

    # Functionality
    # Defining the path name of the output file.
    relative_path = 'example/'
    time_stamp = get_time_stamp()
    if scaffold_type is None:
        relative_path += ('other_scaffolds/' + time_stamp +
                          '_homogeneous_seqs.txt')
    else:
        relative_path += (scaffold_type + '_data/' + time_stamp + '_' +
                          scaffold_type + '_homogeneous_seqs.txt')
    absolute_path = os.path.join(ROOT_DIR, relative_path)
    # Retrieve modal length for sequences in input file. This happens
    # before the output file is created so that a failure here does not
    # leave an empty output file behind.
    if scaffold_type == 'pTpA':
        modal_length = 110
    elif scaffold_type == 'Abf1TATA':
        modal_length = 115
    else:
        _, _, modal_length = get_max_min_mode_length_of_seqs(input_seqs)
    # Find seqs in input file w/ modal length and write them to output file.
    # Both files are closed by the context manager, even on error.
    with smart_open(input_seqs, 'r') as infile, \
            smart_open(absolute_path, 'w') as output_seqs:
        for line in infile:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, exp_level = separate_seq_and_el_data(line)
            if len(seq) == modal_length:
                output_seqs.write(seq + '\t' + str(exp_level) + '\n')

    return absolute_path
Esempio n. 7
0
def encode_sequences_with_method(input_seqs,
                                 method='One-Hot',
                                 scale_els=True,
                                 model_type='1DCNN',
                                 binarized_els=False):
    """
    A wrapper function that encodes all of the sequences in an
    input file according to the specified method, and returns
    them in a numpy array, as well as returning the associated
    expression levels in a separate numpy array.

    Args:
    -----
        input_seqs (str) -- absolute path of the file containing
        all of the input sequences to be encoded, tab-separated
        with their associated expression levels. The first line of
        the file must be the token 'number_of_seqs_in_file', a tab,
        and the number of sequences in the file. The second line
        must be the token 'length_of_each_sequence', a tab, and the
        length to which all sequences are padded. Assumes
        homogeneity and/or padding of sequences.

        method (str) -- the method by which the sequence should be
        encoded. Must choose from: 'One-Hot'. Default: 'One-Hot'

        scale_els (bool) -- if True (default), divides all of the
        expression levels by the largest absolute expression level,
        so that they lie between -1 and 1. If every expression level
        is zero they are returned unscaled.

        model_type (str) -- the type of model being used. Controls
        the shape of the returned array that contains the encoded
        sequences. Must be one of: '1DCNN' (for 1D-convolutional
        net), '1DLOCCON' (for 1D-locally connected net), or 'LSTM'
        (for Long-Short-Term-Memory net). Default: '1DCNN'.

        binarized_els (bool) -- if True, the expression levels are
        cast to int before being returned. Default: False.

    Returns:
    -----
        encoded_seqs (numpy.ndarray) -- an integer array of all the
        sequences in the input file, encoded with the specified
        method. Shape of this array depends on 'model_type'. For
        example, for an input file containing 10000 sequences, each
        of length 257, where the length of each base vector is 5,
        the output shapes of encoded_seqs are as follows:
        '1DCNN'    ===> (10000, 257, 5)
        '1DLOCCON' ===> (10000, 257, 5)
        'LSTM'     ===> (10000, 1, 1285) where 1285=257*5

        exp_levels (numpy.ndarray) -- an array of all the expression
        levels associated with the sequences. Values scaled to
        between -1 and 1 if argument 'scale_els=True'.

        abs_max_el (float) -- the largest absolute expression level
        value in the input file if 'scale_els=True', otherwise None.

    Raises:
    -----
        AssertionError -- if an argument fails validation, or if
        encoding a sequence fails (the message carries the 0-based
        index of the offending data line).

        NotImplementedError -- if 'method' passes validation but no
        encoder is implemented for it here.

    Note:
    -----
        A row's position in the output arrays is its line position
        in the file after the two header lines. Lines that are
        skipped as invalid therefore leave an all-zero row behind.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'TypeError: Input file path must be \
    passed as a string.'

    assert isinstance(method, str), 'TypeError: Specified method must be a \
    a string.'

    # The one-element tuples keep the formatting valid if the constants
    # are themselves tuples.
    assert method in METHODS, 'Must specify one the method of encoding the \
    sequence. Choose one of: %s' % (METHODS,)
    assert isinstance(scale_els, bool), 'scale_els argument must be passed\
    as a bool.'

    assert isinstance(model_type, str), 'model_type argument must be passed\
    as a string.'

    assert model_type in MODELS, 'Must specify model_type as one of the\
    following: %s' % (MODELS,)
    # Functionality
    # Initialize output arrays, preallocating dimensions for speed.
    num_seqs, len_seq = organize.get_num_and_len_of_seqs_from_file(input_seqs)
    num_seqs, len_seq = int(num_seqs), int(len_seq)
    encoded_seqs = np.zeros((num_seqs, len_seq, 5), dtype=int)
    exp_levels = np.zeros(num_seqs)
    # Encode sequences. The context manager closes the input file even if
    # encoding fails part-way through.
    with smart_open(input_seqs, 'r') as infile:
        # Counting from -2 puts the first data line at index 0
        for line_number, line in enumerate(infile, start=-2):
            if line_number < 0:
                continue  # skip first 2 lines of the file
            line = check_valid_line(line)
            if line == 'skip_line':
                continue  # skip line if not a valid line
            seq, exp_level = separate_seq_and_el_data(line)
            if method == 'One-Hot':
                try:
                    encoded_seq = one_hot_encode_sequence(seq)
                except Exception:
                    raise AssertionError('Error on line %s' % (line_number))
            else:
                # Fail loudly rather than fall through with no encoding
                raise NotImplementedError(
                    'Encoding method %r is not implemented.' % (method,))
            # Assign encoded sequence and expression level to output arrays
            encoded_seqs[line_number] = encoded_seq
            exp_levels[line_number] = exp_level
    # Flatten each sequence into a single time step for the LSTM model
    if model_type == 'LSTM':
        encoded_seqs = encoded_seqs.reshape(num_seqs, 1, len_seq * 5)
    # Scale expression level values to between -1 and 1
    if scale_els:
        abs_max_el = np.abs(exp_levels).max()  # the absolute max value
        if abs_max_el != 0:  # all-zero levels would otherwise become NaN
            exp_levels = exp_levels / abs_max_el
    else:  # If no scaling required
        abs_max_el = None
    # If expression levels are binarized, convert them from float ---> int
    if binarized_els:
        exp_levels = exp_levels.astype(int)

    return encoded_seqs, exp_levels, abs_max_el
Esempio n. 8
0
def remove_flanks_from_all_seqs(input_seqs, scaffold_type='pTpA'):
    """
    Removes all of the flanking sequences from an input file of
    sequences and their expression levels (tab separated), and
    writes the result to a new time stamped output file under the
    'example/' directory of ROOT_DIR.
    Example input file:
    GSE104878_20160609_average_promoter_ELs_per_seq_pTpA_ALL.
    shuffled.txt.gz from
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE104878

    Args:
    -----
        input_seqs (str) -- the absolute pathname of the file
        containing all of the input sequences and their expression
        levels (tab separated).

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA)
        that the input sequences had their expression levels
        measured in. Default: 'pTpA'.

    Returns:
    -----
        absolute_path (str) -- the absolute path for the output file
        containing all of the sequences with their flanks removed,
        along with their expression levels (tab separated).

    Raises:
    -----
        AssertionError -- if an argument fails validation, or if any
        sequence in the file does not carry the flanks expected for
        the scaffold type.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Input file pathname must be \
    passed as a string.'

    assert os.path.exists(input_seqs), 'Input file does not exist.'
    assert isinstance(scaffold_type, str), 'Scaffold type must be passed \
    as a string.'

    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', 'Input \
    scaffold type must be either pTpA or Abf1TATA.'

    # Check that all of the flank sequences are the same in all
    # sequences in the input file.
    incorrect = organize.check_oligonucleotide_flanks(input_seqs,
                                                      scaffold_type)
    assert len(incorrect) == 0, 'Not all sequences in input file have same \
    flanking sequences. Error on line %s' % str(incorrect)
    # Functionality
    # Defining the pathname for the output file.
    time_stamp = get_time_stamp()  # Get unique time stamp for file naming
    relative_path = ('example/' + scaffold_type + '_data/' + time_stamp + '_' +
                     scaffold_type + '_seqs_flanks_removed.txt')
    absolute_path = os.path.join(ROOT_DIR, relative_path)
    # Remove flanks and write data to output file. Both files are closed
    # by the context manager, even if a line fails to process.
    with smart_open(input_seqs, 'r') as infile, \
            smart_open(absolute_path, 'w') as outfile:
        for line in infile:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, exp_level = separate_seq_and_el_data(line)
            deflanked_seq = remove_flanks_from_seq(seq, scaffold_type)
            outfile.write(deflanked_seq + '\t' + str(exp_level) + '\n')

    return absolute_path
Esempio n. 9
0
def pad_sequences(input_seqs, pad_front=False, extra_padding=0):
    """
    Pads sequences in an input file to the length of the longest
    sequence in the file, plus any extra padding if specified.
    Pads the sequences at either the front or the back, with 'P'
    characters, and writes them to a new output file.

    The output file sits next to the input file. Its name is the
    input name with '.txt' replaced by '_padded.txt', or, when the
    input path contains no '.txt', with '_padded' inserted before
    the file extension. The output is therefore never the input
    file itself.

    Args:
    -----
        input_seqs (str) -- the absolute path of the input file
        containing the sequences to be padded and their associated
        expression levels, tab separated.

        pad_front (bool) -- If True, will add padding to the front
        of the sequences. If False (default) pads sequences at the
        end (i.e. the RHS of the sequences).

        extra_padding (int) -- The number of extra padding
        characters to add onto the front/back of every sequence, on
        top of what is needed to reach the longest sequence.
        Default: 0.

    Returns:
    -----
        absolute_path (str) -- the absolute path of the output file
        containing all of the padded sequences and their associated
        expression levels, tab separated.
    """
    # Assertions
    assert isinstance(input_seqs, str), 'Pathname of input file must be \
    passed as a string.'

    assert os.path.exists(input_seqs), 'File does not exist.'
    assert isinstance(pad_front, bool), 'The pad_front variable must be \
    passed as a bool.'

    assert isinstance(extra_padding, int), 'The amount of extra padding must \
    be passed as an integer.'

    assert extra_padding >= 0, 'The amount of extra padding must be passed as \
    a non-negative integer.'

    # Functionality
    # Define the output file. Without the fallback, a path containing no
    # '.txt' would be returned unchanged by str.replace, and opening it
    # for writing would wipe the input file before it had been read.
    if '.txt' in input_seqs:
        absolute_path = input_seqs.replace('.txt', '_padded.txt')
    else:
        root, ext = os.path.splitext(input_seqs)
        absolute_path = root + '_padded' + ext
    # Work out the target length before creating the output file, so that
    # a failure here does not leave an empty output file behind.
    max_length, _, _ = organize.get_max_min_mode_length_of_seqs(input_seqs)
    pad_length = max_length + extra_padding
    # Retrieve input sequences, pad them, and write them to output file.
    # Both files are closed by the context manager, even on error.
    with smart_open(input_seqs) as infile, \
            smart_open(absolute_path, 'w') as outfile:
        for line in infile:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, exp_level = separate_seq_and_el_data(line)
            # An already full-length sequence gets an empty padding string
            padding_seq = 'P' * (pad_length - len(seq))
            if pad_front:
                padded_seq = padding_seq + seq
            else:  # pad the end of the sequence
                padded_seq = seq + padding_seq
            outfile.write(padded_seq + '\t' + str(exp_level) + '\n')

    return absolute_path
Esempio n. 10
0
def insert_all_seq_into_one_scaffold(input_seqs, scaffold_type='pTpA'):
    """
    Takes an input file containing N sequences and inserts each of
    them into a single scaffold sequence, outputting the N
    resulting promoter sequences to a new time stamped output file
    along with their expression levels (tab separated).

    The scaffold sequence is read from the first line of
    'example/<scaffold_type>_data/<scaffold_type>_scaffold.txt'
    under ROOT_DIR.

    Args:
    -----
        input_seqs (str) -- the absolute path for the input file
        containing all the oligonucleotide sequences to be inserted
        into the single scaffold sequence. All sequences must be of
        the same length as the scaffold variable region.

        scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA)
        that the input sequences had their expression levels
        measured in. Default: 'pTpA'.

    Returns:
    -----
        absolute_path (str) -- the absolute path for the output file
        containing all of the complete promoter sequences (where each
        input sequence has been inserted into the scaffold sequence).
    """
    # Assertions
    assert isinstance(input_seqs, str), 'TypeError: pathname for input file \
    must be a string.'

    assert isinstance(scaffold_type, str), 'Scaffold type must be passed as \
    a string.'

    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', 'Scaffold \
    type must either be passed as "pTpA" or "Abf1TATA".'

    # Functionality
    time_stamp = get_time_stamp()  # get time stamp for unique file naming
    relative_path = ('example/' + scaffold_type + '_data/' + time_stamp + '_' +
                     scaffold_type + '_seqs_inserted_into_scaffold.txt')
    absolute_path = os.path.join(ROOT_DIR, relative_path)
    # Retrieve the scaffold sequence before creating the output file, so
    # that a missing scaffold file does not leave an empty output behind.
    scaff_directory = 'example/' + scaffold_type + '_data/'
    scaff_rel_path = scaff_directory + scaffold_type + '_scaffold.txt'
    scaff_abs_path = os.path.join(ROOT_DIR, scaff_rel_path)
    with smart_open(scaff_abs_path, 'r') as scaff_file:
        scaffold = scaff_file.readline().replace('\n', '')
    # Insert sequences into scaffold and write data to output file. Both
    # files are closed by the context manager, even on error.
    with smart_open(input_seqs, 'r') as infile, \
            smart_open(absolute_path, 'w') as outfile:
        for line in infile:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, exp_level = separate_seq_and_el_data(line)
            complete_seq = insert_seq_into_scaffold(seq, scaffold)
            outfile.write(complete_seq + '\t' + str(exp_level) + '\n')

    return absolute_path