Esempio n. 1
0
def get_by_directory(directory):
    """ Rename the record ids of every sequence file in a directory

    Each regular file directly inside ``directory`` is parsed, every record id
    gets ``_<file name without extension>_<record number>`` appended, and the
    result is written under the same file name into ``<directory>/tmp``
    (created when missing). The original files are left untouched.

    :param directory: (str)     Name of directory with files to process
    """
    # Output directory lives inside the input directory; create if not exists
    new_location = os.path.join(directory, "tmp")
    if not os.path.isdir(new_location):
        os.makedirs(new_location)
    for file_name in os.listdir(directory):
        fullpath_old = os.path.join(directory, file_name)
        # os.listdir returns bare names, so the check must use the joined path;
        # testing the bare name would look in the current working directory.
        # This also skips the "tmp" output directory itself.
        if not os.path.isfile(fullpath_old):
            continue
        data_type = BioOps.get_type(file_name)
        base_name, _ext = os.path.splitext(file_name)
        # Get all records in file
        records_in_file = BioOps.parse_large(fullpath_old, data_type)
        # Append file name and a running number so ids stay distinguishable
        for num, record in enumerate(records_in_file):
            record.id += "_" + base_name + "_" + str(num)
        # Write renamed version of file to the tmp directory
        fullpath_new = os.path.join(new_location, file_name)
        with open(fullpath_new, "w") as O:
            SeqIO.write(records_in_file, O, data_type)
Esempio n. 2
0
def get_seq_from_file(file_name, seq_id):
    """ Locate sequence by ID/description (regex supported) and save to file

    An exact id match is tried first. When that fails, ``seq_id`` is used as a
    regular expression and searched for in every record id and description.
    The selected record(s) are written to ``<seq_id>.<file_name without
    extension>`` in the format of the input file.

    :param file_name: (str)     User-passed name of file
    :param seq_id: (str)        User-passed sequence id (or regex) to get from file
    :raises SequenceIdNotFoundError: if neither the exact id nor the regex
        matches any record
    """
    file_type = BioOps.get_type(file_name)
    data = {
        record.id: record
        for record in BioOps.parse_large(file_name, file_type)
    }
    # Stays None for an exact match; holds the match count for a regex search
    seq_num = None
    if seq_id in data:
        records_to_write = [data[seq_id]]
    else:
        pattern = re.compile(seq_id)
        # Each record is tested once against id and description, so a record
        # matching both is written and counted a single time. A list keeps
        # file order and does not require the records to be hashable.
        records_to_write = [
            record for record in data.values()
            if pattern.search(record.id) or pattern.search(record.description)
        ]
        if not records_to_write:
            print("Could not locate %s in %s" % (seq_id, file_name))
            raise SequenceIdNotFoundError(seq_id)
        seq_num = len(records_to_write)
    file_name = os.path.splitext(file_name)[0]
    # Write found sequence(s) to file through a single, reliably closed handle
    out_file = "{}.{}".format(seq_id, file_name)
    with open(out_file, "w") as W:
        SeqIO.write(records_to_write, W, file_type)
    if seq_num is None:
        print(" Sequence {} copied from file {} to {}".format(
            seq_id, file_name, out_file))
    else:
        print("{} sequence(s) copied from file {} to {}".format(
            seq_num, file_name, out_file))
Esempio n. 3
0
def reverse_complement_file(file_name):
    """ Reverse complements sequences in file

    Every record is replaced by its reverse complement with ``.revComp``
    appended to the id, and the result is written to ``tmp.<file_name>`` in
    the format of the input file.

    :param file_name: (str)     Name of file
    """
    data_type = BioOps.get_type(file_name)
    records_in_file = BioOps.parse_large(file_name, data_type)
    # reverse_complement() returns a NEW record and leaves the original
    # untouched, so the returned records are the ones that must be written.
    # name/description are carried over explicitly; the id gets the suffix.
    reversed_records = [
        record.reverse_complement(id=record.id + ".revComp",
                                  name=True,
                                  description=True)
        for record in records_in_file
    ]
    SeqIO.write(reversed_records, "tmp." + file_name, data_type)
Esempio n. 4
0
    def rewrite_ids_in_fastx(image_mapper, file_name, remove_description=True):
        """ Replace fastx record ids via an index mapping and write a new file

        The output is written next to the input, using the input name plus the
        extension registered in ``IndexExtensions.match`` for the file type.

        :param image_mapper: (IndexMapper)  Mapped index file; queried with
                                            ``get(<old id>)`` for the new id
        :param file_name: (str)             Name of fastx file
        :param remove_description: (bool)   Blank out each record description
        """
        fastx_type = BioOps.get_type(file_name)
        parsed = BioOps.parse_large(file_name, fastx_type)
        for entry in parsed:
            entry.id = image_mapper.get(entry.id)
            if not remove_description:
                continue
            entry.description = ""
        out_path = file_name + IndexExtensions.match[fastx_type]
        SeqIO.write(parsed, out_path, fastx_type)
Esempio n. 5
0
def get_by_file(file_name):
    """ Replace every sequence id in a file with ``<file_name>_<number>``

    The renamed records are written to ``<file_name>.tmp``.

    :param file_name: (str)     Name of file whose records are renamed
    """
    # Destination of the renamed copy
    destination = file_name + ".tmp"
    seq_format = BioOps.get_type(file_name)
    parsed_records = BioOps.parse_large(file_name, seq_format)
    # The running position keeps the generated ids unique within the file
    for position, entry in enumerate(parsed_records):
        entry.id = "_".join((file_name, str(position)))
    with open(destination, "w") as handle:
        SeqIO.write(parsed_records, handle, seq_format)
Esempio n. 6
0
def get_seqs_from_file(file_name, number):
    """ Retrieve a number of sequences from a file

    The records at the requested positions are written to
    ``selected.<file_name>`` in the format of the input file.

    :param file_name: (str)     Name of file to parse
    :param number: (str)        User-passed number, list of numbers, or range to get
    """
    indices_to_get = _translate_number_string_to_interval(number)
    data_type = BioOps.get_type(file_name)
    data = BioOps.parse_large(file_name, data_type)
    # Build list of values to return based on user-passed indices
    to_return = [data[index] for index in indices_to_get]
    # Write found sequences through one handle that is closed even on error
    # (previously the path was opened a second time by SeqIO.write)
    out_file = "selected.{}".format(file_name)
    with open(out_file, "w") as W:
        SeqIO.write(to_return, W, data_type)
    print(" {} sequences copied from file {} to {}".format(
        len(indices_to_get), file_name, out_file))
Esempio n. 7
0
def _iter_decoded_lines(handle):
    """ Yield each line of a binary file handle as text, line ending removed """
    for raw_line in handle:
        yield raw_line.decode().rstrip("\r\n")


def _collect_fasta_metadata(lines):
    """ Count header and sequence characters for every fasta record

    :param lines: (Iterable[str])   Text lines without line endings
    :return: (Tuple[Dict[str, Tuple[Counter, Counter]], int])
        Metadata keyed by header, and the number of records read
    """
    record_metadata = {}
    num_records = 0
    header = None
    sequence_parts = []
    for line in lines:
        if line.startswith(">"):
            if header is not None:
                record_metadata[header] = (Counter(header),
                                           Counter("".join(sequence_parts)))
            header = line.strip(">")
            sequence_parts = []
            num_records += 1
        elif header is not None:
            # Lines before the first header belong to no record and are skipped
            sequence_parts.append(line)
    if header is not None:
        record_metadata[header] = (Counter(header),
                                   Counter("".join(sequence_parts)))
    return record_metadata, num_records


def _collect_fastq_metadata(lines):
    """ Count header, sequence and quality characters for every fastq record

    Records are read structurally (header, sequence lines up to the "+" line,
    then as many quality characters as sequence characters) so that quality
    lines starting with "@" are not mistaken for headers.

    :param lines: (Iterable[str])   Text lines without line endings
    :return: (Tuple[Dict[str, Tuple[Counter, Counter, Counter]], int])
        Metadata keyed by header, and the number of records read
    :raises ValueError: if a record does not start with "@" or lacks a "+" line
    """
    record_metadata = {}
    num_records = 0
    lines = iter(lines)
    for line in lines:
        if not line:
            # Tolerate blank lines between records / at end of file
            continue
        if not line.startswith("@"):
            raise ValueError(
                "Expected fastq header starting with '@', found %r" % line)
        header = line.strip("@")
        sequence_parts = []
        # The inner loop advances the same iterator as the outer loop
        for body_line in lines:
            if body_line.startswith("+"):
                break
            sequence_parts.append(body_line)
        else:
            raise ValueError(
                "No '+' separator found for fastq record %r" % header)
        sequence = "".join(sequence_parts)
        quality_parts = []
        remaining = len(sequence)
        while remaining > 0:
            quality_line = next(lines, None)
            if quality_line is None:
                # Truncated file: keep whatever quality data was present
                break
            quality_parts.append(quality_line)
            remaining -= len(quality_line)
        record_metadata[header] = (Counter(header), Counter(sequence),
                                   Counter("".join(quality_parts)))
        num_records += 1
    return record_metadata, num_records


def summarize_file(file_name, view):
    """ Outputs character count summary of file

    Prints the general summary, followed by either the short (``"s"``) or the
    long (``"l"``) per-character report, depending on the corrected view.
    Files that are neither fasta nor fastq are reported with zero records.

    :param view: (str)          User-passed view value
    :param file_name: (str)     Name of file for which to gather character data
    """
    # Corrected view name and data type
    data_type = BioOps.get_type(file_name)
    view = _view_corrector(view)
    record_metadata = {}
    num_records = 0
    # Read as bytes and decode per line so both branches work on text;
    # the context manager closes the file even if parsing fails
    with open(file_name, "rb") as R:
        lines = _iter_decoded_lines(R)
        if data_type == "fasta":
            record_metadata, num_records = _collect_fasta_metadata(lines)
        elif data_type == "fastq":
            record_metadata, num_records = _collect_fastq_metadata(lines)
    # Output general summary
    _summary_all(num_records, file_name)
    # Output based on user-passed value
    if view == "s":
        _summary_short(record_metadata)
    elif view == "l":
        _summary_long(record_metadata)
Esempio n. 8
0
def remove_ambiguity_from_file(file_name):
    """ Removes IUPAC ambiguity codes from the sequences of a file, in place

    Every occurrence (upper or lower case) of R, Y, W, S, M, K, H, B, V, D
    or N is dropped from each sequence, and the file is overwritten with the
    cleaned records.

    :param file_name: (str)     Name of file to edit
    """
    ambiguous = frozenset("RYWSMKHBVDN")
    data = []
    data_type = BioOps.get_type(file_name)
    records_in_file = BioOps.parse_large(file_name, data_type)
    for record in records_in_file:
        # Build new sequence without the ambiguous characters
        new_seq = "".join(
            val for val in record.seq if val.upper() not in ambiguous)
        # Try DNA sequence type first
        try:
            record.seq = Seq(new_seq, IUPAC.unambiguous_dna)
        # Fall back to the RNA alphabet otherwise. Only ordinary errors are
        # caught so that KeyboardInterrupt/SystemExit still propagate.
        # NOTE(review): unclear which failure the fallback is meant to cover;
        # confirm that the assignment above can actually raise.
        except Exception:
            record.seq = Seq(new_seq, IUPAC.unambiguous_rna)
        data.append(record)
    SeqIO.write(data, file_name, data_type)
Esempio n. 9
0
    def make_idx_for_directory_file_names(directory_name):
        """ Make index file for filenames in a directory

        Writes one tab-separated line per directory entry: the entry name,
        then a generated name (``IndexCreator._line_edit(16)``) with the
        entry's data type as extension. The index file is created relative to
        the current working directory.

        :param directory_name: (str)    Name of directory with files to rename
        :return: None
        """
        # NOTE(review): os.path.dirname() yields "" for a name without a
        # trailing separator (e.g. "reads"), so the index is then named after
        # IndexExtensions.IDX_FILE alone -- confirm this is intended.
        index_path = os.path.join(
            os.getcwd(),
            os.path.dirname(directory_name) + IndexExtensions.IDX_FILE)
        # Context manager closes the index even if listing or type detection
        # fails part-way through
        with open(index_path, "wb") as W:
            # Sorted so the index content is in a reproducible order
            for _file in sorted(os.listdir(directory_name)):
                data_type = BioOps.get_type(_file)
                W.write(("%s\t%s\n" % (_file, IndexCreator._line_edit(16) +
                                       "." + data_type)).encode())
Esempio n. 10
0
def _summary_long(record_metadata):
    """ Print a per-record report of the characters found in a file

    Records are listed in sorted id order. For each one the header and
    sequence character counts are printed; a third counter, when present, is
    printed as quality scores converted through ``BioOps.calculate_phred``.

    :param record_metadata: (Dict[str, Tuple[Counter, ...]])    Dict with file data
    """
    for record_id in sorted(record_metadata):
        counters = record_metadata[record_id]
        print("Id: %s" % record_id)
        print("Values found in header:")
        for symbol, count in counters[0].items():
            print("# %s: %i" % (symbol, count))
        print("Values found in sequences:")
        for symbol, count in counters[1].items():
            print("# %s: %i" % (symbol, count))
        # Only records carrying a third counter have quality data
        if len(counters) != 3:
            continue
        print("Values found in quality scores:")
        for symbol, count in counters[2].items():
            print("# %s: %i" % (BioOps.calculate_phred([symbol])[0], count))
Esempio n. 11
0
def _summary_short(record_metadata):
    """ Print a short summary with character counts totalled over all records

    Header and sequence counts are always printed. Quality score counts are
    printed only when at least one record carries a third counter; quality
    characters are converted through ``BioOps.calculate_phred``.

    :param record_metadata: (Dict[str, Tuple[Counter, ...]])    Dict with file data
    """
    all_header_metadata = Counter()
    all_sequence_metadata = Counter()
    all_quality_metadata = Counter()
    for met_tuple in record_metadata.values():
        all_header_metadata += met_tuple[0]
        all_sequence_metadata += met_tuple[1]
        if len(met_tuple) == 3:
            all_quality_metadata += met_tuple[2]
    print("Values found in headers:")
    for k, v in all_header_metadata.items():
        print("# %s: %i" % (k, v))
    print("Values found in sequences:")
    for k, v in all_sequence_metadata.items():
        print("# %s: %i" % (k, v))
    if all_quality_metadata:
        print("Values found in quality scores:")
        # Report the quality counts here (the sequence counts were
        # previously printed again by mistake)
        for k, v in all_quality_metadata.items():
            print("# %s: %i" % (BioOps.calculate_phred([k])[0], v))