Ejemplo n.º 1
0
def fasta_counter(fasta_file):
    """Count the entries in a FASTA protein database.

    The counts are printed to the console and appended to
    ``fasta_utilities.log`` in the same folder as ``fasta_file``.

    Args:
        fasta_file: path of the FASTA file to count.

    Returns:
        None.

    NOTE: sequences are read with error checking turned off, and no
    duplicate-accession check is performed.
    """
    folder, fasta_name = os.path.split(fasta_file)

    # the log file mirrors the screen output; "with" guarantees that it is
    # closed even if reading the database fails part way through
    with open(os.path.join(folder, 'fasta_utilities.log'), 'a') as log_obj:
        write = [None, log_obj]  # file=None makes print() use the console
        fasta_lib.time_stamp_logfile('\n>>> starting: count_fasta.py', log_obj)

        # create instances of reader object and protein object, initialize counters
        f = fasta_lib.FastaReader(fasta_file)
        p = fasta_lib.Protein()
        prot = 0
        head = 0

        # read proteins until EOF; NOTE: checking for errors slows program by factor of 3-4
        while f.readNextProtein(p, check_for_errs=False):

            # count protein sequences
            prot += 1
            if (prot % 500000) == 0:
                print('......(%s proteins read...)' % ("{0:,d}".format(prot), ))

            # count number of header elements (extra headers are separated
            # by Control-A characters within the description)
            head += p.description.count(chr(1)) + 1

        # print results and return
        for obj in write:
            print('...there are %s proteins in %s' %
                  ("{0:,d}".format(prot), fasta_name),
                  file=obj)
            if head > prot:
                print('...there were %s header lines' % ("{0:,d}".format(head), ),
                      file=obj)

        fasta_lib.time_stamp_logfile('>>> ending: count_fasta.py', log_obj)
    return
Ejemplo n.º 2
0
# browse to the database
database = r"C:\Xcalibur\database"
if not os.path.exists(database):
    database = os.getcwd()
fasta_file = fasta_lib.get_file(database, [('FASTA files', '*.fasta')],
                                'Select a TriTryp FASTA database')
if fasta_file == '': sys.exit()  # cancel button response

# build new database name
new_fasta_file = os.path.basename(fasta_file)
new_fasta_file = new_fasta_file.replace('.fasta', '_fixed.fasta')
new_fasta_file = os.path.join(os.path.dirname(fasta_file), new_fasta_file)

# initializations
proteins = []
p = fasta_lib.Protein()
pcount = 0
stop_count = 0
gap_count = 0
no_met = 0

# read the sequences into a list
f = fasta_lib.FastaReader(fasta_file)
while f.readNextProtein(p, check_for_errs=True):
    pcount += 1

    # parse the description string into a dictionary
    try:
        items = [x.strip() for x in p.description.split('|') if x]
        header_dict = {x.split('=')[0]: x.split('=')[1] for x in items}
        new_desc = []
def main(taxon_dict):
    """Extract entries by taxon ID from both the Sprot and Trembl databases.

    The user is prompted for a UniProt download folder. Every
    ``uniprot_*.gz`` file in that folder is read, and each protein whose
    species maps to a taxon number in ``taxon_dict`` is written to
    ``uniprot_<version>_<name>.fasta`` in the same folder. Progress and
    statistics are printed to the console and appended to
    ``fasta_utilities.log`` in that folder.

    Args:
        taxon_dict: maps taxon numbers to the name tag used in the output
            file name. It is passed to ``fasta_lib.expand_species`` when
            EXPAND_GROUPS is set.

    Returns:
        None. Calls ``sys.exit()`` if the folder dialog is cancelled.
    """
    print(
        '============================================================================='
    )
    print(
        ' uniprot_extract_from_both.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 '
    )
    print(
        '============================================================================='
    )

    # get the UniProt folder and then get the sprot and trembl database names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_folder = fasta_lib.get_folder(
        default, title_string='Select a UniProt download folder')
    if uniprot_folder == '': sys.exit()  # cancel button response

    version = uniprot_folder.split('_')[-1]
    uniprot_db = 'uniprot'
    DB = [
        os.path.join(uniprot_folder, fname)
        for fname in os.listdir(uniprot_folder)
        if fname.startswith('uniprot_') and fname.endswith('.gz')
    ]
    if len(DB) != 2:
        print('WARNING: either sprot or trembl DB was missing')

    # create a log file to mirror screen output
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    taxon_files = {}  # output name tag -> open output file object
    try:
        write = [None, log_obj]  # file=None makes print() use the console
        fasta_lib.time_stamp_logfile(
            '\n>>> starting: uniprot_extract_from_both.py', log_obj)

        # make the smaller uniprot dictionaries (only the name map is used here)
        sci_to_taxon, _ = fasta_lib.make_uniprot_to_taxon(uniprot_folder)

        # make the more complete dictionary
        name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

        # print the list of taxon numbers that will be extracted
        # NOTE: Any taxon numbers present in analysis text file will not be expanded.
        taxon_list = sorted(taxon_dict.items())
        for obj in write:
            print('...extracting these taxon numbers:', file=obj)
            for i, t in enumerate(taxon_list):
                print('......(%s) taxon %s to file tagged with "%s"' %
                      (i + 1, t[0], t[1]),
                      file=obj)

        # expand any group taxon numbers
        if EXPAND_GROUPS:
            fasta_lib.expand_species(uniprot_folder, 'uniprot', taxon_dict,
                                     MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

        # initialize the combined counters and open one output file per name
        # tag (several taxon numbers may share the same tag)
        taxon_count, name_count = {}, {}
        for taxon, name in taxon_dict.items():
            taxon_count[taxon] = 0
            name_count[name] = 0
            if name not in taxon_files:
                fname = uniprot_db + '_' + version + '_' + name + '.fasta'
                taxon_files[name] = open(
                    os.path.join(uniprot_folder, fname), 'w')

        # loop over both databases and extract species
        duplicates = {}
        for uniprot_file in DB:
            db_name = os.path.split(uniprot_file)[1]
            prot_read = 0
            not_found = 0

            # per-database counts of extracted sequences
            name_counter = dict.fromkeys(taxon_dict.values(), 0)
            number_counter = dict.fromkeys(taxon_dict, 0)

            # create a FastaReader object and start reading
            x = fasta_lib.FastaReader(uniprot_file)
            prot = fasta_lib.Protein()
            for obj in write:
                print('...reading %s and extracting entries...' % (db_name, ),
                      file=obj)

            # NOTE: checking for errors will slow program execution, use if needed
            while x.readNextProtein(prot, check_for_errs=False):
                prot_read += 1
                if (prot_read % 500000) == 0:
                    print('......(%s proteins read...)' %
                          ("{0:,d}".format(prot_read), ))
                (spec_id,
                 spec_name) = fasta_lib.uniprot_parse_line(prot.accession +
                                                           ' ' +
                                                           prot.description)
                taxon = sci_to_taxon.get(spec_name, 0)  # first choice mapping
                taxon2 = name_to_taxon.get(spec_name, 0)  # alternative mapping
                if taxon == 0:  # first choice not present
                    if taxon2 == 0:
                        not_found += 1
                    else:
                        taxon = taxon2  # use second choice
                elif (taxon != taxon2) and (taxon2 > 0):
                    # keep track of names that map to multiple taxon numbers
                    duplicates[spec_name] = (taxon, taxon2)

                if taxon_dict.get(taxon, False):
                    if CLEAN_ACCESSIONS:
                        prot.parseUniProt()

                    # taxon number matches, so write the protein to respective output file
                    name = taxon_dict[taxon]
                    name_counter[name] += 1
                    name_count[name] += 1
                    taxon_count[taxon] += 1
                    number_counter[taxon] += 1
                    prot.printProtein(taxon_files[name])

            # print extraction stats for each database
            numbers = sorted(number_counter)
            db_names = sorted(name_counter)
            for obj in write:
                # report the database that was just read (not always the first one)
                print('...%s protein entries in %s' %
                      ("{0:,d}".format(prot_read), db_name),
                      file=obj)
                print('...%s proteins had unknown taxon numbers' %
                      ("{0:,d}".format(not_found), ),
                      file=obj)
                if VERBOSE:
                    for j, number in enumerate(numbers):
                        if number_counter[number] > 0:
                            print('......(%s) taxon %s had %s proteins' %
                                  (j + 1, number, "{0:,d}".format(
                                      number_counter[number])),
                                  file=obj)
                for j, name in enumerate(db_names):
                    print('......(%s) %s %s proteins extracted' %
                          (j + 1, "{0:,d}".format(name_counter[name]), name),
                          file=obj)

        # print list of mis-matched taxon number warnings
        if MISMATCHES:
            for i, (name, pair) in enumerate(duplicates.items()):
                for obj in write:
                    print('......(%s) WARNING: %s and %s map to "%s"' %
                          (i + 1, pair[0], pair[1], name),
                          file=obj)

        # print out the final summary stuff; the name list is taken from the
        # combined counter so that it exists even when no database was read
        numbers = sorted(taxon_count)
        names = sorted(name_count)
        for obj in write:
            if VERBOSE:
                print('...combined taxon counts...', file=obj)
                for i, number in enumerate(numbers):
                    if taxon_count[number] > 0:
                        print(
                            '......(%s) taxon %s had %s proteins' %
                            (i + 1, number,
                             "{0:,d}".format(taxon_count[number])),
                            file=obj)
            print('...combined output file counts...', file=obj)
            for i, name in enumerate(names):
                print('......(%s) %s total proteins written to %s' %
                      (i + 1, "{0:,d}".format(name_count[name]),
                       uniprot_db + '_' + version + '_' + name + '.fasta'),
                      file=obj)

        fasta_lib.time_stamp_logfile(
            '>>> ending: uniprot_extract_from_both.py', log_obj)
    finally:
        # always release the extracted database files and the log file
        for f in taxon_files.values():
            f.close()
        log_obj.close()
    return
    def make_fasta_files(self, uniprot_dir_path, entry):
        """Write the canonical (and, when present, "all") FASTA files for one entry.

        The downloaded FASTA files of ``entry`` are read and their proteins are
        written to ``<...>_canonical.fasta``; when exactly two downloads are
        present, a second ``<...>_all.fasta`` file receives the proteins of both.
        Per-database counts are printed, and each new file is then passed to
        ``self.database_processing`` together with ``self.contams_database``.
        """
        # Dated names of the downloads that look like FASTA files, minus banned ones
        dated_names = [
            "{}_{}".format(self.date, name)
            for name in entry.ftp_download_list
            if 'fasta' in name.lower()
        ]
        fasta_files = sorted(
            [name for name in dated_names if not self.banned_file(name)])

        # The first (sorted) download always feeds the canonical output file
        canonical_name = (fasta_files[0].replace('.fasta.gz', '') + '_' +
                          entry.short_name + '_canonical.fasta')
        combined_files = [canonical_name]
        writers = [open(os.path.join(uniprot_dir_path, canonical_name), 'w')]

        # A second download means an additional "all" output file
        if len(fasta_files) == 2:
            all_name = (fasta_files[1].replace('_additional.fasta.gz', '') +
                        '_' + entry.short_name + '_all.fasta')
            writers.append(
                open(os.path.join(uniprot_dir_path, all_name), 'w'))
            combined_files.append(all_name)

        # Set up to read the fasta file entries
        print('proteome:', entry.proteome_ID, 'species:', entry.species_name)
        protein = fasta_lib.Protein()

        # Read entries and write to new file(s), counting as we go
        for position, fasta in enumerate(fasta_files):
            total = swissprot = trembl = isoforms = 0
            source_path = os.path.join(uniprot_dir_path,
                                       entry.download_folder_name, fasta)
            reader = fasta_lib.FastaReader(source_path)
            while reader.readNextProtein(protein, False):
                total += 1
                if protein.accession.startswith('sp|'):
                    swissprot += 1
                if protein.accession.startswith('tr|'):
                    trembl += 1
                if ('-' in protein.accession) or ('Isoform of'
                                                  in protein.description):
                    isoforms += 1

                # first database goes to every output, later ones to their own
                if position != 0:
                    protein.printProtein(writers[position])
                else:
                    for writer in writers:
                        protein.printProtein(writer)

            # Print stats
            print('...database:', fasta)
            formatted = tuple("{0:,}".format(count)
                              for count in (total, swissprot, trembl, isoforms))
            print(
                '......tot_count: %s, sp count: %s, tr count: %s, isoform count: %s'
                % formatted)

        # Close output file(s)
        for writer in writers:
            writer.close()

        # chdir into the dated UniProt folder before the follow-up processing
        target_dir = os.path.join(self.abs_download_path,
                                  r"UniProt_{}".format(self.date))
        os.chdir(target_dir)

        # Add forward/reverse/contams
        for new_name in combined_files:
            self.database_processing(new_name, self.contams_database)
def main(fasta_file):
    """Check a FASTA protein database for identical duplicate sequences.

    Proteins that share both sequence length and molecular weight with an
    earlier protein are reported as possible duplicates in ``duplicates.txt``
    (written next to ``fasta_file``). Those candidates are then compared by
    sequence and the exact matches are appended to the same file.

    Args:
        fasta_file: path of the FASTA file to check.

    Returns:
        (candidates, index): ``candidates`` is a list holding a copy of every
        protein involved in a possible duplication, and ``index`` maps an
        accession to its position in ``candidates``.
    """
    print(
        '======================================================================='
    )
    print(
        ' check_for_duplicates.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 '
    )
    print(
        '======================================================================='
    )

    # set up the output file names, etc.
    folder = os.path.split(fasta_file)[0]
    out_file = os.path.join(folder, 'duplicates.txt')

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    head = 0
    dup = 0
    first_seen = {}  # (length, MW) -> accession of first protein with that pair
    conflict = {}    # candidate accession -> accession it may duplicate
    to_save = {}     # accessions whose proteins must be kept for comparison

    # read proteins until EOF; the candidate pairs are recorded directly
    # instead of being parsed back out of the report file afterwards
    with open(out_file, 'w') as out_obj:
        while f.readNextProtein(p, check_for_errs=False):
            prot += 1
            head += p.description.count(chr(1)) + 1
            dup_data = (p.seqlenProtein(), p.molwtProtein())
            duplicate = first_seen.get(dup_data, False)
            if duplicate:
                dup += 1
                print('...WARNING: (protein no. %s) %s may be same as %s' %
                      (prot, p.accession, duplicate),
                      file=out_obj)
                conflict[p.accession] = duplicate
                to_save[p.accession] = True
                to_save[duplicate] = True
            else:
                first_seen[dup_data] = p.accession

    # print result
    print('...there are %s proteins in %s' %
          (prot, os.path.split(fasta_file)[1]))
    if head > prot:
        print('...there were %s header lines...' % (head, ))
    print('...there were %s possible duplicates...' % (dup, ))

    # read in and save the proteins that might be duplicates of each other
    candidates = []
    index = {}
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    while f.readNextProtein(p, check_for_errs=False):
        if to_save.get(p.accession, False):
            index[p.accession] = len(candidates)
            candidates.append(copy.deepcopy(p))
    if len(candidates) == 0:
        print('to_save_dictionary:', to_save)
        print('bailing out in middle')
        return (candidates, index)

    # look deeper to see if candidates are actually duplicates; the report
    # file is closed by the "with" block once the exact matches are written
    exact_dup = 0
    with open(out_file, 'a') as out_obj:
        print('\n========================================\n', file=out_obj)
        for acc in sorted(conflict):
            p_dup = candidates[index[acc]]
            p_ref = candidates[index[conflict[acc]]]
            if p_ref.sequence == p_dup.sequence:
                exact_dup += 1
                print('...(%s) WARNING: %s exact match to %s' %
                      (exact_dup, p_ref.accession, p_dup.accession),
                      file=out_obj)
    print('...number of exact matches was', exact_dup)

    return (candidates, index)
Ejemplo n.º 6
0
def main(fasta_file):
    """Write a non-redundant copy of a FASTA protein database.

    Proteins that share sequence length and molecular weight with an earlier
    protein are treated as candidate duplicates. ``find_identities`` then
    decides which accessions to skip, and every other protein is written to
    ``<name>_nonredun.fasta``. Analysis results are appended to
    ``duplicates.txt`` in the folder of ``fasta_file``.

    Args:
        fasta_file: path of the FASTA file to process.

    Returns:
        None.
    """
    print(
        '====================================================================')
    print(
        ' remove_duplicates.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print(
        '====================================================================')
    print('\n   Analysis results will be written to "duplicates.txt"')

    # set up the output file names, etc.
    folder = os.path.split(fasta_file)[0]
    out_file = os.path.join(folder, 'duplicates.txt')

    # swap a recognized extension for "_nonredun.fasta"; only the suffix is
    # touched, and unrecognized names simply get the new ending appended
    nr_database = None
    for end in ['.fasta', '.fasta.gz', '.fa.gz']:
        if fasta_file.endswith(end):
            nr_database = fasta_file[:-len(end)] + '_nonredun.fasta'
            break
    if (not nr_database) or (nr_database == fasta_file):
        nr_database = fasta_file + '_nonredun.fasta'

    # both output files are closed by the "with" block, even on errors
    with open(out_file, 'a') as out_obj, open(nr_database, 'w') as nr_obj:
        fasta_lib.time_stamp_logfile('\n>>> starting: check_for_duplicates.py',
                                     out_obj)

        # create instances of reader object and protein object, initialize counters
        f = fasta_lib.FastaReader(fasta_file)
        p = fasta_lib.Protein()
        prot = 0
        candidates = {}  # dictionary of acc:protein lists
        conflicts = {}  # keeps track of seq len and MW

        # read proteins until EOF
        while f.readNextProtein(p, check_for_errs=False):
            prot += 1
            dup_data = (p.seqlenProtein(), p.molwtProtein())
            duplicate = conflicts.get(dup_data, False)
            if duplicate:
                # same length and MW as an earlier protein: save for testing
                candidates.setdefault(duplicate, []).append(copy.deepcopy(p))
            else:
                conflicts[dup_data] = p.accession

        # snapshot of the accessions whose candidates need an identity test
        to_test = set(candidates)

        # copy proteins to "nr" file, checking for duplicates
        dup = 0
        skip = {}
        f = fasta_lib.FastaReader(fasta_file)
        p = fasta_lib.Protein()
        print('Processing:', fasta_file, file=out_obj)  # header line to log file
        while f.readNextProtein(p, check_for_errs=False):
            if p.accession in to_test:
                dup += find_identities(p, candidates, skip, fasta_file, out_obj)
            if skip.get(p.accession, False):
                continue
            p.printProtein(nr_obj)

        # report to both the console (file=None) and the log file
        for obj in [None, out_obj]:
            print('\nThere were',
                  prot,
                  'total sequences in:',
                  os.path.basename(fasta_file),
                  file=obj)
            print('There were', dup, 'identical sequences removed\n\n', file=obj)
    return
Ejemplo n.º 7
0
def fasta_checker(fasta_file, write):
    """Check a FASTA file for non-standard amino acid characters.

    Counts sequences that lack a starting Met, contain stop codons, gaps or
    the letters B, J, O, U, X, Z, and counts redundant sequences. The report
    is printed to every destination in ``write``.

    Args:
        fasta_file: path of the FASTA file to check.
        write: iterable of ``print`` destinations (file objects; ``None``
            means the console).

    Returns:
        None.
    """
    for obj in write:
        print("  database:", os.path.basename(fasta_file), file=obj)

    # initializations
    proteins = []
    p = fasta_lib.Protein()

    # counters
    prot_count = 0
    no_start_met = 0
    stop_count = 0
    stop_end = 0
    gap_count = 0
    B_count = 0
    J_count = 0
    O_count = 0
    U_count = 0
    X_count = 0
    Z_count = 0

    # read the sequences into a list
    f = fasta_lib.FastaReader(fasta_file)
    while f.readNextProtein(p, check_for_errs=True):
        prot_count += 1

        # test for odd amino acids, stop codons, gaps
        if not p.sequence.startswith('M'):
            no_start_met += 1
        if p.sequence.endswith('*'):
            stop_end += 1
        if '*' in p.sequence:
            stop_count += 1
        if '-' in p.sequence:
            gap_count += 1
        if 'B' in p.sequence:
            B_count += 1
        if 'J' in p.sequence:
            J_count += 1
        if 'O' in p.sequence:
            O_count += 1
        if 'U' in p.sequence:
            U_count += 1
        if 'X' in p.sequence:
            X_count += 1
        if 'Z' in p.sequence:
            Z_count += 1

        # save the protein in list (the reader reuses the same object)
        proteins.append(copy.deepcopy(p))

    # check for duplicates and count; molecular weight is a cheap pre-filter
    # before the full sequence comparison
    duplicate_count = 0
    mw_dict = {}  # str(MW) -> index of the first protein with that MW
    for i, p in enumerate(proteins):
        mw_key = str(p.molwtProtein())
        # membership test, not truthiness: index 0 is a valid stored value
        if mw_key in mw_dict:
            if p.sequence == proteins[mw_dict[mw_key]].sequence:
                duplicate_count += 1
        else:
            mw_dict[mw_key] = i

    # print out the report of oddball characters
    for obj in write:
        print("  total number of input sequences was:", prot_count, file=obj)
        print("  number of redundant sequences was:", duplicate_count, file=obj)
        print("    translations that do not start with Met:", no_start_met, file=obj)
        print("    translations that ended with a stop codon:", stop_end, file=obj)
        print("    translations that had premature stop codons:", stop_count, file=obj)
        print("    translations that contained gaps:", gap_count, file=obj)
        print("    translations that had B (ambiguous N/D):", B_count, file=obj)
        print("    translations that had J (ambiguous I/L):", J_count, file=obj)
        print("    translations that had O (pyrrolysine):", O_count, file=obj)
        print("    translations that had U (selenocysteine):", U_count, file=obj)
        print("    translations that had X (unknown amino acid):", X_count, file=obj)
        print("    translations that had Z (ambiguous Q/E):", Z_count, file=obj)

    return
def fasta_add_extras(extra_file, fasta_file, output_file):
    """Adds extra sequences and contaminants, and reverses entries in a FASTA database.

    Sequences are written in this order: the entries of extra_file (renamed
    to "EXTRA_dddd"), the contaminant entries (file named by CONTAMS), then
    the entries of fasta_file. Each one goes to a "_for.fasta" file and its
    reversed form (accession tagged with "REV_") goes to a "_rev.fasta" file.
    A concatenated "_both.fasta" file is made when MAKE_SEPARATE_BOTH is set.
    The forward/reversed files are deleted at the end unless
    MAKE_SEPARATE_FORWARD / MAKE_SEPARATE_REVERSED are set.

    extra_file:  FASTA file of extra sequences (read with error checking on).
    fasta_file:  main FASTA database; its folder also receives the log file
                 and is the fallback location for the contaminants file.
    output_file: path used to build the output names (extension is replaced
                 by ".fasta" before the "_for"/"_rev"/"_both" tags are added).

    Returns None. Screen output is mirrored to "fasta_utilities.log".
    """
    decoy_string = 'REV_'   # the string to denote decoy sequences

    print('=========================================================================')
    print(' add_extras_and_reverse.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=========================================================================')

    # open the "forward" and "reversed" output files
    # NOTE(review): str.replace changes every '.fasta' in the path, not just the extension
    _file = os.path.splitext(output_file)[0] + '.fasta'
    for_name = _file.replace('.fasta', '_for.fasta')
    for_file_obj = open(for_name, 'w')
    rev_name = _file.replace('.fasta', '_rev.fasta')
    rev_file_obj = open(rev_name, 'w')

    # create a log file to mirror screen output
    _folder = os.path.split(fasta_file)[0]
    log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]     # file=None makes print() use the console
    fasta_lib.time_stamp_logfile('\n>>> starting: add_extras_and_reverse.py', log_obj)

    # create instances of reader object and protein object
    # add the extra sequences, accessions changed to "EXTRA_dddd"
    # NOTE: can only add up to 9999 sequences without code modification...
    prot = fasta_lib.Protein()
    pcount = 0      # running total of all sequences written to the forward file
    f = fasta_lib.FastaReader(extra_file)

    # turn on error checking for extra sequences
    while f.readNextProtein(prot, check_for_errs=True):
        pcount += 1

        # try to clean up original accessions (parser is picked by accession prefix)
        if CLEAN_ACCESSIONS:
            if prot.accession.startswith('gi|'):
                prot.parseNCBI(REF_SEQ_ONLY)
            elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                prot.parseUniProt(KEEP_UNIPROT_ID)
            elif prot.accession.startswith('IPI:'):
                prot.parseIPI(KEEP_IPI_GENE_ID)
            else:
                pass

        # add old accession to description and make new accession
        # write original sequence to "forward" file and reversed to "reverse"
        # (the description must be updated before new_acc is overwritten)
        prot.new_desc = '[%s] %s' % (prot.new_acc, prot.new_desc)
        prot.new_acc = 'EXTRA_%04d' % (pcount,)
        prot.accession = 'EXTRA_%04d' % (pcount,)
        prot.printProtein(for_file_obj)
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)
    for obj in write:
        print('...there were %s extra sequences in %s' % (pcount, os.path.split(extra_file)[1]), file=obj)

    # now add the contaminants
    # CONTAMS is used as given if that path exists, otherwise it is looked
    # for in the folder of fasta_file
    try:
        if os.path.exists(CONTAMS):
            contams_file = CONTAMS
        else:
            path = os.path.split(fasta_file)[0]
            contams_file = os.path.join(path, CONTAMS)
        f = fasta_lib.FastaReader(contams_file)
        contams = 0
        while f.readNextProtein(prot, check_for_errs=True):
            pcount += 1
            contams += 1
            if CLEAN_ACCESSIONS:
                prot.parseCONT()

            # write sequences to respective files
            prot.printProtein(for_file_obj)
            rev = prot.reverseProtein(decoy_string)
            rev.printProtein(rev_file_obj)
        for obj in write:
            print('...there were %s contaminant entries in %s' % (contams, contams_file), file=obj)
    # NOTE(review): the bare except reports ANY failure in the block above
    # (including parse or write errors) as "not found" -- confirm this
    # best-effort behavior is intended before narrowing it
    except:
        for obj in write:
            print('...WARNING:', CONTAMS, 'not found!', file=obj)

    # read proteins, clean up accessions, descriptions until EOF
    # write proteins to "forward" and "reversed" files
    f = fasta_lib.FastaReader(fasta_file)

    # checking for errors can slow program execution by factor of 3-4
    # Reading and writing sequences will always remove spaces and blank lines
    while f.readNextProtein(prot, check_for_errs=False):
        pcount += 1
        if CLEAN_ACCESSIONS:
            if prot.accession.startswith('gi|'):
                prot.parseNCBI(REF_SEQ_ONLY)
            elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                prot.parseUniProt(KEEP_UNIPROT_ID)
            elif prot.accession.startswith('IPI:'):
                prot.parseIPI(KEEP_IPI_GENE_ID)
            else:
                pass

        prot.printProtein(for_file_obj)    # write to "forward" file
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)   # write to "reversed" file

    # make concatenated output file if desired and print summary stats
    if MAKE_SEPARATE_BOTH:
        both_name = _file.replace('.fasta', '_both.fasta')
        both_file_obj = open(both_name, 'w')
        # close and reopen the forward/reversed files for reading so their
        # contents can be copied (forward first, then reversed)
        for_file_obj.close()
        for_file_obj = open(for_name, 'r')
        rev_file_obj.close()
        rev_file_obj = open(rev_name, 'r')
        while True:
            line = for_file_obj.readline()
            if not line: break
            both_file_obj.write(str(line))
        while True:
            line = rev_file_obj.readline()
            if not line: break
            both_file_obj.write(str(line))
        both_file_obj.close()
        for obj in write:
            print('...%s total proteins written to %s' % (2*pcount, os.path.split(both_name)[1]), file=obj)

    if MAKE_SEPARATE_FORWARD:
        for obj in write:
            print('...%s proteins written to %s' % (pcount, os.path.split(for_name)[1]), file=obj)
    if MAKE_SEPARATE_REVERSED:
        for obj in write:
            print('...%s proteins reversed and written to %s' % (pcount, os.path.split(rev_name)[1]), file=obj)

    # close files and delete unwanted files
    # (files must be closed before os.remove is attempted)
    for_file_obj.close()
    rev_file_obj.close()
    fasta_lib.time_stamp_logfile('>>> ending: add_extras_and_reverse.py', log_obj)
    log_obj.close()
    if not MAKE_SEPARATE_FORWARD:
        os.remove(for_name)
    if not MAKE_SEPARATE_REVERSED:
        os.remove(rev_name)
    return
Ejemplo n.º 9
0
def main(fasta_file, up_one=False):
    """Process one Ensembl FASTA file: reformat description lines and check sequences.

    Duplicate accessions are skipped, sequences are truncated at the first
    stop codon ("*"), and notes about unusual features are appended to the
    descriptions. The cleaned proteins are written to a new
    ``*_fixed.fasta`` file and a summary report is printed.

    Args:
        fasta_file: path of the Ensembl FASTA file to process.
        up_one: if True the new file is written to the parent of the folder
            containing ``fasta_file``; otherwise to the same folder.

    Returns:
        The path of the new FASTA file, or False if a new file name could
        not be built from ``fasta_file``.
    """
    # create the new database name
    original_fasta_file = os.path.basename(fasta_file)
    new_fasta_file = original_fasta_file.replace('.fasta', '_fixed.fasta')
    if new_fasta_file == original_fasta_file:
        new_fasta_file = original_fasta_file.replace('.fa', '_fixed.fasta')
    if new_fasta_file == original_fasta_file:
        print('WARNING! creating new file name failed')
        print('...make sure database is not compressed')
        return False
    if new_fasta_file.endswith('.gz'):
        new_fasta_file = new_fasta_file[:-3]
    if up_one:
        folder_name = os.path.dirname(os.path.dirname(fasta_file))
    else:
        folder_name = os.path.dirname(fasta_file)
    new_fasta_file = os.path.join(folder_name, new_fasta_file)

    # initializations
    proteins = []
    accessions = {}
    p = fasta_lib.Protein()
    pcount = 0      # sequence count
    dup_count = 0   # duplicate accession count
    stop_count = 0  # "*"
    gap_count = 0   # "-"
    no_met = 0      # does not start with M
    X_count = 0     # unknown AA
    B_count = 0     # N or D
    Z_count = 0     # Q or E
    J_count = 0     # I or L
    U_count = 0     # selenocysteine

    # set up the list of possible tags in header lines
    # this should probably be generalized somehow...
    all_tags = ['pep:', 'pep scaffold:', 'pep genescaffold:', 'pep chromosome:', 'pep contig:',
                'pep reftig:', 'pep supercontig:', 'pep ultracontig:', 'pep group:',
                'gene:', 'transcript:', 'gene_biotype:',
                'transcript_biotype:', 'gene_symbol:', 'description:']

    # read the sequences into a list
    f = fasta_lib.FastaReader(fasta_file)
    while f.readNextProtein(p, check_for_errs=True):
        pcount += 1

        # check if accession already seen
        if p.accession in accessions:
            dup_count += 1
            accessions[p.accession] += 1
            print('...WARNING: skipping duplicate accession:', p.accession)
            continue
        accessions[p.accession] = 1

        # clean up the description string
        p.new_desc = parse_ensembl_header_line(p.description, all_tags)

        # test for odd amino acids, stop codons, gaps
        if not p.sequence.startswith('M'):
            no_met += 1
            p.new_desc = p.new_desc + ' (No starting Met)'
        if '*' in p.sequence:
            stop_count += 1
            cut = p.sequence.index('*')
            string = ' (Premature stop %s/%s)' % (cut, len(p.sequence))
            p.new_desc = p.new_desc + string
            # keep only the residues before the stop; the remaining tests
            # below therefore see the truncated sequence
            p.sequence = p.sequence[:cut]
        if '-' in p.sequence:
            gap_count += 1
            p.new_desc = p.new_desc + ' (has gaps)'
        if 'B' in p.sequence:
            B_count += 1
            p.new_desc = p.new_desc + ' (has B)'
        if 'Z' in p.sequence:
            Z_count += 1
            p.new_desc = p.new_desc + ' (has Z)'
        if 'J' in p.sequence:
            J_count += 1
            p.new_desc = p.new_desc + ' (has J)'
        if 'U' in p.sequence:
            U_count += 1
            p.new_desc = p.new_desc + ' (has U)'
        if 'X' in p.sequence:
            X_count += 1
            p.new_desc = p.new_desc + ' (has unknown X)'

        # save the protein in list (the reader reuses the same object)
        proteins.append(copy.deepcopy(p))

    # open the new protein fasta file and write out the proteins
    fixcount = 0
    with open(new_fasta_file, 'w') as file_obj:
        for p in proteins:
            if len(p.sequence) > 0:
                p.printProtein(file_obj)
                fixcount += 1  # count only the sequences actually written
            else:
                print('   empty sequence (stop codon at start):', p.accession)

    # print out the report of oddball characters
    print("   Ensembl database:", os.path.basename(fasta_file))
    print("   translations that do not start with Met:", no_met)
    print("   translations that have premature stop codons:", stop_count)
    print("   translations that contain gaps:", gap_count)
    print("   translations that contain X (unknowns):", X_count)
    print("   translations that contain B:", B_count)
    print("   translations that contain Z:", Z_count)
    print("   translations that contain J:", J_count)
    print("   translations that contain U:", U_count)
    print("   total number of input sequences was:", pcount)
    print("   total number of sequences written was:", fixcount)
    print("   number of duplicate accessions was:", dup_count)

    return new_fasta_file
Ejemplo n.º 10
0
def fasta_digester(fasta_file, enzyme='trypsin', log=None):
    """Digest a FASTA protein database in silico and count peptide sequences.

    Every protein read from ``fasta_file`` is digested with the cleavage rule
    selected by ``enzyme``.  I and L residues are then masked to ``j`` in each
    peptide so that I/L variants of a sequence are counted together.

    Args:
        fasta_file: path of the FASTA database handed to fasta_lib.FastaReader.
        enzyme: case-insensitive enzyme name.  Recognized names are No_enzyme,
            trypsin, trypsin-P, Lys-C, Lys-C-P, Lys-N, Arg-C, Asp-N, CNBr,
            Glu-C, PepsinA and chymotrypsin.  Any other name prints a warning
            and ``None`` is passed to the digestion routine as the regex.
        log: iterable of output streams that receive the final summary; a
            ``None`` entry makes ``print`` use the console.  Defaults to the
            console only.

    Returns:
        dict mapping each I/L-masked peptide sequence to the number of times
        it was produced over all proteins in the database.
    """
    # a list literal as the default would be one object shared by every call
    if log is None:
        log = [None]

    print('==================================================================')
    print(' fasta_digester.py, v 1.1.3, written by Phil Wilmarth, OHSU, 2017 ')
    print('==================================================================')

    # cleavage patterns keyed by upper-cased enzyme name
    digest_patterns = {
        'NO_ENZYME': r".",
        'TRYPSIN': r".(?:(?<![KR](?!P)).)*",
        'TRYPSIN-P': r".(?:(?<![KR]).)*",
        'LYS-C': r".(?:(?<![K](?!P)).)*",
        'LYS-C-P': r".(?:(?<![K]).)*",
        'LYS-N': r".(?:(?![K]).)*",
        'ARG-C': r".(?:(?<![R](?!P)).)*",
        'ASP-N': r".(?:(?![D]).)*",
        'CNBR': r".(?:(?<![M]).)*",
        'GLU-C': r".(?:(?<![DE](?!P)).)*",
        'PEPSINA': r".(?:(?<![FL](?!P)).)*",
        'CHYMOTRYPSIN': r".(?:(?<![FWYL](?!P)).)*",
    }
    pattern = digest_patterns.get(enzyme.upper())
    if pattern is None:
        print('...WARNING: Enzyme:', enzyme, 'not recognized')
        regex = None
    else:
        regex = re.compile(pattern)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    head = 0
    all_peptides = {}
    mask_il = re.compile(r'[IL]')  # compiled once, reused for every peptide
    print('starting file reading:', time.ctime())

    # read proteins until EOF; NOTE: checking for errors slows program by factor of 3 or 4
    while f.readNextProtein(p, check_for_errs=False):

        # digest protein sequence (regex expression, low mass cutoff, high mass cutoff,
        # minimum peptide length, maximum number of missed cleavages, type of masses)
        p.enzymaticDigest(regex, 500.0, 5000.0, 7, 2, 'mono')

        # tally each peptide under its I/L-masked sequence
        for pep in p.peptides:
            mass_spec_seq = mask_il.sub('j', pep.seq)
            all_peptides[mass_spec_seq] = all_peptides.get(mass_spec_seq, 0) + 1

        # count protein sequences
        prot += 1
        if (prot % 500000) == 0:
            print('......(%s proteins read...)' % (prot, ))

        # count number of header elements (Control-A separates multiple headers)
        control_A = p.description.count(chr(1))
        head = head + control_A + 1

    # print number of proteins/headers and return peptide dictionary
    for obj in log:
        print('There are %s proteins in %s' %
              ("{0:,d}".format(prot), os.path.basename(fasta_file)),
              file=obj)
        if head > prot:
            print('There were %s header lines' % (head, ), file=obj)

    return all_peptides
Ejemplo n.º 11
0
def main(string_dict):
    """Extract FASTA entries whose header lines contain given text strings.

    ``string_dict`` maps each search string to the tag of the output file that
    collects its matches.  The database is chosen through a file dialog; each
    Control-A separated header of every protein is searched with a plain
    substring test (upper-cased on both sides unless CASE_SENSITIVE is set).
    Strings sharing a tag act as a logical OR: a protein is written at most
    once per output file, with a compound header built from all of its
    matching headers.  When CLEAN_ACCESSIONS is set, NCBI ("gi|") and UniProt
    ("sp|"/"tr|") accessions are re-parsed before writing.

    Screen output is mirrored to "fasta_utilities.log" in the database folder.

    Written by Phil Wilmarth, OHSU, 2009.
    """
    rule = '====================================================================='
    print(rule)
    print(
        ' extract_by_string.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 '
    )
    print(rule)

    # choose the folder the file dialog starts in, then ask for the database
    start_dir = r'C:\Xcalibur\database'
    if not os.path.exists(start_dir):
        start_dir = os.getcwd()
    file_types = [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')]
    db_file = fasta_lib.get_file(start_dir, file_types,
                                 title_string='Select a FASTSA database')
    if db_file == '':
        sys.exit()  # dialog was cancelled

    db_folder, db_name = os.path.split(db_file)
    base_name = db_name.replace('.gz', '')
    if not base_name.endswith('.fasta'):
        base_name += '.fasta'

    def tagged_name(tag):
        # output file name: the tag is inserted in front of ".fasta"
        return base_name.replace('.fasta', '_' + tag + '.fasta')

    # log file mirrors the screen output (None in the list means console)
    log_obj = open(os.path.join(db_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: extract_by_string.py',
                                 log_obj)

    # report the search strings in sorted order
    string_list = sorted(string_dict.items())
    for obj in write:
        print('...extracting entries containing these strings:', file=obj)
        for number, pair in enumerate(string_list, start=1):
            print('......(%s) string "%s" to file ending in "%s"' %
                  (number, pair[0], pair[1]),
                  file=obj)

    # zero the counters and work out one output path per distinct tag
    string_count = {}
    name_count = {}
    out_paths = {}
    for text, tag in string_dict.items():
        out_paths[tag] = os.path.join(db_folder, tagged_name(tag))
        string_count[text] = 0
        name_count[tag] = 0

    # open each output file once, even when several strings share its tag
    string_files = {}
    for tag, path in out_paths.items():
        string_files[tag] = open(path, 'w')

    # read the database and collect matches protein by protein
    x = fasta_lib.FastaReader(db_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    for obj in write:
        print('...reading %s and extracting entries...' % (db_name, ),
              file=obj)
    separator = chr(1)  # Control-A joins multiple headers on one '>' line
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if prot_read % 500000 == 0:
            print('......(%s proteins read...)' %
                  ("{0:,d}".format(prot_read), ))

        # rebuild the '>' line and split it into its individual headers
        header = prot.accession + ' ' + prot.description
        if not CASE_SENSITIVE:
            header = header.upper()
        header_parts = header.split(separator)

        # tag -> compound header; guarantees one write per output file
        written = {}
        for pattern, tag in string_dict.items():
            needle = pattern if CASE_SENSITIVE else pattern.upper()
            for part in header_parts:
                if needle not in part:
                    continue
                so_far = written.get(tag, '')
                if so_far:
                    written[tag] = so_far + separator + part
                else:
                    # first hit for this output file: start the header and
                    # credit the pattern that produced it
                    written[tag] = part
                    string_count[pattern] += 1

        # write the protein once to every output file it matched
        for tag, composite in written.items():
            name_count[tag] += 1
            out_file = string_files[tag]

            # first token of the compound header is the accession,
            # everything after the following space is the description
            prot.accession = composite.split()[0]
            prot.new_acc = prot.accession
            prot.description = composite[len(prot.accession) + 1:]
            prot.new_desc = prot.description
            if CLEAN_ACCESSIONS:
                if prot.accession.startswith('gi|'):
                    prot.parseNCBI(REF_SEQ_ONLY)
                elif prot.accession.startswith(('sp|', 'tr|')):
                    prot.parseUniProt(KEEP_UNIPROT_ID)
            prot.printProtein(out_file)

    # all proteins processed: close the output databases
    for out_file in string_files.values():
        out_file.close()

    # summary to console and log
    for obj in write:
        print('...%s protein entries in %s' %
              ("{0:,d}".format(prot_read), db_name),
              file=obj)
        for number, text in enumerate(sorted(string_count), start=1):
            print('......(%s) pattern "%s" was found in %s proteins' %
                  (number, text, "{0:,d}".format(string_count[text])),
                  file=obj)
        print('...output file summaries...', file=obj)
        for number, tag in enumerate(sorted(string_files), start=1):
            print('......(%s) %s proteins extracted and written to %s' %
                  (number, "{0:,d}".format(name_count[tag]), tagged_name(tag)),
                  file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: extract_by_string.py', log_obj)
    log_obj.close()
    return
Ejemplo n.º 12
0
def main(fasta_file, forward=False, reverse=False, both=True, log_obj=None, contam_path=""):
    """Adds contaminants and reverses entries for a FASTA protein database.

    Forward ("_for.fasta") and reversed ("_rev.fasta") databases are always
    written next to ``fasta_file``; contaminant entries (when a contaminants
    database is found) are placed ahead of the database entries in both.  The
    concatenated "_both.fasta" file is built from those two files, and any of
    the two that was not requested is deleted at the end.

    Args:
        fasta_file: path of the FASTA database (a trailing ".gz" is dropped
            when forming the output names).
        forward: keep the forward sequences plus contaminants file.
        reverse: keep the reversed sequences plus reversed contaminants file.
        both: also write the concatenated target/decoy file.
        log_obj: open log file to mirror screen output to; when falsy,
            "fasta_utilities.log" in the database folder is opened in append
            mode.  The log is closed before returning in either case.
        contam_path: optional path of a contaminants FASTA file, or of a
            folder containing the default contaminants file, to use instead
            of searching the current and database folders.

    Returns:
        None.
    """
    decoy_string = 'REV_'   # the string to denote decoy sequences
    ######################################
    # Change default contaminants file name here:
    CONTAMS = 'Thermo_contams.fasta'
    # or pass in a "contams_path"
    ######################################

    # build the output file names from the database name without extension(s)
    if fasta_file.lower().endswith('.gz'):
        stem = os.path.splitext(fasta_file[:-3])[0]
    else:
        stem = os.path.splitext(fasta_file)[0]
    for_name = stem + '_for.fasta'
    rev_name = stem + '_rev.fasta'
    both_name = stem + '_both.fasta'

    # create a log file to mirror screen output
    _folder = os.path.split(fasta_file)[0]
    if not log_obj:
        log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: reverse_fasta.py', log_obj)

    # create instances protein object and initialize counter
    prot = fasta_lib.Protein()
    p_read = 0
    p_contam = 0

    # try to find the contaminants database file
    # If no contam file path provided, look in the current directory and then
    # in the folder of the FASTA file
    contam_file = None
    if not contam_path:
        if os.path.exists(CONTAMS):
            contam_file = CONTAMS
        elif os.path.exists(os.path.join(_folder, CONTAMS)):
            contam_file = os.path.join(_folder, CONTAMS)
    elif os.path.exists(contam_path) and os.path.isfile(contam_path):
        contam_file = contam_path
    elif os.path.isdir(contam_path) and os.path.exists(os.path.join(contam_path, CONTAMS)):
        contam_file = os.path.join(contam_path, CONTAMS)

    # the context manager closes both output files even if reading fails
    with open(for_name, 'w') as for_file_obj, open(rev_name, 'w') as rev_file_obj:

        # create reader and add contaminants (if contams file was found)
        if contam_file:
            f = fasta_lib.FastaReader(contam_file)
            while f.readNextProtein(prot, check_for_errs=True):
                p_contam += 1
                prot.printProtein(for_file_obj)
                rev = prot.reverseProtein(decoy_string)
                rev.printProtein(rev_file_obj)
            for obj in write:
                print('...there were %s contaminant entries in %s' %
                      ("{0:,d}".format(p_contam), os.path.split(contam_file)[1]), file=obj)
        else:
            for obj in write:
                print('...WARNING: contaminants were not added', file=obj)

        # read proteins until EOF and write proteins to "forward" and "reversed" files
        # error checking slows program execution, turn on if needed.
        # Reading and writing sequences always removes spaces and blank lines.
        f = fasta_lib.FastaReader(fasta_file)
        while f.readNextProtein(prot, check_for_errs=False):
            p_read += 1
            prot.printProtein(for_file_obj)    # write to "forward" file
            rev = prot.reverseProtein(decoy_string)
            rev.printProtein(rev_file_obj)   # write to "reversed" file

    for obj in write:
        print('...%s proteins read from %s' %
              ("{0:,d}".format(p_read), os.path.split(fasta_file)[1]), file=obj)

    # make concatenated output file if desired: forward entries, then reversed
    if both:
        with open(both_name, 'w') as both_file_obj:
            for part_name in (for_name, rev_name):
                with open(part_name, 'r') as part_obj:
                    for line in part_obj:
                        both_file_obj.write(line)
        for obj in write:
            print('...%s total proteins written to %s' %
                  ("{0:,d}".format(2*(p_contam+p_read)), os.path.split(both_name)[1]), file=obj)

    if forward:
        for obj in write:
            print('...%s proteins written to %s' %
                  ("{0:,d}".format(p_contam+p_read), os.path.split(for_name)[1]), file=obj)
    if reverse:
        for obj in write:
            print('...%s proteins reversed and written to %s' %
                  ("{0:,d}".format(p_contam+p_read), os.path.split(rev_name)[1]), file=obj)

    # close the log and delete unwanted files (all data files are closed by now)
    fasta_lib.time_stamp_logfile('>>> ending: reverse_fasta.py', log_obj)
    log_obj.close()
    if not forward:
        os.remove(for_name)
    if not reverse:
        os.remove(rev_name)
    return
Ejemplo n.º 13
0
def fasta_counter(fasta_file):
    """Counts entries in a FASTA protein database and tabulates them.

    Reads every protein with the reader's error checking turned on, warns
    about accessions that occur more than once, counts Control-A separated
    header elements, and writes a tab-delimited table (Accession, Length, MW)
    to a TXT file named after the database: a trailing ".fasta" or
    ".fasta.gz" is replaced by ".txt", any other name gets ".txt" appended.
    Screen output is mirrored to "fasta_utilities.log" in the database folder.

    Args:
        fasta_file: path of the FASTA database to count.

    Returns:
        None.
    """
    # construct summary file name based on FASTA name; literal suffix tests
    # avoid the "any character" meaning of an unescaped "." in a regex
    for suffix in ('.fasta.gz', '.fasta'):
        if fasta_file.endswith(suffix):
            summary_file = fasta_file[:-len(suffix)] + '.txt'
            break
    else:
        summary_file = fasta_file + '.txt'

    # create a log file to mirror screen output (None in the list is console)
    _folder = os.path.split(fasta_file)[0]
    with open(os.path.join(_folder, 'fasta_utilities.log'), 'a') as log_obj:
        write = [None, log_obj]
        fasta_lib.time_stamp_logfile('\n>>> starting: count_fasta.py', log_obj)

        # create instances of reader object and protein object, initialize counters
        f = fasta_lib.FastaReader(fasta_file)
        p = fasta_lib.Protein()
        prot = 0
        head = 0
        conflict = {}   # accession -> molecular weight of its first occurrence

        # open summary file and write header
        with open(summary_file, mode='wt') as summary_obj:
            summary_obj.write('Accession\tLength\tMW\n')

            # read proteins until EOF; NOTE: checking for errors slows program by factor of 3-4
            while f.readNextProtein(p, check_for_errs=True):

                # count protein sequences
                prot += 1
                if (prot % 500000) == 0:
                    print('......(%s proteins read...)' % ("{0:,d}".format(prot), ))

                # check for duplicate accession; membership test (not truthiness
                # of the stored weight) so a stored 0.0 still counts as "seen"
                mol_wt = p.molwtProtein(show_errs=False)
                if p.accession in conflict:
                    for obj in write:
                        print('\n...WARNING: %s is already in FASTA database!\n' %
                              (p.accession, ),
                              file=obj)
                        # same weight as the first occurrence hints at a repeated sequence
                        if mol_wt == conflict[p.accession]:
                            print('......possible duplicated sequence...', file=obj)
                else:
                    conflict[p.accession] = mol_wt

                # count number of header elements
                control_A = p.description.count(chr(1))
                head = head + control_A + 1

                # add info to summary_file
                print('\t'.join([
                    p.accession,
                    str(p.seqlenProtein()),
                    str(round(p.molwtProtein(), 1))
                ]),
                      file=summary_obj)

        # print results and return
        for obj in write:
            print('...there are %s proteins in %s' %
                  ("{0:,d}".format(prot), os.path.split(fasta_file)[1]),
                  file=obj)
            if head > prot:
                print('...there were %s header lines' % ("{0:,d}".format(head), ),
                      file=obj)

        fasta_lib.time_stamp_logfile('>>> ending: count_fasta.py', log_obj)
    return
Ejemplo n.º 14
0
def fasta_digester(fasta_file, enzyme='trypsin', low_mass=500.0, high_mass=5000.0,
                   min_length=7, missed_cleavages=2, mass_type='mono', log=None):
    """Digest a FASTA protein database and report shared/unique peptides.

    Every protein is digested with the selected enzyme rule, then a
    tab-delimited table with one row per peptide is printed: accession,
    peptide, start, end, mass, missed cleavages, whether the peptide is
    unique to that protein, and the other proteins that contain it.
    Peptides are compared after masking I and L residues to ``j``.

    Args:
        fasta_file: path of the FASTA database handed to fasta_lib.FastaReader.
        enzyme: case-insensitive enzyme name (No_enzyme, trypsin, trypsin-P,
            Lys-C, Lys-C-P, Lys-N, Arg-C, Asp-N, CNBr, Glu-C, PepsinA or
            chymotrypsin).  Any other name prints a warning and ``None`` is
            passed to the digestion routine as the regex.
        low_mass: low mass cutoff passed to the digestion routine.
        high_mass: high mass cutoff passed to the digestion routine.
        min_length: minimum peptide length passed to the digestion routine.
        missed_cleavages: maximum number of missed cleavages.
        mass_type: type of masses passed to the digestion routine.
        log: output stream for the protein count and the table; ``None``
            makes ``print`` use the console.

    Returns:
        None.
    """
    print('=======================================================================')
    print(' fasta_digest_unique.py, v 1.0, written by Phil Wilmarth, OHSU, 2021 ')
    print('=======================================================================')

    # cleavage patterns keyed by upper-cased enzyme name
    digest_patterns = {
        'NO_ENZYME': r".",
        'TRYPSIN': r".(?:(?<![KR](?!P)).)*",
        'TRYPSIN-P': r".(?:(?<![KR]).)*",
        'LYS-C': r".(?:(?<![K](?!P)).)*",
        'LYS-C-P': r".(?:(?<![K]).)*",
        'LYS-N': r".(?:(?![K]).)*",
        'ARG-C': r".(?:(?<![R](?!P)).)*",
        'ASP-N': r".(?:(?![D]).)*",
        'CNBR': r".(?:(?<![M]).)*",
        'GLU-C': r".(?:(?<![DE](?!P)).)*",
        'PEPSINA': r".(?:(?<![FL](?!P)).)*",
        'CHYMOTRYPSIN': r".(?:(?<![FWYL](?!P)).)*",
    }
    pattern = digest_patterns.get(enzyme.upper())
    if pattern is None:
        print('...WARNING: Enzyme:', enzyme, 'not recognized')
        regex = None
    else:
        regex = re.compile(pattern)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    proteins = []
    all_peptides = {}   # masked peptide sequence -> accessions that produced it
    mask_il = re.compile(r'[IL]')  # compiled once, reused for every peptide
    print('starting file reading:', time.ctime())

    # read proteins until EOF; NOTE: checking for errors slows program by factor of 3 or 4
    while f.readNextProtein(p, check_for_errs=False):

        # digest protein sequence (regex expression, low mass cutoff, high mass cutoff,
        # minimum peptide length, maximum number of missed cleavages, type of masses)
        p.enzymaticDigest(regex, low_mass, high_mass,
                          min_length, missed_cleavages, mass_type)

        # save all proteins that are read
        # NOTE(review): shallow copy - assumes each digest gives the protein a
        # fresh peptide list rather than refilling one list in place; confirm
        proteins.append(copy.copy(p))

        # count protein sequences
        prot += 1
        if (prot % 500000) == 0:
            print('......(%s proteins read...)' % (prot,))

    # print number of proteins/headers
    print('There are %s proteins in %s' %
          ("{0:,d}".format(prot), os.path.basename(fasta_file)), file=log)

    # make shared/unique status dictionary
    for p in proteins:
        for pep in p.peptides:
            mass_spec_seq = mask_il.sub('j', pep.seq)
            all_peptides.setdefault(mass_spec_seq, []).append(p.accession)

    # echo the first dictionary entry to the console (skipped when the digest
    # produced no peptides at all, which used to raise IndexError)
    if all_peptides:
        first_seq = next(iter(all_peptides))
        print(first_seq, all_peptides[first_seq])

    # print table (peptides from each protein, start, end, unique or not, protein list)
    print('\nAccession\tPeptide\tStart\tEnd\tMass\tMissed_Cleavages\tUnique\tOther_Proteins', file=log)

    for p in proteins:
        for pep in p.peptides:
            mass_spec_seq = mask_il.sub('j', pep.seq)

            # other proteins holding this peptide: built as a new list so the
            # shared dictionary entry is never modified, with repeats removed
            # so a peptide occurring twice in one protein is not "shared"
            others = [acc for acc in dict.fromkeys(all_peptides[mass_spec_seq])
                      if acc != p.accession]

            out_list = [p.accession, pep.seq, str(pep.beg), str(pep.end),
                        '%0.2f' % pep.mass, str(pep.missed)]
            out_list.append('FALSE' if others else 'TRUE')
            out_list.append('; '.join(others) if others else ' ')

            # print table rows
            print('\t'.join(out_list), file=log)

    return
Ejemplo n.º 15
0
def main(taxon_dict):
    """Main program to extract entries by taxon ID from NCBI nr databases.
        The accession (version suffix removed) of each header is looked up to
        find the associated taxon number for comparison to desired taxon
        numbers.  A separate protein
        entry will be written for each desired taxon number even if all taxon
        numbers are written to the same output file.  At the protein level, the
        extracted databases may no longer be non-redundant.  If "cleaning" of
        accessions/descriptions is turned off, all headers matching the desired
        taxon numbers will be added to the respective protein preserving the
        usual NCBI nr formatting structure.  If cleaning of accessions is turned
        on during extraction, some information may be lost.  This could make
        subsequent database processing (such as extracting by text string) fail.
        Cleaning is best done as a last step (i.e. in "reverse_fasta.py").

        "taxon_dict" maps each desired taxon number to the tag used in the
        name of the output file that collects its sequences.  The database is
        selected with a file dialog and screen output is mirrored to
        "fasta_utilities.log" in the database folder.  Returns None.
    """
    print(
        '====================================================================')
    print(
        ' nr_extract_taxon.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print(
        '====================================================================')

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    nr_file = fasta_lib.get_file(default, [('Zipped files', '*.gz'),
                                           ('Fasta files', '*.fasta')],
                                 title_string='Select an NCBI nr database')
    if nr_file == '': sys.exit()  # cancel button response

    ncbi_folder, nr_name = os.path.split(nr_file)
    # only the last extension is stripped here (e.g. a ".gz" suffix)
    nr_db = os.path.splitext(nr_name)[0]

    # create a log file to mirror screen output
    log_obj = open(os.path.join(ncbi_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]  # print(file=None) goes to the console
    fasta_lib.time_stamp_logfile('\n>>> starting: nr_extract_taxon.py',
                                 log_obj)

    # get the saved accession to taxon number lookup
    acc_to_taxon = fasta_lib.AccToTaxon(ncbi_folder)
    acc_to_taxon.create_or_load(ncbi_folder)

    # print the list of taxon numbers that will be extracted
    # NOTE(review): this is an alias, not a copy, and it is not used again
    original_dict = taxon_dict
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' %
                  (i + 1, t[0], t[1]),
                  file=obj)

    # expand any group taxon numbers.  NOTE: if a taxon number appears in
    # "nr_fasta_analyze.txt", it will not be expanded.  Either delete the
    # line in "nr_fasta_analyze.txt", or make an expanded "taxon_dict" by hand.
    # The return value is ignored; presumably expand_species adds entries to
    # taxon_dict in place - TODO confirm against fasta_lib.
    if EXPAND_GROUPS:
        fasta_lib.expand_species(ncbi_folder, 'nr', taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT,
                                 REF_SEQ_ONLY)

    # open the output databases, initialize counters, etc.
    # taxon_files is keyed by tag, so taxon numbers sharing a tag share a file
    taxon_files = {}
    taxon_count = {}
    name_count = {}
    for taxon, name in taxon_dict.items():
        fname = nr_db + '_' + name + '.fasta'
        fname = os.path.join(ncbi_folder, fname)
        taxon_files[name] = fname
        name_count[name] = 0
        taxon_count[taxon] = 0

    # open the output filenames (each path string is replaced by its file object)
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # loop over all proteins in nr
    x = fasta_lib.FastaReader(nr_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    not_found = 0
    skipped = 0
    for obj in write:
        print('...reading %s and extracting entries...' % (nr_name, ),
              file=obj)

    # checking for errors slows down program by about a factor of 3 or 4
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 1000000) == 0:
            print('......(%s proteins read...)' %
                  ("{0:,d}".format(prot_read), ))
        # taxon number -> private copy of the protein carrying that taxon's headers
        written = {}
        # rebuild the full '>' line before prot's fields get overwritten below
        line = prot.accession + ' ' + prot.description
        prot.new_desc = ''

        # look up the accession of each Control-A separated header
        for header in line.split(chr(1)):
            accession_with_version = header.split()[0]
            accession = accession_with_version.split('.')[0]
            if REF_SEQ_ONLY and '_' not in accession:
                continue  # skip proteins without RefSeq entries
            taxon = acc_to_taxon.get(accession, False)

            # see if taxon number for this accession is in our desired list
            if taxon:
                if taxon_dict.get(taxon, False):
                    if written.get(taxon, False):
                        # if taxon number already seen, add to header
                        # (prot is rebound to the stored copy, extended, and a
                        # fresh deep copy is stored back)
                        prot = written[taxon]
                        prot.description = prot.description + chr(1) + header
                        written[taxon] = copy.deepcopy(prot)
                    else:
                        # first time taxon number seen
                        name = taxon_dict[taxon]
                        prot.accession = header.split()[0]
                        prot.description = header[len(prot.accession) + 1:]
                        prot.description = prot.description.rstrip()
                        # counters advance once per protein per taxon number
                        taxon_count[taxon] += 1
                        name_count[name] += 1
                        written[taxon] = copy.deepcopy(prot)
                else:
                    skipped += 1
            else:
                not_found += 1
                continue

        # write a protein sequence for each taxon number it was matched to
        for taxon in written.keys():
            name = taxon_dict[taxon]
            f = taxon_files[name]
            # prot now names the stored copy; the next read reuses that object
            prot = written[taxon]
            prot.new_desc = prot.description
            prot.new_acc = prot.accession
            if CLEAN_ACCESSIONS:
                prot.parseNCBI(REF_SEQ_ONLY)
            prot.printProtein(f)

    # print out number of matches and close files
    for obj in write:
        print('...%s proteins in %s' % ("{0:,d}".format(prot_read), nr_name),
              file=obj)
        print('...%s accessions did not have known taxon numbers' %
              ("{0:,d}".format(not_found), ),
              file=obj)
        print('...%s accessions were skipped (not in our taxon list)' %
              ("{0:,d}".format(skipped), ),
              file=obj)
        if REF_SEQ_ONLY:
            print('...Extracted sequences are RefSeq Only!!!', file=obj)
        if VERBOSE:
            # per-taxon counts, listing only taxon numbers that had matches
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print(
                        '......(%s) taxon number %s had %s proteins' %
                        (i + 1, number, "{0:,d}".format(taxon_count[number])),
                        file=obj)
        print('...output file summaries...', file=obj)
        names = list(taxon_files.keys())
        names.sort()
        for i, name in enumerate(names):
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(
                      name_count[name]), nr_db + '_' + name + '.fasta'),
                  file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: nr_extract_taxon.py', log_obj)
    log_obj.close()
    for f in taxon_files.values():
        f.close()
    return
def main(taxon_dict):
    """Extract entries by taxon ID from a single UniProt FASTA database.

    Prompts for a downloaded Sprot or Trembl database file, then writes every
    protein whose species maps to a taxon number in ``taxon_dict`` to a
    per-name FASTA file created in the same folder as the database.
    Progress and summary lines are echoed to the console and appended to
    'fasta_utilities.log' in that folder.

    Args:
        taxon_dict: dict mapping taxon numbers to the name tag used in the
            output file names. It is passed to fasta_lib.expand_species when
            EXPAND_GROUPS is set (presumably extended in place there --
            confirm against fasta_lib).

    Returns:
        None. Calls sys.exit() if the file dialog is cancelled.
    """
    print(
        '============================================================================'
    )
    print(
        ' uniprot_extract_from_one.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 '
    )
    print(
        '============================================================================'
    )

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_file = fasta_lib.get_file(
        default, [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')],
        title_string='Select an Sprot or Trembl database')
    if uniprot_file == '':
        sys.exit()  # cancel button response

    # the database tag is the second '_'-separated field of the file name and
    # the version is the last one (minus the file extensions)
    uniprot_folder, uniprot_name = os.path.split(uniprot_file)
    name_parts = uniprot_name.split('_')
    version = name_parts[-1]
    # the dialog accepts both compressed and plain FASTA files, so strip
    # '.gz' and '.fasta' independently rather than only '.fasta.gz'
    for suffix in ('.gz', '.fasta'):
        if version.endswith(suffix):
            version = version[:-len(suffix)]
    uniprot_db = name_parts[1]

    # create a log file to mirror screen output
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]  # print(..., file=None) goes to the console
    taxon_files = {}  # name tag -> open output file object

    # everything below runs under try/finally so that the log file and all
    # output files are closed (and flushed) even if reading/writing fails
    try:
        fasta_lib.time_stamp_logfile(
            '\n>>> starting: uniprot_extract_from_one.py', log_obj)

        # make the smaller uniprot dictionaries
        (sci_to_taxon,
         id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)

        # make the more complete dictionary
        name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

        # print the list of taxon numbers that will be extracted
        taxon_list = sorted(taxon_dict.items())
        for obj in write:
            print('...extracting these taxon numbers:', file=obj)
            for i, t in enumerate(taxon_list):
                print('......(%s) taxon %s to file tagged with "%s"' %
                      (i + 1, t[0], t[1]),
                      file=obj)

        # expand any group taxon numbers
        # NOTE: Any taxon numbers present in analysis text file will not be expanded.
        if EXPAND_GROUPS:
            fasta_lib.expand_species(uniprot_folder, uniprot_db, taxon_dict,
                                     MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

        # initialize dictionaries and counters; several taxon numbers may
        # share one name tag and therefore one output file
        taxon_paths, taxon_count, name_count = {}, {}, {}
        for taxon, name in taxon_dict.items():
            fname = uniprot_db + '_' + version + '_' + name + '.fasta'
            taxon_paths[name] = os.path.join(uniprot_folder, fname)
            taxon_count[taxon] = 0
            name_count[name] = 0

        # open the output files (registered one by one so that a failure
        # part way through still closes the ones already opened)
        for name, path in taxon_paths.items():
            taxon_files[name] = open(path, 'w')

        # create a FastaReader object, initialize counters, and start reading
        x = fasta_lib.FastaReader(uniprot_file)
        prot = fasta_lib.Protein()
        prot_read = 0
        not_found = 0
        duplicates = {}  # species name -> (first choice, alternative) taxons
        for obj in write:
            print('...reading %s and extracting entries...' % (uniprot_name, ),
                  file=obj)

        # checking for errors in sequences slows program execution, use as needed
        while x.readNextProtein(prot, check_for_errs=False):
            prot_read += 1
            if (prot_read % 500000) == 0:
                print('......(%s proteins read...)' %
                      ("{0:,d}".format(prot_read), ))
            (spec_id,
             spec_name) = fasta_lib.uniprot_parse_line(prot.accession + ' ' +
                                                       prot.description)
            taxon = sci_to_taxon.get(spec_name, 0)  # first choice mapping
            taxon2 = name_to_taxon.get(spec_name, 0)  # alternative mapping
            if taxon == 0:  # first choice not present
                if taxon2 == 0:
                    not_found += 1
                else:
                    taxon = taxon2  # use second choice
            else:
                if (taxon != taxon2) and (
                        taxon2 > 0):  # keep track of multiple taxon numbers
                    duplicates[spec_name] = (taxon, taxon2)
            if taxon_dict.get(taxon, False):
                if CLEAN_ACCESSIONS:
                    prot.parseUniProt()

                # taxon number matches, so write the protein to the respective file
                name = taxon_dict[taxon]
                name_count[name] += 1
                taxon_count[taxon] += 1
                f = taxon_files[name]
                prot.printProtein(f)

        # close the extracted database files
        for f in taxon_files.values():
            f.close()

        # print list of mis-matching taxon number warnings
        if MISMATCHES:
            for i, (name, pair) in enumerate(duplicates.items()):
                for obj in write:
                    print('......(%s) WARNING: %s and %s map to "%s"' %
                          (i + 1, pair[0], pair[1], name),
                          file=obj)

        # print out the summary stuff
        for obj in write:
            print('...%s protein entries in %s' %
                  ("{0:,d}".format(prot_read), uniprot_name),
                  file=obj)
            print('...%s proteins had unknown taxon numbers' %
                  ("{0:,d}".format(not_found), ),
                  file=obj)
            if VERBOSE:
                numbers = sorted(taxon_count)
                for i, number in enumerate(numbers):
                    if taxon_count[number] > 0:
                        print(
                            '......(%s) taxon %s had %s proteins' %
                            (i + 1, number,
                             "{0:,d}".format(taxon_count[number])),
                            file=obj)
            print('...output file summaries...', file=obj)
            names = sorted(taxon_files)
            for i, name in enumerate(names):
                print('......(%s) %s proteins extracted and written to %s' %
                      (i + 1, "{0:,d}".format(name_count[name]),
                       uniprot_db + '_' + version + '_' + name + '.fasta'),
                      file=obj)

        fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_one.py',
                                     log_obj)
    finally:
        # closing an already-closed file is a no-op, so this is safe on the
        # normal path as well as after an error
        for f in taxon_files.values():
            f.close()
        log_obj.close()
    return