Example 1
def adjust_fasta(file_list, dest, nm=None):

    print_col("Adjusting proteome files", GREEN, 1)

    # Create compliant fasta directory
    cf_dir = join(dest, "backstage_files", "compliantFasta")
    if not os.path.exists(cf_dir):
        os.makedirs(cf_dir)
    else:
        for f in os.listdir(cf_dir):
            os.remove(join(cf_dir, f))

    # Setup progress information
    if nm:
        if nm.stop:
            raise KillByUser("")
        # Get total number of files for total progress
        nm.total = len(file_list)
        nm.counter = 0

    for proteome in file_list:
        # Get code for proteome
        code_name = proteome.split(os.path.sep)[-1].split(".")[0]
        code_name = "_".join(code_name.split())

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1
            nm.msg = "Adjusting file {}".format(basename(proteome))

        # Check the unique ID field
        try:
            unique_id = check_unique_field(proteome, True, nm)
        except Exception as e:
            print_col("The file {} could not be parsed".format(proteome),
                      YELLOW, 1)
            #TODO: Log errors on file
            continue

        # Adjust fasta
        # stg = prep_fasta(proteome, code_name, unique_id)
        prep_fasta(proteome, code_name, unique_id, dest, nm=nm)

        proteome_file_name = proteome.split(os.path.sep)[-1].split(".")[0] + \
                             ".fasta"
        proteome_file_name = "_".join(proteome_file_name.split())

        pfile = basename(proteome.split(".")[0] + "_mod.fas")
        shutil.move(join(dest, "backstage_files", pfile),
                    join(cf_dir, proteome_file_name))

    json_f = join(dest, "backstage_files", "header_mapping.json")
    header_f = join(dest, "backstage_files", "header_mapping.csv")
    if os.path.exists(json_f):
        with open(json_f) as fh, open(header_f, "w") as ofh:
            header_map = json.load(fh)

            for k, v in header_map.items():
                ofh.write("{}; {}\n".format(k, v))
Example 2
def execute(db_dir, nm=None):
    con = lite.connect(os.path.join(db_dir, "orthoDB.db"))

    with con:

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.total = 4
            nm.counter = 0
            nm.msg = None

        con.create_function("log", 1, log)

        cur = con.cursor()

        for func in [commonTempTables, orthologs, inparalogs, coorthologs]:

            if nm:
                if nm.stop:
                    raise KillByUser("")
                nm.counter += 1

            func(cur)

    con.close()
Example 3
def mcl_groups(inflation_list, mcl_prefix, start_id, group_file, dest,
                nm=None):

    print_col("Dumping groups", GREEN, 1)

    # Create a results directory
    results_dir = join(dest, "Orthology_results")
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    mcl_output = join(dest, "backstage_files", "mclOutput_")

    if nm:
        if nm.stop:
            raise KillByUser("")
        nm.total = len(inflation_list)
        nm.counter = 0

    for val in inflation_list:

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1

        MclGroups.mcl_to_groups(
            mcl_prefix,
            start_id,
            mcl_output + val.replace(".", ""),
            os.path.join(results_dir, group_file + "_" + str(val) + ".txt"),
            nm=nm)
Example 4
def get_pairs(dest="./", ns=None):
    """
    Parses the output of USEARCH and creates a dictionary with the header
    pairs between the original proteins and their transcripts
    """

    file_h = open(join(dest, "pairs.out"))
    pair_db = {}

    if ns:
        if ns.stop:
            raise KillByUser("")
        p = 0
        with open(join(dest, "pairs.out")) as f:
            for p, _ in enumerate(f):
                pass
        ns.max_pb = p + 1
        ns.progress = 0

    for l in file_h:

        if ns:
            if ns.stop:
                raise KillByUser("")
            ns.progress += 1

        fields = l.split("\t")
        pair_db[fields[0]] = fields[1]

    file_h.close()
    return pair_db
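
To make the expected input concrete, here is a minimal standalone sketch of the same parse on in-memory lines (the header values are hypothetical, not taken from a real USEARCH run; real pairs.out lines are tab-separated, with the protein header in the first column and the transcript header in the second):

# Hypothetical pairs.out content (tab-separated columns).
example_lines = [
    "Taxon1|P12345\ttranscript_1\t99.0\n",
    "Taxon1|P67890\ttranscript_2\t98.5\n",
]

pair_db = {}
for l in example_lines:
    fields = l.split("\t")
    pair_db[fields[0]] = fields[1]

# pair_db == {"Taxon1|P12345": "transcript_1", "Taxon1|P67890": "transcript_2"}
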
Example 5
def create_db(f_list, dest="./", ns=None):
    """
    Creates a fasta database file containing the translated protein sequences
    from the cds files. The final transcripts.fas file will be use
    by USEARCH to get matches between the original protein sequences and their
    nucleotide counterparts. A dictionary database will also be created where
    the transcript headers will be associated with the original DNA sequence,
    so that they will be later retrieved
    :param f_list. List, containing the file names of the transcript files
    """

    output_handle = open(join(dest, "transcripts.fas"), "w")
    id_dic = {}

    if ns:
        if ns.stop:
            raise KillByUser("")

        ns.progress = 0
        ns.max_pb = len(f_list)

    for f in f_list:
        handle = open(f)
        seq = ""
        header = ""

        if ns:
            if ns.stop:
                raise KillByUser("")
            ns.progress += 1

        for line in handle:

            if ns:
                if ns.stop:
                    raise KillByUser("")

            if line.startswith(">"):
                if seq != "":
                    aa_seq = translate(seq)
                    output_handle.write(">%s\n%s\n" % (header, aa_seq))
                    id_dic[header] = seq

                header = line.strip()[1:].replace(" ", ";;")
                seq = ""
            else:
                seq += line.strip()

        # Flush the last sequence of the file, which the header check above
        # does not write out
        if seq != "":
            aa_seq = translate(seq)
            output_handle.write(">%s\n%s\n" % (header, aa_seq))
            id_dic[header] = seq

        handle.close()

    output_handle.close()

    return id_dic
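
A hedged usage sketch of create_db (the file names are hypothetical; the function assumes the module-level translate() helper and the join/KillByUser imports used above are available):

# Build the translated protein database from two hypothetical CDS files.
# transcripts.fas is written to dest, and the returned dictionary maps each
# transcript header (spaces replaced by ";;") to its nucleotide sequence.
id_dic = create_db(["sampleA_cds.fas", "sampleB_cds.fas"], dest="./", ns=None)
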
Example 6
def execute(db_dir, dest, nm=None):

    con = lite.connect(os.path.join(db_dir, "orthoDB.db"))

    # Set up progression information
    if nm:
        if nm.stop:
            raise KillByUser("")
        nm.total = 4
        nm.counter = 0

    with con:

        cur = con.cursor()

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter = 1

        printOrthologsFile(cur,
                           os.path.join(dest, "backstage_files",
                                        "orthologs.txt"),
                           nm=nm)
        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter = 2

        printInparalogsFile(cur,
                            os.path.join(dest, "backstage_files",
                                         "inparalogs.txt"),
                            nm=nm)
        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter = 3
        printCoOrthologsFile(cur,
                             os.path.join(dest, "backstage_files",
                                          "coorthologs.txt"),
                             nm=nm)
        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter = 4
        printMclAbcFile(cur,
                        os.path.join(dest, "backstage_files", "mclInput"),
                        nm=nm)

    con.close()
Example 7
def check_unique_field(proteome_file, verbose=False, nm=None):
    """
    Checks the original proteome file for a field in the fasta header
    that is unique to all sequences
    """

    # Some files may have utf8 encoding problems so I used codecs here
    file_handle = codecs.open(proteome_file, "r", "cp1252")
    header_list = []

    header = ""
    for line in file_handle:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            header = line[1:].strip()
            # Store header in list format
            header_list.append(header.split("|"))

    # Get the size of the header fields
    header_field_size = len(header.split("|"))

    for i in range(header_field_size):

        if nm:
            if nm.stop:
                raise KillByUser("")

        temp_list = []
        for header in header_list:
            temp_list.append(header[i])

        if len(temp_list) == len(set(temp_list)) and len(set(temp_list)) ==\
                len(header_list):

            # Note: the original orthoMCL tools use 1-based field indices,
            # but prep_fasta indexes the split header from 0, so the raw
            # 0-based index is returned here
            if verbose:
                print_col("\t Using unique header field {}".format(i), GREEN,
                          1)
            return i

    # Ideally, a unique field should be found before this code. If not, raise
    #  exception
    raise NoUniqueField("The proteome file {} has no unique field".format(
        os.path.basename(proteome_file)))
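
The uniqueness test itself can be illustrated on a toy header set (a standalone sketch, independent of the pipeline): only field index 1 holds values that are unique across all sequences, so that index would be returned.

headers = ["sp|P12345|CAT_HUMAN", "sp|P67890|DOG_HUMAN", "sp|P11111|CAT_HUMAN"]
header_list = [h.split("|") for h in headers]

for i in range(len(header_list[0])):
    column = [fields[i] for fields in header_list]
    if len(column) == len(set(column)):
        print("Unique header field: {}".format(i))  # prints 1
        break
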
Example 8
def printMclAbcFile(cur, filename, nm=None):

    cur.execute("select sequence_id_a, sequence_id_b, normalized_score\
        from InParalog\
        union\
        select sequence_id_a, sequence_id_b, normalized_score\
        from Ortholog\
        union\
        select sequence_id_a, sequence_id_b, normalized_score\
        from CoOrtholog")

    file_fh = open(filename, "w")

    with file_fh:
        while True:

            if nm:
                if nm.stop:
                    raise KillByUser("")

            row = cur.fetchone()
            if row is None:
                break

            file_fh.write("{}\t{}\t{}\n".format(
                row[0], row[1], str((float(row[2]) * 1000 + .5) / 1000)))
Example 9
def convert_protein_file(pairs, group_obj, id_db, output_dir, shared_ns):
    """
    Converts the protein sequences of a given group into their corresponding
    nucleotide sequences, using the database previously built with the
    create_db function
    :return:
    """

    # Create handle for file storing bad sequence headers.
    bad_file = open(join(output_dir, "missed_sequences.log"), "w")

    for line, cl in zip(group_obj.groups(),
                        group_obj.iter_species_frequency()):

        if shared_ns:
            if shared_ns.stop:
                raise KillByUser("")

        if group_obj._get_compliance(cl) == (1, 1):

            line = group_obj._remove_tx(line)

            fields = line.split(":")
            orto_name = fields[0]
            seq_headers = fields[-1].split()

            f_handle = open(join(output_dir, orto_name) + ".fas", "w")

            for h in seq_headers:
                if h in pairs:
                    seq = id_db[pairs[h]]
                    shared_ns.good += 1
                    f_handle.write(">%s\n%s\n" % (h.replace(";;", " "), seq))
                else:
                    shared_ns.missed += 1
                    bad_file.write("{}\t{}\n".format(orto_name, h))
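
The lookup chain used above can be shown in isolation (all values are hypothetical): the pairs dictionary from get_pairs maps a protein header to a transcript header, and the id_db dictionary from create_db maps that transcript header to its nucleotide sequence.

pairs = {"Taxon1|P12345": "transcript_1"}   # as returned by get_pairs()
id_db = {"transcript_1": "ATGGCTTAA"}       # as returned by create_db()

h = "Taxon1|P12345"
if h in pairs:
    seq = id_db[pairs[h]]
    print(">%s\n%s" % (h.replace(";;", " "), seq))
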
Example 10
def adjust_fasta(file_list, dest, nm=None):

    print_col("Adjusting proteome files", GREEN, 1)

    # Create compliant fasta directory
    cf_dir = join(dest, "backstage_files", "compliantFasta")
    if not os.path.exists(cf_dir):
        os.makedirs(cf_dir)
    else:
        for f in os.listdir(cf_dir):
            os.remove(join(cf_dir, f))

    # Setup progress information
    if nm:
        if nm.stop:
            raise KillByUser("")
        # Get total number of files for total progress
        nm.total = len(file_list)
        nm.counter = 0

    for proteome in file_list:
        # Get code for proteome
        code_name = proteome.split(os.path.sep)[-1].split(".")[0]

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1
            nm.msg = "Adjusting file {}".format(basename(proteome))

        # Check the unique ID field
        unique_id = check_unique_field(proteome, True, nm)

        # Adjust fasta
        # stg = prep_fasta(proteome, code_name, unique_id)
        prep_fasta(proteome, code_name, unique_id, nm=nm)

        proteome_file_name = proteome.split(os.path.sep)[-1].split(".")[0] + \
                             ".fasta"

        shutil.move(
            proteome.split(".")[0] + "_mod.fas", join(cf_dir,
                                                      proteome_file_name))
Example 11
def prep_fasta(proteome_file, code, unique_id, dest, verbose=False, nm=None):

    if verbose:
        print_col("\t Preparing file for USEARCH", GREEN, 1)

    # Storing header list to check for duplicates
    header_list = []

    # Get json with header mappings, if exists
    json_f = join(dest, "backstage_files", "header_mapping.json")
    if os.path.exists(json_f):
        with open(json_f) as fh:
            header_mapping = json.load(fh)
    else:
        header_mapping = {}

    # Will prevent writing
    lock = True

    # File handles
    file_in = open(proteome_file)
    pfile = basename(proteome_file.split(".")[0] + "_mod.fas")
    file_out_path = join(dest, "backstage_files", pfile)
    file_out = open(file_out_path, "w")

    for line in file_in:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            if line not in header_list:
                fields = line.split("|")
                unique_str = fields[unique_id].replace(" ", "_")
                header_mapping["%s|%s" % (code, unique_str)] = line.strip()
                header_list.append(line)
                file_out.write(">%s|%s\n" % (code, unique_str))
                lock = True
            else:
                lock = False
        elif lock:
            file_out.write(line)

    # Close file handles:
    file_in.close()
    file_out.close()

    with open(json_f, "w") as fh:
        json.dump(header_mapping, fh)
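
The header rewrite performed by prep_fasta is easy to illustrate on a single hypothetical line: the field selected by unique_id is kept, spaces are replaced with underscores, and the taxon code is used as prefix.

code = "Taxon1"
unique_id = 1
line = ">sp|P12345 isoform 2|CAT_HUMAN\n"

fields = line.split("|")
unique_str = fields[unique_id].replace(" ", "_")
print(">%s|%s" % (code, unique_str))  # prints ">Taxon1|P12345_isoform_2"
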
Example 12
def mcl_to_groups(prefix, start_id, infile, outfile, nm=None):

    try:
        start_id = int(start_id)
    except ValueError:
        raise ValueError("StartId is not a number")

    input_file = open(infile, "r")
    out = open(outfile, "w")

    for line in input_file:

        if nm:
            if nm.stop:
                raise KillByUser("")

        out.write(prefix + str(start_id) + ": " + line)
        start_id += 1
Example 13
def export_filtered_groups(inflation_list,
                           group_prefix,
                           gene_t,
                           sp_t,
                           sqldb,
                           db,
                           tmp_dir,
                           dest,
                           nm=None):

    print_col("Exporting filtered groups to protein sequence files", GREEN, 1)

    stats_storage = {}
    groups_obj = OT.MultiGroupsLight(tmp_dir)

    if nm:
        if nm.stop:
            raise KillByUser("")

    for val in inflation_list:
        # Create a directory that will store the results for the current
        # inflation value
        inflation_dir = join(dest, "Orthology_results", "Inflation%s" % val)
        if not os.path.exists(inflation_dir):
            os.makedirs(inflation_dir)

        group_file = join(dest, "Orthology_results",
                          group_prefix + "_%s.txt" % val)

        # Create Group object
        group_obj = OT.GroupLight(group_file, gene_t, sp_t)
        # Add group to the MultiGroups object
        groups_obj.add_group(group_obj)
        # Export filtered groups and return stats to present in the app
        stats = group_obj.basic_group_statistics()
        # Retrieve fasta sequences from the filtered groups
        group_obj.retrieve_sequences(sqldb,
                                     db,
                                     dest=join(inflation_dir, "Orthologs"),
                                     shared_namespace=nm)
        # os.remove(sqldb)
        stats_storage[val] = stats

    return stats_storage, groups_obj
Example 14
def prep_fasta(proteome_file, code, unique_id, verbose=False, nm=None):

    if verbose:
        print_col("\t Preparing file for USEARCH", GREEN, 1)

    # Storing header list to check for duplicates
    header_list = []

    # Storing dictionary with header and sequence for later use
    seq_storage = {}

    # Will prevent writing
    lock = True

    # File handles
    file_in = open(proteome_file)
    file_out = open(proteome_file.split(".")[0] + "_mod.fas", "w")

    for line in file_in:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            if line not in header_list:
                fields = line.split("|")
                unique_str = fields[unique_id].replace(" ", "_")
                seq_storage["%s|%s" % (code, unique_str)] = ""
                header_list.append(line)
                file_out.write(">%s|%s\n" % (code, unique_str))
                lock = True
            else:
                lock = False
        elif lock:
            seq_storage["%s|%s" % (code, unique_str)] += line.strip()
            file_out.write(line)

    # Close file handles:
    file_in.close()
    file_out.close()

    return seq_storage
Example 15
def printCoOrthologsFile(cur, filename, nm=None):

    cur.execute(
        "select taxon_id_a, taxon_id_b, sequence_id_a, sequence_id_b, normalized_score\
        from CoOrtholog\
        order by taxon_id_a, taxon_id_b, sequence_id_a, sequence_id_b asc")

    file_fh = open(filename, "w")

    with file_fh:
        while True:

            if nm:
                if nm.stop:
                    raise KillByUser("")

            row = cur.fetchone()
            if row is None:
                break

            file_fh.write("{}\t{}\t{}\n".format(
                row[2], row[3], str((float(row[4]) * 1000 + .5) / 1000)))
Example 16
def orthomcl_filter_fasta(input_dir, min_length, max_stop_percent, db, dest,
                          nm=None):

    def handle_seq(seq, length, stop_cnt):
        is_bad = 0
        stop_percent = ((length - stop_cnt) / length) * 100

        if length < min_length or stop_percent > max_stop_percent:
            bad.write(seq + "\n")
            is_bad = 1
        else:
            good.write(seq + "\n")

        return is_bad

    good = open(os.path.join(dest, "backstage_files", db), "w")
    bad = open(os.path.join(dest, "backstage_files", "poorProteins.txt"), "w")

    filenames = [os.path.join(input_dir, x) for x in os.listdir(input_dir)]

    reject_rates = []

    # Setup progression information
    if nm:
        if nm.stop:
            raise KillByUser("")
        nm.total = len(filenames)
        nm.counter = 0

    for filename in filenames:

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1
            nm.msg = "Filtering file {}".format(os.path.basename(filename))

        if os.path.basename(filename).startswith('.'):
            continue

        input_file = open(filename, 'r')
        seq_count = 0
        reject_seq_count = 0
        current_seq = ""
        current_len = 0
        current_stop_cnt = 0

        # process lines of one file
        for line in input_file:

            if nm:
                if nm.stop:
                    raise KillByUser("")

            if line.startswith('>'):
                if current_seq:
                    seq_count += 1
                    reject_seq_count += handle_seq(current_seq,
                                                   current_len,
                                                   current_stop_cnt)
                    current_seq = ""
                    current_len = 0
                    current_stop_cnt = 0
            else:
                line_len = len(line)
                current_len += line_len
                line = re.sub('[^A-Za-z]', '', line)
                current_stop_cnt += line_len - len(line)

            current_seq += line

        reject_seq_count += handle_seq(current_seq,
                                       current_len,
                                       current_stop_cnt)
        seq_count += 1

        # add file stats to reject count if it qualifies
        if reject_seq_count:
            pct = reject_seq_count / seq_count * 100
            if pct > 10:
                reject_rates.append([input_file, pct])

        input_file.close()

    good.close()
    bad.close()
Example 17
def orthomcl_blast_parser(blast_file, fasta_dir, db_dir, nm):

    # create connection to DB
    con = lite.connect(os.path.join(db_dir, "orthoDB.db"))
    with con:
        #global cur
        cur = con.cursor()

        prev_subjectid = ''
        prev_queryid = ''
        # hash to hold subject info
        subject = {}

        # Set progress information
        if nm:
            if nm.stop:
                raise KillByUser("")
            total = 0
            for total, _ in enumerate(open(blast_file)):
                pass
            nm.total = total
            nm.msg = None
            nm.counter = 0

        # parse fasta files
        genes = get_genes(fasta_dir)
        blast_fh = open(blast_file, "r")

        for line in blast_fh:

            if nm:
                if nm.stop:
                    raise KillByUser("")
                nm.counter += 1

            splitted = line.split()

            query_id = splitted[0]
            subject_id = splitted[1]
            percent_identity = splitted[2]
            length = int(splitted[3])
            query_start = splitted[6]
            query_end = splitted[7]
            subject_start = splitted[8]
            subject_end = splitted[9]
            evalue = splitted[10]

            if query_id != prev_queryid or subject_id != prev_subjectid:

                # print previous subject
                if subject:
                    print_previous_subject(subject, cur)

                # initialize new one from first HSP
                prev_subjectid = subject_id
                prev_queryid = query_id

                # from first hsp
                tup = format_evalue(evalue)

                subject = {"queryId": query_id}
                subject["subjectId"] = subject_id
                subject["queryShorter"] = get_taxon_and_length(subject, genes)

                subject["evalueMant"] = tup[0]
                subject["evalueExp"] = tup[1]
                subject["totalIdentities"] = 0
                subject["totalLength"] = 0
                subject["hspspans"] = []

            # get additional info from subsequent HSPs
            hspspan = (subject_start, subject_end)
            if subject and subject["queryShorter"]:
                hspspan = (query_start, query_end)
            subject["hspspans"].append(hspspan)
            subject["totalIdentities"] += float(percent_identity) * length
            subject["totalLength"] += length

        print_previous_subject(subject, cur)

    con.close()
Example 18
def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
                   protein_max_stop, usearch_file, usearch_evalue,
                   usearch_threads, usearch_output, mcl_file, mcl_inflation,
                   ortholog_prefix, group_prefix, orto_max_gene, orto_min_sp,
                   sqldb, ortho_dir, usearch_db):
    """
    Executes all pipeline subprocesses sequentially and updates the
    Progress dialog label
    """

    try:
        nm.finished_tasks = []

        nm.task = "schema"
        ortho_pipe.install_schema(temp_dir)
        nm.finished_tasks = ["schema"]

        if nm.stop:
            raise KillByUser("")

        nm.task = "adjust"
        ortho_pipe.adjust_fasta(proteome_files, ortho_dir, nm)
        nm.finished_tasks = ["schema", "adjust"]

        if nm.stop:
            raise KillByUser("")

        nm.task = "filter"
        ortho_pipe.filter_fasta(protein_min_len, protein_max_stop, usearch_db,
                                ortho_dir, nm)
        nm.finished_tasks = ["schema", "adjust", "filter"]

        if nm.stop:
            raise KillByUser("")

        nm.task = "usearch"
        ortho_pipe.allvsall_usearch(usearch_db,
                                    usearch_evalue,
                                    ortho_dir,
                                    usearch_threads,
                                    usearch_output,
                                    usearch_bin=usearch_file,
                                    nm=nm)
        nm.finished_tasks = ["schema", "adjust", "filter", "usearch"]

        if nm.stop:
            raise KillByUser("")

        nm.task = "parse"
        ortho_pipe.blast_parser(usearch_output,
                                ortho_dir,
                                db_dir=temp_dir,
                                nm=nm)
        nm.finished_tasks = ["schema", "adjust", "filter", "usearch", "parse"]

        if nm.stop:
            raise KillByUser("")

        nm.task = "pairs"
        ortho_pipe.pairs(temp_dir, nm=nm)
        ortho_pipe.dump_pairs(temp_dir, ortho_dir, nm=nm)
        nm.finished_tasks = [
            "schema", "adjust", "filter", "usearch", "parse", "pairs"
        ]

        if nm.stop:
            raise KillByUser("")

        nm.task = "mcl"
        ortho_pipe.mcl(mcl_inflation, ortho_dir, mcl_file=mcl_file, nm=nm)
        nm.finished_tasks = [
            "schema", "adjust", "filter", "usearch", "parse", "pairs", "mcl"
        ]

        if nm.stop:
            raise KillByUser("")

        nm.task = "dump"
        ortho_pipe.mcl_groups(mcl_inflation,
                              ortholog_prefix,
                              "1000",
                              group_prefix,
                              ortho_dir,
                              nm=nm)
        nm.finished_tasks = [
            "schema", "adjust", "filter", "usearch", "parse", "pairs", "mcl",
            "dump"
        ]

        if nm.stop:
            raise KillByUser("")

        nm.task = "filter_groups"
        stats, groups_obj = ortho_pipe.export_filtered_groups(
            mcl_inflation,
            group_prefix,
            orto_max_gene,
            orto_min_sp,
            sqldb,
            join(ortho_dir, "backstage_files", usearch_db),
            temp_dir,
            ortho_dir,
            nm=nm)
        nm.finished_tasks = [
            "schema", "adjust", "filter", "usearch", "parse", "pairs", "mcl",
            "dump", "filter_groups"
        ]

        if nm.stop:
            raise KillByUser("")

        # stats is a dictionary containing the inflation value as
        #  key and a list with the orthologs as value
        nm.stats = stats
        nm.groups = groups_obj

    except KillByUser:
        return

    except IOError as e:
        nm.exception = str(e)
        print(e)
        return

    except Exception as e:
        logging.exception("Unexpected exit in Orthology search")
        nm.exception = str(e)
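
The nm object threaded through every step above only needs to expose a few attributes. A minimal stand-in for running the pipeline outside the GUI might look like the sketch below (an assumption based on the attribute accesses in these examples; in the real application a multiprocessing namespace is shared with the Progress dialog):

class DummyNamespace(object):
    """Minimal stand-in for the shared progress namespace."""

    def __init__(self):
        self.stop = False          # kill switch checked before each step
        self.task = None           # name of the pipeline step being executed
        self.finished_tasks = []   # steps already completed
        self.total = 0             # total work units of the current step
        self.counter = 0           # work units already processed
        self.msg = None            # human-readable progress message
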
Example 19
def convert_group(sqldb,
                  cds_file_list,
                  protein_db,
                  group_sequences,
                  usearch_bin,
                  output_dir,
                  shared_namespace=None):
    """
    Convenience function that wraps all required operations to convert protein
    to nucleotide files from a Group object
    """

    if shared_namespace:
        shared_namespace.act = "Creating database"
        shared_namespace.missed = 0
        shared_namespace.good = 0
    # Create database
    id_db = create_db(cds_file_list, output_dir, shared_namespace)

    if shared_namespace:
        shared_namespace.act = "Creating query"

        # Kill switch
        if shared_namespace.stop:
            raise KillByUser("")

    # Create query for USEARCH
    group_sequences.retrieve_sequences(sqldb,
                                       protein_db,
                                       output_dir,
                                       outfile="query.fas",
                                       shared_namespace=shared_namespace)

    if shared_namespace:
        # Kill switch
        if shared_namespace.stop:
            raise KillByUser("")

    # Execute search
    if shared_namespace:
        shared_namespace.act = "Performing search"
    pair_search(usearch_bin, output_dir)

    if shared_namespace:
        # Kill switch
        if shared_namespace.stop:
            raise KillByUser("")

    pair_db = get_pairs(output_dir, ns=shared_namespace)
    # Convert files

    if shared_namespace:
        shared_namespace.act = "Converting to nucleotide"
    convert_protein_file(pair_db, group_sequences, id_db, output_dir,
                         shared_namespace)

    # Remove temporary files
    temp_files = [
        join(output_dir, "query.fas"),
        join(output_dir, "transcripts.fas"),
        join(output_dir, "pairs.out")
    ]

    for f in temp_files:
        os.remove(f)
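
Finally, a hedged usage sketch of the wrapper (all paths and objects are hypothetical; group_sequences is expected to be a GroupLight-like object exposing retrieve_sequences, as in export_filtered_groups above, and the namespace can be a stub such as the DummyNamespace sketched after Example 18):

convert_group(
    sqldb="sequences.sqlite",            # hypothetical sequence database
    cds_file_list=["sampleA_cds.fas"],   # transcript (CDS) FASTA files
    protein_db="goodProteins.fasta",     # protein database used in the search
    group_sequences=group_obj,           # GroupLight-like object
    usearch_bin="usearch",               # path to the USEARCH executable
    output_dir="Orthologs_nt",
    shared_namespace=DummyNamespace())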