def adjust_fasta(file_list, dest, nm=None):

    print_col("Adjusting proteome files", GREEN, 1)

    # Create compliant fasta directory
    cf_dir = join(dest, "backstage_files", "compliantFasta")
    if not os.path.exists(cf_dir):
        os.makedirs(cf_dir)
    else:
        for f in os.listdir(cf_dir):
            os.remove(join(cf_dir, f))

    # Setup progress information
    if nm:
        if nm.stop:
            raise KillByUser("")
        # Get total number of files for total progress
        nm.total = len(file_list)
        nm.counter = 0

    for proteome in file_list:
        # Get code for proteome
        code_name = proteome.split(os.path.sep)[-1].split(".")[0]
        code_name = "_".join(code_name.split())

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1
            nm.msg = "Adjusting file {}".format(basename(proteome))

        # Check the unique ID field
        try:
            unique_id = check_unique_field(proteome, True, nm)
        except Exception as e:
            print_col("The file {} could not be parsed".format(proteome),
                      YELLOW, 1)
            # TODO: Log errors on file
            continue

        # Adjust fasta
        # stg = prep_fasta(proteome, code_name, unique_id)
        prep_fasta(proteome, code_name, unique_id, dest, nm)

        protome_file_name = proteome.split(os.path.sep)[-1].split(".")[0] + \
            ".fasta"
        protome_file_name = "_".join(protome_file_name.split())

        pfile = basename(proteome.split(".")[0] + "_mod.fas")

        shutil.move(join(dest, "backstage_files", pfile),
                    join(cf_dir, protome_file_name))

    # Dump the accumulated header mappings to a CSV file, if available
    json_f = join(dest, "backstage_files", "header_mapping.json")
    header_f = join(dest, "backstage_files", "header_mapping.csv")
    if os.path.exists(json_f):
        with open(json_f) as fh, open(header_f, "w") as ofh:
            header_map = json.load(fh)
            for k, v in header_map.items():
                ofh.write("{}; {}\n".format(k, v))


def execute(db_dir, nm=None):

    con = lite.connect(os.path.join(db_dir, "orthoDB.db"))

    with con:

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.total = 4
            nm.counter = 0
            nm.msg = None

        con.create_function("log", 1, log)
        cur = con.cursor()

        for func in [commonTempTables, orthologs, inparalogs, coorthologs]:
            if nm:
                if nm.stop:
                    raise KillByUser("")
                nm.counter += 1
            func(cur)

    con.close()


def mcl_groups(inflation_list, mcl_prefix, start_id, group_file, dest,
               nm=None):

    print_col("Dumping groups", GREEN, 1)

    # Create a results directory
    results_dir = join(dest, "Orthology_results")
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    mcl_output = join(dest, "backstage_files", "mclOutput_")

    if nm:
        if nm.stop:
            raise KillByUser("")
        nm.total = len(inflation_list)
        nm.counter = 0

    for val in inflation_list:

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1

        MclGroups.mcl_to_groups(
            mcl_prefix,
            start_id,
            mcl_output + val.replace(".", ""),
            os.path.join(results_dir, group_file + "_" + str(val) + ".txt"),
            nm=nm)


def get_pairs(dest="./", ns=None):
    """
    Parses the output of USEARCH and creates a dictionary with the header
    pairs between original protein and transcripts
    """

    file_h = open(join(dest, "pairs.out"))
    pair_db = {}

    if ns:
        if ns.stop:
            raise KillByUser("")
        # Count lines to set up the progress bar
        p = 0
        with open(join(dest, "pairs.out")) as f:
            for p, _ in enumerate(f):
                pass
        ns.max_pb = p + 1
        ns.progress = 0

    for l in file_h:

        if ns:
            if ns.stop:
                raise KillByUser("")
            ns.progress += 1

        fields = l.split("\t")
        pair_db[fields[0]] = fields[1]

    file_h.close()
    return pair_db


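# Minimal usage sketch (assumption, not part of the original module):
# pairs.out is expected to be a tab-delimited USEARCH table whose first two
# columns are the protein header and the matching transcript header, so
# get_pairs returns a dict such as {"Taxon1|seq1": "transcript_1;;desc"}.
# The "example_output" directory below is hypothetical.
def _example_get_pairs():
    pair_db = get_pairs(dest="example_output")
    for protein, transcript in list(pair_db.items())[:5]:
        print("{} -> {}".format(protein, transcript))

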
def create_db(f_list, dest="./", ns=None):
    """
    Creates a fasta database file containing the translated protein
    sequences from the cds files. The final transcripts.fas file will be
    used by USEARCH to get matches between the original protein sequences
    and their nucleotide counterparts. A dictionary database will also be
    created where the transcript headers will be associated with the
    original DNA sequence, so that they will be later retrieved
    :param f_list: List containing the file names of the transcript files
    """

    output_handle = open(join(dest, "transcripts.fas"), "w")
    id_dic = {}

    if ns:
        if ns.stop:
            raise KillByUser("")
        ns.progress = 0
        ns.max_pb = len(f_list)

    for f in f_list:

        handle = open(f)
        seq = ""
        header = ""

        if ns:
            if ns.stop:
                raise KillByUser("")
            ns.progress += 1

        for line in handle:

            if ns:
                if ns.stop:
                    raise KillByUser("")

            if line.startswith(">"):
                if seq != "":
                    aa_seq = translate(seq)
                    output_handle.write(">%s\n%s\n" % (header, aa_seq))
                    id_dic[header] = seq
                header = line.strip()[1:].replace(" ", ";;")
                seq = ""
            else:
                seq += line.strip()

        # Flush the last record of the file, which is not written by the
        # loop above
        if seq != "":
            aa_seq = translate(seq)
            output_handle.write(">%s\n%s\n" % (header, aa_seq))
            id_dic[header] = seq

        handle.close()

    output_handle.close()

    return id_dic


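# Illustrative sketch (assumption, not part of the original module): building
# the translated-protein database from two hypothetical CDS FASTA files. The
# returned dictionary maps each (space-escaped) transcript header to its
# original nucleotide sequence, while "transcripts.fas" holds the translated
# protein sequences used as the USEARCH database.
def _example_create_db():
    id_db = create_db(["speciesA_cds.fas", "speciesB_cds.fas"],
                      dest="example_output")
    print("Stored {} transcript sequences".format(len(id_db)))

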
def execute(db_dir, dest, nm=None):

    con = lite.connect(os.path.join(db_dir, "orthoDB.db"))

    # Set up progression information
    if nm:
        if nm.stop:
            raise KillByUser("")
        nm.total = 4
        nm.counter = 0

    with con:
        cur = con.cursor()

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter = 1

        printOrthologsFile(cur, os.path.join(dest, "backstage_files",
                                             "orthologs.txt"), nm=nm)

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter = 2

        printInparalogsFile(cur, os.path.join(dest, "backstage_files",
                                              "inparalogs.txt"), nm=nm)

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter = 3

        printCoOrthologsFile(cur, os.path.join(dest, "backstage_files",
                                               "coorthologs.txt"), nm=nm)

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter = 4

        printMclAbcFile(cur, os.path.join(dest, "backstage_files",
                                          "mclInput"), nm=nm)

    con.close()


def check_unique_field(proteome_file, verbose=False, nm=None):
    """
    Checks the original proteome file for a field in the fasta header
    that is unique to all sequences
    """

    # Some files may have utf8 encoding problems so I used codecs here
    file_handle = codecs.open(proteome_file, "r", "cp1252")
    header_list = []
    header = ""

    for line in file_handle:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            header = line[1:].strip()
            # Store header in list format
            header_list.append(header.split("|"))

    # Get the size of the header fields
    header_field_size = len(header.split("|"))

    for i in range(header_field_size):

        if nm:
            if nm.stop:
                raise KillByUser("")

        temp_list = []
        for header in header_list:
            temp_list.append(header[i])

        if len(temp_list) == len(set(temp_list)) and len(set(temp_list)) == \
                len(header_list):

            # Header fields are 0-indexed here; prep_fasta uses the returned
            # index directly when selecting the unique field
            if verbose:
                print_col("\t Using unique header field {}".format(i),
                          GREEN, 1)
            return i

    # Ideally, a unique field should be found before this code. If not, raise
    # exception
    raise NoUniqueField("The proteome file {} has no unique field".format(
        os.path.basename(proteome_file)))


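# Illustrative usage sketch (assumption, not part of the original pipeline):
# running the header-field detection on a single proteome file. The path
# "example_proteome.fasta" and the headers it is assumed to contain
# (e.g. ">sp|Q9XYZ1|PROT1_HUMAN description") are hypothetical.
def _example_check_unique_field():
    idx = check_unique_field("example_proteome.fasta", verbose=True)
    # With pipe-delimited headers like "sp|Q9XYZ1|PROT1_HUMAN", field 1 (the
    # accession) is typically the unique one, so idx would be 1.
    print("Unique header field index: {}".format(idx))

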
def printMclAbcFile(cur, filename, nm=None):

    cur.execute(
        "select sequence_id_a, sequence_id_b, normalized_score\
        from InParalog\
        union\
        select sequence_id_a, sequence_id_b, normalized_score\
        from Ortholog\
        union\
        select sequence_id_a, sequence_id_b, normalized_score\
        from CoOrtholog")

    file_fh = open(filename, "w")

    with file_fh:
        while True:

            if nm:
                if nm.stop:
                    raise KillByUser("")

            row = cur.fetchone()
            if row is None:
                break

            file_fh.write("{}\t{}\t{}\n".format(
                row[0], row[1], str((float(row[2]) * 1000 + .5) / 1000)))


def convert_protein_file(pairs, group_obj, id_db, output_dir, shared_ns):
    """
    Converts a given protein file into its corresponding nucleotide
    sequences, using a database previously set up by the create_db function
    :return:
    """

    # Create handle for file storing bad sequence headers.
    bad_file = open(join(output_dir, "missed_sequences.log"), "w")

    for line, cl in zip(group_obj.groups(),
                        group_obj.iter_species_frequency()):

        if shared_ns:
            if shared_ns.stop:
                raise KillByUser("")

        if group_obj._get_compliance(cl) == (1, 1):

            line = group_obj._remove_tx(line)

            fields = line.split(":")
            orto_name = fields[0]
            seq_headers = fields[-1].split()

            f_handle = open(join(output_dir, orto_name) + ".fas", "w")

            for h in seq_headers:
                if h in pairs:
                    seq = id_db[pairs[h]]
                    shared_ns.good += 1
                    f_handle.write(">%s\n%s\n" % (h.replace(";;", " "), seq))
                else:
                    shared_ns.missed += 1
                    bad_file.write("{}\t{}\n".format(orto_name, h))

            f_handle.close()

    bad_file.close()


def adjust_fasta(file_list, dest, nm=None):

    print_col("Adjusting proteome files", GREEN, 1)

    # Create compliant fasta directory
    cf_dir = join(dest, "backstage_files", "compliantFasta")
    if not os.path.exists(cf_dir):
        os.makedirs(cf_dir)
    else:
        for f in os.listdir(cf_dir):
            os.remove(join(cf_dir, f))

    # Setup progress information
    if nm:
        if nm.stop:
            raise KillByUser("")
        # Get total number of files for total progress
        nm.total = len(file_list)
        nm.counter = 0

    for proteome in file_list:
        # Get code for proteome
        code_name = proteome.split(os.path.sep)[-1].split(".")[0]

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1
            nm.msg = "Adjusting file {}".format(basename(proteome))

        # Check the unique ID field
        unique_id = check_unique_field(proteome, True, nm)

        # Adjust fasta
        # stg = prep_fasta(proteome, code_name, unique_id)
        prep_fasta(proteome, code_name, unique_id, nm)

        protome_file_name = proteome.split(os.path.sep)[-1].split(".")[0] + \
            ".fasta"

        shutil.move(
            proteome.split(".")[0] + "_mod.fas",
            join(cf_dir, protome_file_name))


def prep_fasta(proteome_file, code, unique_id, dest, verbose=False, nm=None):

    if verbose:
        print_col("\t Preparing file for USEARCH", GREEN, 1)

    # Storing header list to check for duplicates
    header_list = []

    # Get json with header mappings, if exists
    json_f = join(dest, "backstage_files", "header_mapping.json")
    if os.path.exists(json_f):
        with open(json_f) as fh:
            header_mapping = json.load(fh)
    else:
        header_mapping = {}

    # Will prevent writing
    lock = True

    # File handles
    file_in = open(proteome_file)
    pfile = basename(proteome_file.split(".")[0] + "_mod.fas")
    file_out_path = join(dest, "backstage_files", pfile)
    file_out = open(file_out_path, "w")

    for line in file_in:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            if line not in header_list:
                fields = line.split("|")
                unique_str = fields[unique_id].replace(" ", "_")
                header_mapping["%s|%s" % (code, unique_str)] = line.strip()
                header_list.append(line)
                file_out.write(">%s|%s\n" % (code, unique_str))
                lock = True
            else:
                lock = False
        elif lock:
            file_out.write(line)

    # Close file handles:
    file_in.close()
    file_out.close()

    with open(json_f, "w") as fh:
        json.dump(header_mapping, fh)


def mcl_to_groups(prefix, start_id, infile, outfile, nm=None):

    try:
        start_id = int(start_id)
    except ValueError:
        raise ValueError("StartId is not a number")

    input_file = open(infile, "r")
    out = open(outfile, "w")

    for line in input_file:

        if nm:
            if nm.stop:
                raise KillByUser("")

        out.write(prefix + str(start_id) + ": " + line)
        start_id += 1

    input_file.close()
    out.close()


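# Minimal sketch (assumption, not part of the original module) of the
# transformation mcl_to_groups performs: each tab-separated MCL cluster line
# is prefixed with a sequential group name. The file names and prefix below
# are hypothetical.
def _example_mcl_to_groups():
    # Given an "mclOutput_11" line such as:
    #   Taxon1|seq1\tTaxon2|seq7\tTaxon3|seq2
    # the output file would contain:
    #   ORTHO1000: Taxon1|seq1\tTaxon2|seq7\tTaxon3|seq2
    mcl_to_groups("ORTHO", "1000", "mclOutput_11", "groups_1.1.txt")

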
def export_filtered_groups(inflation_list, group_prefix, gene_t, sp_t, sqldb,
                           db, tmp_dir, dest, nm=None):

    print_col("Exporting filtered groups to protein sequence files", GREEN, 1)

    stats_storage = {}
    groups_obj = OT.MultiGroupsLight(tmp_dir)

    if nm:
        if nm.stop:
            raise KillByUser("")

    for val in inflation_list:
        # Create a directory that will store the results for the current
        # inflation value
        inflation_dir = join(dest, "Orthology_results", "Inflation%s" % val)
        if not os.path.exists(inflation_dir):
            os.makedirs(inflation_dir)

        group_file = join(dest, "Orthology_results",
                          group_prefix + "_%s.txt" % val)

        # Create Group object
        group_obj = OT.GroupLight(group_file, gene_t, sp_t)

        # Add group to the MultiGroups object
        groups_obj.add_group(group_obj)

        # Export filtered groups and return stats to present in the app
        stats = group_obj.basic_group_statistics()

        # Retrieve fasta sequences from the filtered groups
        group_obj.retrieve_sequences(sqldb, db,
                                     dest=join(inflation_dir, "Orthologs"),
                                     shared_namespace=nm)

        # os.remove(sqldb)
        stats_storage[val] = stats

    return stats_storage, groups_obj


def prep_fasta(proteome_file, code, unique_id, verbose=False, nm=None):

    if verbose:
        print_col("\t Preparing file for USEARCH", GREEN, 1)

    # Storing header list to check for duplicates
    header_list = []

    # Storing dictionary with header and sequence for later use
    seq_storage = {}

    # Will prevent writing
    lock = True

    # File handles
    file_in = open(proteome_file)
    file_out = open(proteome_file.split(".")[0] + "_mod.fas", "w")

    for line in file_in:

        if nm:
            if nm.stop:
                raise KillByUser("")

        if line.startswith(">"):
            if line not in header_list:
                fields = line.split("|")
                unique_str = fields[unique_id].replace(" ", "_")
                seq_storage["%s|%s" % (code, unique_str)] = ""
                header_list.append(line)
                file_out.write(">%s|%s\n" % (code, unique_str))
                lock = True
            else:
                lock = False
        elif lock:
            seq_storage["%s|%s" % (code, unique_str)] += line.strip()
            file_out.write(line)

    # Close file handles:
    file_in.close()
    file_out.close()

    return seq_storage


def printCoOrthologsFile(cur, filename, nm=None):

    cur.execute(
        "select taxon_id_a, taxon_id_b, sequence_id_a, sequence_id_b, normalized_score\
        from CoOrtholog\
        order by taxon_id_a, taxon_id_b, sequence_id_a, sequence_id_b asc")

    file_fh = open(filename, "w")

    with file_fh:
        while True:

            if nm:
                if nm.stop:
                    raise KillByUser("")

            row = cur.fetchone()
            if row is None:
                break

            file_fh.write("{}\t{}\t{}\n".format(
                row[2], row[3], str((float(row[4]) * 1000 + .5) / 1000)))


def orthomcl_filter_fasta(input_dir, min_length, max_stop_percent, db, dest,
                          nm=None):

    def handle_seq(seq, length, stop_cnt):
        is_bad = 0
        stop_percent = ((length - stop_cnt) / length) * 100
        if length < min_length or stop_percent > max_stop_percent:
            bad.write(seq + "\n")
            is_bad = 1
        else:
            good.write(seq + "\n")
        return is_bad

    good = open(os.path.join(dest, "backstage_files", db), "w")
    bad = open(os.path.join(dest, "backstage_files", "poorProteins.txt"), "w")

    filenames = [os.path.join(input_dir, x) for x in os.listdir(input_dir)]

    reject_rates = []

    # Setup progression information
    if nm:
        if nm.stop:
            raise KillByUser("")
        nm.total = len(filenames)
        nm.counter = 0

    for filename in filenames:

        if nm:
            if nm.stop:
                raise KillByUser("")
            nm.counter += 1
            nm.msg = "Filtering file {}".format(os.path.basename(filename))

        # Skip hidden files (check the base name, since the joined path
        # itself never starts with a dot)
        if os.path.basename(filename).startswith('.'):
            continue

        input_file = open(filename, 'r')

        seq_count = 0
        reject_seq_count = 0

        current_seq = ""
        current_len = 0
        current_stop_cnt = 0

        # process lines of one file
        for line in input_file:

            if nm:
                if nm.stop:
                    raise KillByUser("")

            if line.startswith('>'):
                if current_seq:
                    seq_count += 1
                    reject_seq_count += handle_seq(current_seq, current_len,
                                                   current_stop_cnt)
                # Start a new record with its header line so the written
                # FASTA entry keeps its header
                current_seq = line
                current_len = 0
                current_stop_cnt = 0
            else:
                line_len = len(line)
                current_len += line_len
                line = re.sub('[^A-Za-z]', '', line)
                current_stop_cnt += line_len - len(line)
                current_seq += line

        # Handle the last record of the file
        reject_seq_count += handle_seq(current_seq, current_len,
                                       current_stop_cnt)
        seq_count += 1

        # add file stats to reject count if it qualifies
        if reject_seq_count:
            pct = reject_seq_count / seq_count * 100
            if pct > 10:
                reject_rates.append([input_file, pct])

        input_file.close()

    good.close()
    bad.close()


def orthomcl_blast_parser(blast_file, fasta_dir, db_dir, nm):

    # create connection to DB
    con = lite.connect(os.path.join(db_dir, "orthoDB.db"))

    with con:

        # global cur
        cur = con.cursor()

        prev_subjectid = ''
        prev_queryid = ''
        # hash to hold subject info
        subject = {}

        # Set progress information
        if nm:
            if nm.stop:
                raise KillByUser("")
            total = 0
            for total, _ in enumerate(open(blast_file)):
                pass
            nm.total = total
            nm.msg = None
            nm.counter = 0

        # parse fasta files
        genes = get_genes(fasta_dir)

        blast_fh = open(blast_file, "r")

        for line in blast_fh:

            if nm:
                if nm.stop:
                    raise KillByUser("")
                nm.counter += 1

            splitted = line.split()

            query_id = splitted[0]
            subject_id = splitted[1]
            percent_identity = splitted[2]
            length = int(splitted[3])
            query_start = splitted[6]
            query_end = splitted[7]
            subject_start = splitted[8]
            subject_end = splitted[9]
            evalue = splitted[10]

            if query_id != prev_queryid or subject_id != prev_subjectid:

                # print previous subject
                if subject:
                    print_previous_subject(subject, cur)

                # initialize new one from first HSP
                prev_subjectid = subject_id
                prev_queryid = query_id

                # from first hsp
                tup = format_evalue(evalue)

                subject = {"queryId": query_id}
                subject["subjectId"] = subject_id
                subject["queryShorter"] = get_taxon_and_length(subject, genes)
                subject["evalueMant"] = tup[0]
                subject["evalueExp"] = tup[1]
                subject["totalIdentities"] = 0
                subject["totalLength"] = 0
                subject["hspspans"] = []

            # get additional info from subsequent HSPs
            hspspan = (subject_start, subject_end)
            if subject and subject["queryShorter"]:
                hspspan = (query_start, query_end)
            subject["hspspans"].append(hspspan)
            subject["totalIdentities"] += float(percent_identity) * length
            subject["totalLength"] += length

        # flush the last subject after the loop ends
        print_previous_subject(subject, cur)

    con.close()


def orto_execution(nm, temp_dir, proteome_files, protein_min_len,
                   protein_max_stop, usearch_file, usearch_evalue,
                   usearch_threads, usearch_output, mcl_file, mcl_inflation,
                   ortholog_prefix, group_prefix, orto_max_gene, orto_min_sp,
                   sqldb, ortho_dir, usearch_db):
    """
    Executes all pipeline subprocesses sequentially and updates the
    Progress dialog label
    """

    try:
        nm.finished_tasks = []

        nm.task = "schema"
        ortho_pipe.install_schema(temp_dir)
        nm.finished_tasks = ["schema"]

        if nm.stop:
            raise KillByUser("")

        nm.task = "adjust"
        ortho_pipe.adjust_fasta(proteome_files, ortho_dir, nm)
        nm.finished_tasks = ["schema", "adjust"]

        if nm.stop:
            raise KillByUser("")

        nm.task = "filter"
        ortho_pipe.filter_fasta(protein_min_len, protein_max_stop,
                                usearch_db, ortho_dir, nm)
        nm.finished_tasks = ["schema", "adjust", "filter"]

        if nm.stop:
            raise KillByUser("")

        nm.task = "usearch"
        ortho_pipe.allvsall_usearch(usearch_db, usearch_evalue, ortho_dir,
                                    usearch_threads, usearch_output,
                                    usearch_bin=usearch_file, nm=nm)
        nm.finished_tasks = ["schema", "adjust", "filter", "usearch"]

        if nm.stop:
            raise KillByUser("")

        nm.task = "parse"
        ortho_pipe.blast_parser(usearch_output, ortho_dir,
                                db_dir=temp_dir, nm=nm)
        nm.finished_tasks = ["schema", "adjust", "filter", "usearch",
                             "parse"]

        if nm.stop:
            raise KillByUser("")

        nm.task = "pairs"
        ortho_pipe.pairs(temp_dir, nm=nm)
        ortho_pipe.dump_pairs(temp_dir, ortho_dir, nm=nm)
        nm.finished_tasks = [
            "schema", "adjust", "filter", "usearch", "parse", "pairs"
        ]

        if nm.stop:
            raise KillByUser("")

        nm.task = "mcl"
        ortho_pipe.mcl(mcl_inflation, ortho_dir, mcl_file=mcl_file, nm=nm)
        nm.finished_tasks = [
            "schema", "adjust", "filter", "usearch", "parse", "pairs", "mcl"
        ]

        if nm.stop:
            raise KillByUser("")

        nm.task = "dump"
        ortho_pipe.mcl_groups(mcl_inflation, ortholog_prefix, "1000",
                              group_prefix, ortho_dir, nm=nm)
        nm.finished_tasks = [
            "schema", "adjust", "filter", "usearch", "parse", "pairs", "mcl",
            "dump"
        ]

        if nm.stop:
            raise KillByUser("")

        nm.task = "filter_groups"
        stats, groups_obj = ortho_pipe.export_filtered_groups(
            mcl_inflation,
            group_prefix,
            orto_max_gene,
            orto_min_sp,
            sqldb,
            join(ortho_dir, "backstage_files", usearch_db),
            temp_dir,
            ortho_dir,
            nm=nm)
        nm.finished_tasks = [
            "schema", "adjust", "filter", "usearch", "parse", "pairs", "mcl",
            "dump", "filter_groups"
        ]

        if nm.stop:
            raise KillByUser("")

        # stats is a dictionary containing the inflation value as
        # key and a list with the orthologs as value
        nm.stats = stats
        nm.groups = groups_obj

    except KillByUser:
        return

    except IOError as e:
        nm.exception = str(e)
        print(e)
        return

    except Exception as e:
        logging.exception("Unexpected exit in Orthology search")
        nm.exception = str(e)


def convert_group(sqldb, cds_file_list, protein_db, group_sequences,
                  usearch_bin, output_dir, shared_namespace=None):
    """
    Convenience function that wraps all required operations to convert
    protein to nucleotide files from a Group object
    """

    if shared_namespace:
        shared_namespace.act = "Creating database"
        shared_namespace.missed = 0
        shared_namespace.good = 0

    # Create database
    id_db = create_db(cds_file_list, output_dir, shared_namespace)

    if shared_namespace:
        shared_namespace.act = "Creating query"
        # Kill switch
        if shared_namespace.stop:
            raise KillByUser("")

    # Create query for USEARCH
    group_sequences.retrieve_sequences(sqldb, protein_db, output_dir,
                                       outfile="query.fas",
                                       shared_namespace=shared_namespace)

    if shared_namespace:
        # Kill switch
        if shared_namespace.stop:
            raise KillByUser("")

    # Execute search
    if shared_namespace:
        shared_namespace.act = "Performing search"

    pair_search(usearch_bin, output_dir)

    if shared_namespace:
        # Kill switch
        if shared_namespace.stop:
            raise KillByUser("")

    pair_db = get_pairs(output_dir, ns=shared_namespace)

    # Convert files
    if shared_namespace:
        shared_namespace.act = "Converting to nucleotide"

    convert_protein_file(pair_db, group_sequences, id_db, output_dir,
                         shared_namespace)

    # Remove temporary files
    temp_files = [
        join(output_dir, "query.fas"),
        join(output_dir, "transcripts.fas"),
        join(output_dir, "pairs.out")
    ]

    for f in temp_files:
        os.remove(f)


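# Usage sketch (assumption, not part of the original module): wiring
# convert_group with hypothetical paths. It assumes a GroupLight-like object
# in `group_sequences`, a sequence database at "sequences.sqlite3", a protein
# FASTA database "goodProteins.fasta", and a local USEARCH binary.
def _example_convert_group(group_sequences):
    convert_group(sqldb="sequences.sqlite3",
                  cds_file_list=["speciesA_cds.fas", "speciesB_cds.fas"],
                  protein_db="goodProteins.fasta",
                  group_sequences=group_sequences,
                  usearch_bin="./usearch",
                  output_dir="example_output")

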