import json
import os
from functools import partial
from itertools import islice
from multiprocessing import Pool

from flask import jsonify, request
from pymongo import MongoClient

# log, utils, and the remaining module-level names used below (NONE_RANK,
# MINIMUM_RANKS, INPUT_FILE, get_taxid_from_sequence, get_rank_from_taxid,
# get_new_values, update_one, compare_sequence, parse_tax) are defined
# elsewhere in the project.


def extract_comparisons_from_file(filename):
    comparisons = []
    total = 0
    with open(filename) as f:
        data = f.readlines()
    sequences = []
    for row in data:
        # Rows starting with "lcl|" (the BLAST-style local-id prefix) carry
        # one comparison each.
        if row[:4] == "lcl|":
            sequences.append({
                "id": total,
                "values": [value.strip() for value in row.split(" ")]
            })
            total += 1
    log.datetime_log("Starting process for filename {}".format(filename))
    with Pool(processes=10) as pool:
        comparisons = pool.map(partial(get_relevant_data, total=total),
                               sequences)
    log.datetime_log("Finishing process for filename {}".format(filename))
    return comparisons

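# Example driver, a minimal sketch only: the folder and the ".out.txt" suffix
# are assumptions borrowed from remove_files() below, not a confirmed part of
# the pipeline.
#
#     import glob
#     for path in glob.glob("alignments/*.out.txt"):
#         comparisons = extract_comparisons_from_file(path)
#         log.datetime_log("{}: {} comparisons".format(path, len(comparisons)))
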
def get_relevant_data(values, total):
    count = values["id"]
    values = values["values"]
    taxid = get_taxid_from_sequence(values[2])
    organism_result = get_taxonomy_from_taxid(taxid)
    # Walk the row from the right and keep the second token that parses as a
    # float; that token is treated as the comparison score.
    i = len(values) - 1
    cont = 0
    score = 0
    while i >= 0:
        try:
            num = float(values[i])
            cont += 1
            if cont == 2:
                score = num
                break
        except ValueError:
            pass
        i -= 1
    count += 1
    organism_result["SCORE"] = score
    log.datetime_log(
        "Classified sequence with id.{} out of {} sequences.".format(
            count, total))
    return organism_result

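# The scan above, in isolation: a self-contained helper that returns the
# second float encountered reading right-to-left (None if there are fewer
# than two numeric tokens). In BLAST-style tabular output that position is
# plausibly the bit score, but that reading is an assumption.
def second_float_from_right(tokens):
    found = 0
    for token in reversed(tokens):
        try:
            value = float(token)
        except ValueError:
            continue
        found += 1
        if found == 2:
            return value
    return None

# second_float_from_right(["lcl|hit", "98.5", "0.0", "512"]) -> 0.0
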
def post_prune_trees():
    output = {}
    try:
        data = request.get_json()
        merged_tree = json.loads(data['mergedTree'])
        threshold = float(data['threshold'])
        sequences = list(merged_tree['SCORE'].keys())
        saved_sequences, rest = utils.get_unsaved_sequences(sequences)
        pruned_sequences = []
        for sequence in saved_sequences:
            pruned_sequence = {}
            pruned_sequence['sequence_id'] = sequence['sequence_id']
            pruned_sequence['hierarchy'] = utils.prune_tree(
                threshold, sequence['hierarchy'])
            pruned_sequences.append(pruned_sequence)
        pruned_tree = utils.prune_tree(threshold, merged_tree)
        output['pruned_sequences'] = pruned_sequences
        output['pruned_tree'] = pruned_tree
        return jsonify(output)
    except Exception as e:
        output["Error"] = str(e)
        log.datetime_log("Error: {}".format(e))
        return jsonify(output)

def get_taxonomy_from_taxid(taxid):
    # Copy the template so repeated calls do not mutate the shared NONE_RANK
    # dict.
    taxonomy_dict = dict(NONE_RANK)
    try:
        rank, tax_name, parent_taxid = get_rank_from_taxid(taxid)
        while parent_taxid != 1:
            if rank != "NO RANK":
                taxonomy_dict[rank] = tax_name
            rank, tax_name, parent_taxid = get_rank_from_taxid(parent_taxid)
        # Check if it has the minimum rankings
        for min_rank in MINIMUM_RANKS:
            if min_rank not in taxonomy_dict:
                # Fall back to any rank whose name contains the missing one
                # (e.g. "subfamily" can stand in for "family").
                possible_ranks = [
                    rank for rank in taxonomy_dict if min_rank in rank
                ]
                if len(possible_ranks) > 0:
                    taxonomy_dict[min_rank] = taxonomy_dict[possible_ranks[0]]
                else:
                    taxonomy_dict[min_rank] = "undefined"
        return taxonomy_dict
    except Exception:
        log.datetime_log("Not able to find rank of taxid {}".format(taxid))
        return dict(NONE_RANK)

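# The rank back-fill above, as a standalone function for testing. The sample
# rank names in the usage note are illustrative assumptions; the real
# MINIMUM_RANKS list lives elsewhere in the project.
def backfill_minimum_ranks(taxonomy, minimum_ranks):
    for min_rank in minimum_ranks:
        if min_rank not in taxonomy:
            candidates = [rank for rank in taxonomy if min_rank in rank]
            taxonomy[min_rank] = (
                taxonomy[candidates[0]] if candidates else "undefined")
    return taxonomy

# backfill_minimum_ranks({"subfamily": "Homininae"}, ["family", "genus"])
# -> {"subfamily": "Homininae", "family": "Homininae", "genus": "undefined"}
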
def parse_names_file(filename, **kwargs):
    batch_size = kwargs.get("batch_size", 1)
    processes = kwargs.get("processes", 1)
    with open(filename, "r") as f:
        for i, data in enumerate(
                iter(lambda: tuple(islice(f, batch_size)), ())):
            log.datetime_log("Started batch no. {} name parser".format(i))
            with Pool(processes=processes) as pool:
                # Parse each row once, then keep only entries with a name.
                parsed_rows = [get_new_values(row) for row in data]
                filtered_data = [
                    values for values in parsed_rows if "name" in values
                ]
                pool.map(update_one, filtered_data)

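# The iter(callable, sentinel) idiom above pulls batch_size lines at a time
# until islice comes back empty. The same idiom, isolated and runnable:
def batches(iterable, size):
    it = iter(iterable)
    return iter(lambda: tuple(islice(it, size)), ())

# list(batches("abcdefg", 3)) -> [('a', 'b', 'c'), ('d', 'e', 'f'), ('g',)]
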
def remove_files(folder, processed_file):
    count_removed = 0
    # Count lines during the single pass; the handle cannot be re-read after
    # iteration.
    count_total = 0
    with open(processed_file, 'r') as existing_files:
        with open(INPUT_FILE, 'w') as input_file:
            for line in existing_files:
                count_total += 1
                info_existing = line.split(":")
                existing_filename = info_existing[2].replace('"', "").replace(
                    "{", "").replace("}", "").strip(" \t\n\r")
                input_file.write("{},\n".format(existing_filename))
                existing_filename += ".out.txt"
                existing_path = os.path.join(folder, existing_filename)
                if os.path.isfile(existing_path):
                    count_removed += 1
                    os.remove(existing_path)
    log.datetime_log("Removed {} files out of {} saved models".format(
        count_removed, count_total))

def post_prune_single_tree():
    output = {}
    try:
        data = request.get_json()
        tree = json.loads(data['tree'])
        threshold = float(data['threshold'])
        pruned_tree = utils.prune_tree(threshold, tree)
        output['pruned_tree'] = pruned_tree
        return jsonify(output)
    except Exception as e:
        output["Error"] = str(e)
        log.datetime_log("Error: {}".format(e))
        return jsonify(output)

def parse_sequences(TMP_FOLDER, filename, **kwargs):
    batch_size = kwargs.get("batch_size", 1)
    processes = kwargs.get("processes", 1)
    with open(filename, "r") as f:
        output_paths = []
        for i, data in enumerate(
                iter(lambda: tuple(islice(f, batch_size)), ())):
            log.datetime_log(
                "Started batch no. {} sequence parser".format(i))
            with Pool(processes=processes) as pool:
                # Skip FASTA header rows; only sequence lines are compared.
                filtered_data = [
                    row for row in data if not row.startswith(">")
                ]
                output_paths.extend(
                    pool.map(partial(compare_sequence, TMP_FOLDER=TMP_FOLDER),
                             filtered_data))
    return output_paths

def generate_and_update_hierarchies(row):
    sequence_id = row.strip(" \t\n\r").replace(",", "")
    with MongoClient() as client:
        db = client.biovis
        db_models = db.models
        search = {"sequence_id": sequence_id}
        saved = db_models.find_one(search)
        if saved is not None and saved.get("comparisons") is not None:
            tmp_tree, tmp_hierarchy = utils.get_hierarchy_from_dict(
                sequence_id, saved["comparisons"])
            update = {"hierarchy": tmp_hierarchy, "tree": tmp_tree}
            db_models.update_one(search, {"$set": update})
            log.datetime_log("Updated document {}".format(sequence_id))

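# Example driver, a sketch only: an ids file with one sequence id per line
# (trailing commas tolerated) is a hypothetical input.
#
#     with open("sequence_ids.txt") as ids_file:
#         for row in ids_file:
#             generate_and_update_hierarchies(row)
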
def post_compare_sequence():
    output = {}
    try:
        merged_tree = {'name': '', 'children': {}, 'SCORE': []}
        data = request.get_json()
        if "batch_size" not in data:
            data["batch_size"] = 1
        data["sequences"] = [
            sequence.strip(" \t\n\r") for sequence in data["sequences"]
        ]
        # Detect sequences processed before
        saved_sequences, tmp_sequences = utils.get_unsaved_sequences(
            data["sequences"])
        # Include previously saved sequences
        processed_batch = saved_sequences.copy()
        for saved_sequence in processed_batch:
            utils.get_hierarchy_from_dict(saved_sequence['sequence_id'],
                                          saved_sequence['comparisons'],
                                          target=merged_tree)
        counter = 0
        current_batch_stop = 0
        pieces_left = len(tmp_sequences) > 0
        while pieces_left:
            tmp_sequences = tmp_sequences[current_batch_stop:]
            num_sequences_left = len(tmp_sequences)
            if data["batch_size"] < num_sequences_left:
                current_batch_stop = data["batch_size"]
            else:
                current_batch_stop = num_sequences_left
                pieces_left = False
            # Compare unprocessed sequences in the current batch only
            batch = tmp_sequences[:current_batch_stop]
            file_batch = [
                utils.compare_sequence(sequence) for sequence in batch
            ]
            counter += len(batch)
            log.datetime_log("{} sequences compared.".format(counter))
            # Generate tree for the current batch
            merged_tree, unsaved_batch = utils.process_batch(
                batch, file_batch, merged_tree)
            processed_batch.extend(unsaved_batch)
        # Prepare output
        hierarchy, aggregated_score = utils.form_hierarchy(merged_tree)
        output["merged_tree"] = hierarchy
        output["taxonomies_batch"] = processed_batch
        log.datetime_log("{} hierarchies formed.".format(counter))
        return jsonify(output)
    except Exception as e:
        output["Error"] = str(e)
        log.datetime_log("Error: {}".format(e))
        return jsonify(output)

def upload_file():
    output = {}
    try:
        data = request.get_json()
        if data["file"] is not None and data["filename"] is not None:
            taxonomy = []
            parsed_filename = data["filename"].split(".")[0]
            merged_tree = {'name': '', 'children': {}, 'SCORE': []}
            try:
                file_path = utils.try_to_save_file(data["file"],
                                                   data["filename"])
                log.datetime_log("Succeeded saving file.")
                merged_tree, taxonomy = utils.process_batch(
                    [parsed_filename], [file_path], merged_tree)
            except utils.FileExists:
                taxonomy, tmp_sequences = utils.get_unsaved_sequences(
                    [parsed_filename])
                if len(taxonomy) == 0:
                    sequence_id = utils.get_sequence_id(data["filename"])
                    if sequence_id is not None:
                        log.datetime_log(
                            "File existed and sequence {} parsed "
                            "successfully.".format(sequence_id))
                        taxonomy, tmp_sequences = utils.get_unsaved_sequences(
                            [sequence_id])
                if len(taxonomy) > 0:
                    utils.get_hierarchy_from_dict(taxonomy[0]['sequence_id'],
                                                  taxonomy[0]['comparisons'],
                                                  target=merged_tree)
                else:
                    log.datetime_log(
                        "File existed but sequence not parsed: trying to "
                        "write a new file.")
                    # Retry with a numeric modifier until a free name is
                    # found, giving up after 50 attempts.
                    file_path = ""
                    cont = 0
                    while len(file_path) == 0 and cont < 50:
                        try:
                            file_path = utils.try_to_save_file(
                                data["file"], data["filename"], modifier=cont)
                        except utils.FileExists:
                            cont += 1
                    log.datetime_log(
                        "File successfully saved at {}.".format(file_path))
                    merged_tree, taxonomy = utils.process_batch(
                        [parsed_filename], [file_path], merged_tree)
            # Prepare output
            hierarchy, aggregated_score = utils.form_hierarchy(merged_tree)
            output["merged_tree"] = hierarchy['children'][0]
            output["taxonomies_batch"] = taxonomy
        return jsonify(output)
    except Exception as e:
        output["Error"] = str(e)
        log.datetime_log("Error: {}".format(e))
        return jsonify(output)

def read_in_chunks(file, batch_size):
    # Yield successive batches of parsed nodes until the file is exhausted.
    while True:
        log.datetime_log("Started batch node parser")
        data = [parse_tax(x) for x in islice(file, batch_size)]
        if not data:
            return
        yield data

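# Usage sketch for read_in_chunks; parse_tax and the taxonomy-dump path are
# project-specific, so the file name here is a hypothetical example.
#
#     with open("nodes.dmp") as nodes_file:
#         for batch in read_in_chunks(nodes_file, 1000):
#             ...  # persist or further process the parsed batch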