def prepare_data(self):
    """Build the full database set: run setup once, then for every data
    folder load the originals into LMDB, encode them to proteins and dump
    a FASTA file, finally persist the text counts and build the BLAST DB.
    """
    self.initial_setup(self.output_folder)
    # One LMDB/encode/FASTA pass per extracted data folder.
    for data_folder_path, data_folder_name in self.extract_data_folders():
        self.data_to_lmdb(data_folder_name, data_folder_path)
        encoder = DataEncoder(data_folder_path, self.output_folder,
                              self.threads, self.language, data_folder_name)
        encoder.encode_data()
        self.make_fasta_file(data_folder_name)
        self.clean_encoded()
        self.logger.info("Added DB: {} \t {} new texts".format(
            data_folder_name, self.text_counts[-1][1]))
    # Summary bookkeeping and the final BLAST database build.
    self.save_text_counts()
    self.make_db()
class DataPreparer:
    """Prepare raw gzipped JSON-lines text data for BLAST querying.

    Pipeline: create the output folder tree, load the original texts into
    LMDB databases (for the reconstruction phase), encode texts to
    pseudo-protein sequences, dump them to a FASTA file and build a BLAST
    protein database with ``makeblastdb``.
    """

    def __init__(self, data_location, output_folder, threads, language,
                 split_size, logger):
        # data_location: a directory of input files, or a single file path.
        self.data_location = data_location
        self.output_folder = output_folder
        self.threads = threads
        self.language = language
        self.data_encoder = DataEncoder(data_location, output_folder,
                                        threads, language)
        # split_size > 0 cuts each text into fixed-size chunks; 0 keeps
        # each text whole.
        self.split_size = split_size
        self.logger = logger

    def make_directory(self, where):
        """Create *where* (including parents) if it does not exist yet."""
        if not os.path.exists(where):
            os.makedirs(where)

    def prepare_data(self):
        """Run the whole preparation pipeline end to end."""
        self.initial_setup(self.output_folder)
        self.data_to_lmdb()
        self.logger.info("Encoding data to proteins...")
        self.data_encoder.encode_data()
        self.generate_db()

    def split_text_into_blocks(self, block):
        """Yield ``(text_chunk, new_doc_id)`` pairs for one JSON record.

        With ``split_size > 0`` the text is cut into fixed-size chunks and
        each chunk id gets a ``__start_end`` suffix; otherwise the whole
        text is yielded under the original doc_id.
        """
        text = block["text"]
        doc_id = str(block["doc_id"])
        if self.split_size > 0:
            for i in range(0, len(text), self.split_size):
                yield (text[i:i + self.split_size],
                       doc_id + "__{}_{}".format(i, i + self.split_size))
        else:
            yield text, doc_id

    ## Generate a LMDB DB for the original data. Helps in reconstructing
    ## phase, this is done with just ONE thread :c
    def data_to_lmdb(self):
        """Load every input record into the text and info LMDB databases."""
        self.logger.info("Loading original data into databases...")
        text_db, info_db = self.open_databases()
        files, folder = self.get_data_files()
        try:
            with text_db.begin(write=True) as t_db, \
                    info_db.begin(write=True) as i_db:
                for filename in files:
                    with gzip.open(os.path.join(folder, filename),
                                   "rt") as data_file:
                        for line in data_file:
                            if not line:
                                continue
                            block = json.loads(line.strip())
                            if len(block["text"]) == 0:
                                continue
                            for text_block, new_id in \
                                    self.split_text_into_blocks(block):
                                block_data = dict(block)
                                block_data["text"] = text_block
                                ## Only ASCII in doc_id! BLAST doesn't like
                                ## non ascii characters in fsa
                                doc_id = new_id.encode("ascii",
                                                       errors="ignore")
                                t_db.put(
                                    doc_id,
                                    block_data["text"].encode(
                                        "unicode-escape"))
                                # The info DB stores the metadata only.
                                del block_data["text"], block_data["doc_id"]
                                i_db.put(
                                    doc_id,
                                    json.dumps(block_data).encode(
                                        "unicode-escape"))
        finally:
            # Fix: the environments were previously never closed (leak).
            text_db.close()
            info_db.close()

    def get_data_files(self):
        """Return ``(filenames, folder)`` for the configured data location.

        ``data_location`` may be a directory (all files inside are used) or
        a single file path.
        """
        if os.path.isdir(self.data_location):
            return os.listdir(self.data_location), self.data_location
        # Fix: the old split("/")[0]/[1] indexing returned the wrong
        # folder/file for paths with more than two components.
        folder, filename = os.path.split(self.data_location)
        return [filename], folder

    def open_databases(self):
        """Open (creating if needed) the original-text and info LMDB envs."""
        text_env = lmdb.open(self.output_folder + "/db/original_data_DB",
                             map_size=50000000000)
        info_env = lmdb.open(self.output_folder + "/db/info_DB",
                             map_size=5000000000)
        return text_env, info_env

    ## Generate the protein database for BLAST
    def generate_db(self):
        """Dump the encoded texts to FASTA and build the BLAST database."""
        self.logger.info("Generating protein database..")
        self.make_fasta_file()
        self.make_db()

    def make_fasta_file(self):
        """Write every encoded text as a FASTA record with a sequential gi.

        Sets ``self.text_count`` to the number of records written.
        """
        encoded_db = lmdb.open(self.output_folder + "/db/encoded_data_DB",
                               readonly=True)
        gi = 0
        try:
            with encoded_db.begin() as db, \
                    open(self.output_folder + "/db/database.fsa",
                         "w") as fasta_file:
                for gi, (key, value) in enumerate(db.cursor(), start=1):
                    doc_id = key.decode("utf-8")
                    text = value.decode("unicode-escape")
                    fasta_file.write(">gi|{} {}\n{}\n".format(gi, doc_id,
                                                              text))
        finally:
            # Fix: close the read-only environment (was leaked before).
            encoded_db.close()
        self.text_count = gi

    def get_text_count(self):
        """Number of records written by the last ``make_fasta_file`` run."""
        return self.text_count

    ## Make the DB using makeblastdb
    def make_db(self):
        """Build the BLAST protein database; sets ``self.db_loc``."""
        fasta_path = self.output_folder + "/db/database.fsa"
        db_path = self.output_folder + "/db/textdb"
        # Fix: pass an explicit argv list instead of .format().split(" "),
        # which broke for output folders containing spaces.
        subprocess.call([
            "makeblastdb", "-in", fasta_path, "-dbtype", "prot",
            "-title", "TextDB", "-parse_seqids", "-hash_index",
            "-out", db_path,
        ])
        self.db_loc = db_path

    ## Make intial folders for later
    def initial_setup(self, where):
        """Create the output folder tree used by the later pipeline stages."""
        self.logger.info("Performing initial setups...")
        self.make_directory(where)
        for location in [
                "encoded", "db", "subgraphs", "info", "batches", "clusters",
                "clusters/unfilled", "clusters/filled"
        ]:
            self.make_directory(where + "/" + location)