def find_ncbi_ids_from_gene2ensembl(ensembl_dict, gene2ensembl_file):
    """Input is gene2ensembl_file, which maps each NCBI gene ID to one Ensembl gene ID.

    'gene2ensembl' (useful columns in input file):
        col1: NCBI gene ID
        col2: Ensembl gene ID
    """
    print("step 3 start: find NCBI IDs from gene2ensembl file")
    file_in = anyfile(gene2ensembl_file)
    # with open(gene2ensembl_file) as file_in:
    try:
        next(file_in)  # skip the header line
        for line in file_in:
            split_line = line.split("\t")
            ensembl_gene_id_from_gene2ensembl = split_line[2].strip()
            ncbi_gene_id_from_gene2ensembl = split_line[1].strip()
            if ensembl_gene_id_from_gene2ensembl in ensembl_dict:
                ensembl_dict[ensembl_gene_id_from_gene2ensembl]['data'][
                    'gene2ensembl'].append(ncbi_gene_id_from_gene2ensembl)
    finally:
        file_in.close()
    count = 0
    for key in ensembl_dict:
        if len(ensembl_dict[key]['data']['gene2ensembl']) == 1:
            count += 1
    print(
        "Total number of Ensembl gene IDs mapping uniquely with gene2ensembl: ",
        count)
    print("step 3 end")
    return ensembl_dict, count
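

# A minimal, hypothetical driver for step 3. The ensembl_dict layout (each Ensembl gene ID
# keyed to a 'data' dict holding a 'gene2ensembl' list) is inferred from the lookups above;
# the Ensembl IDs and the file path are placeholders, not project data.
def _example_step3():
    ensembl_dict = {
        "ENSG00000139618": {"data": {"gene2ensembl": []}},  # placeholder entry
        "ENSG00000141510": {"data": {"gene2ensembl": []}},  # placeholder entry
    }
    # "gene2ensembl.gz" is a placeholder path to the NCBI gene2ensembl dump
    ensembl_dict, unique_count = find_ncbi_ids_from_gene2ensembl(ensembl_dict, "gene2ensembl.gz")
    print(unique_count)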
def data_generator(input_file, version):
    with anyfile(input_file) as file:
        file_reader = csv.reader(file, delimiter="\t")
        header = next(file_reader)
        assert len(header) == VALID_COLUMN_NO, \
            "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(header))
        previous_row = None
        for row in file_reader:
            # pair each value with its column name: one dict per row, VALID_COLUMN_NO columns
            row = dict(zip(header, row))
            current_row = DbnsfpReader.map_row_to_json(row, version=version)
            if previous_row and current_row:
                if current_row["_id"] == previous_row["_id"]:
                    # same variant as the previous row: accumulate "aa" values into a list
                    aa = previous_row["dbnsfp"]["aa"]
                    if not isinstance(aa, list):
                        aa = [aa]
                    aa.append(current_row["dbnsfp"]["aa"])
                    previous_row["dbnsfp"]["aa"] = aa
                    if len(previous_row["dbnsfp"]["aa"]) > 1:
                        continue
                else:
                    yield previous_row
            previous_row = current_row
        if previous_row:
            yield previous_row
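

# The subtle part of the generator above is how consecutive rows sharing an _id are folded
# together: the kept row's dbnsfp.aa becomes a list accumulating each duplicate's aa.
# Below is a self-contained illustration of just that merge step on toy dicts; the field
# names mirror the code above, the values are invented for the example.
def _illustrate_aa_merge():
    def merge_aa(previous_doc, current_doc):
        aa = previous_doc["dbnsfp"]["aa"]
        if not isinstance(aa, list):
            aa = [aa]
        aa.append(current_doc["dbnsfp"]["aa"])
        previous_doc["dbnsfp"]["aa"] = aa
        return previous_doc

    doc1 = {"_id": "chr1:g.100A>T", "dbnsfp": {"aa": {"ref": "K", "alt": "N"}}}
    doc2 = {"_id": "chr1:g.100A>T", "dbnsfp": {"aa": {"ref": "K", "alt": "T"}}}
    merged = merge_aa(doc1, doc2)
    assert isinstance(merged["dbnsfp"]["aa"], list) and len(merged["dbnsfp"]["aa"]) == 2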
def data_generator(input_file, version, include_gnomad):
    open_file = anyfile(input_file)
    db_nsfp = csv.reader(open_file, delimiter="\t")
    index = next(db_nsfp)
    assert len(index) == VALID_COLUMN_NO, \
        "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index))
    previous_row = None
    for row in db_nsfp:
        # pair each value with its column name: one dict per row, VALID_COLUMN_NO columns
        df = dict(zip(index, row))
        current_row = _map_line_to_json(df, version=version, include_gnomad=include_gnomad)
        if previous_row and current_row:
            if current_row["_id"] == previous_row["_id"]:
                # same variant as the previous row: accumulate "aa" values into a list
                aa = previous_row["dbnsfp"]["aa"]
                if not isinstance(aa, list):
                    aa = [aa]
                aa.append(current_row["dbnsfp"]["aa"])
                previous_row["dbnsfp"]["aa"] = aa
                if len(previous_row["dbnsfp"]["aa"]) > 1:
                    continue
            else:
                yield previous_row
        previous_row = current_row
    if previous_row:
        yield previous_row
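

# A hedged usage sketch for the generator above: the file name is a placeholder and "hg38"
# is only an assumed value for the version argument; the code above only requires that the
# yielded documents carry "_id" and "dbnsfp" keys.
def _example_iterate_dbnsfp():
    for doc in data_generator("dbNSFP_variant.chr1.gz", version="hg38", include_gnomad=True):
        print(doc["_id"], doc["dbnsfp"].get("aa"))
        break  # only peek at the first document in this sketch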
def __init__(self, infile):
    self.infile = infile
    self.in_f = anyfile(self.infile)
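

# anyfile is used throughout these snippets to open a path regardless of compression.
# For readers without the surrounding utilities, this is a minimal, hypothetical stand-in
# that dispatches on the file extension; it is not the project's actual helper.
import bz2
import gzip
import lzma


def _anyfile_standin(path, mode="rt"):
    """Open plain, .gz, .xz or .bz2 files transparently (illustrative only)."""
    if path.endswith(".gz"):
        return gzip.open(path, mode)
    if path.endswith(".xz"):
        return lzma.open(path, mode)
    if path.endswith(".bz2"):
        return bz2.open(path, mode)
    return open(path, mode)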
def export_ids(col_name):
    """
    Export all _ids from the collection named col_name.
    If col_name refers to a build where a cold_collection is defined, also extract its _ids
    and sort/uniq them to obtain the full list of _ids of the actual merged (cold+hot)
    collection.
    Output file is stored in DATA_EXPORT_FOLDER/ids, defaulting to
    <DATA_ARCHIVE_ROOT>/export/ids. Output filename is returned at the end, if successful.
    """
    # prepare output directory
    DATA_EXPORT_FOLDER = getattr(btconfig, "DATA_EXPORT_FOLDER", None)
    if not DATA_EXPORT_FOLDER:
        DATA_EXPORT_FOLDER = os.path.join(btconfig.DATA_ARCHIVE_ROOT, "export")
    ids_export_folder = os.path.join(DATA_EXPORT_FOLDER, "ids")
    if not os.path.exists(ids_export_folder):
        logging.debug("Creating export/ids folder: %s" % ids_export_folder)
        os.makedirs(ids_export_folder)
    build = get_src_build().find_one({"_id": col_name})
    cold = None
    if build:
        col = get_target_db()[col_name]
        if build.get("build_config", {}).get("cold_collection"):
            cold_name = build["build_config"]["cold_collection"]
            cold = get_target_db()[cold_name]
            logging.info("Found a cold collection '%s' associated to '%s'" % (cold_name, col_name))
    else:
        # it's a src
        col = get_src_db()[col_name]
    # First iterate over all _ids. This will potentially update the underlying _id cache if
    # it's not valid anymore, so we're sure to work with the latest data. If the cache is
    # valid, this will be pretty fast.
    logging.info("Screening _ids in collection '%s'" % col.name)
    for _id in id_feeder(col, validate_only=True):
        pass
    # now accessing cache
    col_ids_cache = get_cache_filename(col.name)
    assert os.path.exists(col_ids_cache)
    logging.info("Now using cache file %s" % col_ids_cache)
    if cold:
        logging.info("Screening _ids in cold collection '%s'" % cold.name)
        for _id in id_feeder(cold, validate_only=True):
            pass
        # now accessing cache
        cold_ids_cache = get_cache_filename(cold.name)
        assert os.path.exists(cold_ids_cache)
        logging.info("Now using cache file %s" % cold_ids_cache)
    outfn = os.path.join(ids_export_folder, "%s_ids.xz" % col_name)
    # NOTE: can't use anyfile to open the cache files and send _ids through pipes because it
    # would load the _ids in memory (unless using hacks), so use cat (and the existing
    # uncompressing variants, like gzcat/xzcat/...) to fully run the pipe on the shell
    if cold:
        fout = anyfile(outfn, "wb")
        colext = os.path.splitext(col_ids_cache)[1]
        coldext = os.path.splitext(cold_ids_cache)[1]
        assert colext == coldext, \
            "Hot and cold _id caches are compressed differently (%s and %s), they should be the same" % (colext, coldext)
        comp = colext.replace(".", "")
        supportedcomps = ["xz", "gz", ""]  # no compression allowed as well
        assert comp in supportedcomps, "Compression '%s' isn't supported (%s)" % (comp, supportedcomps)
        # IDs sent to the pipe's input (sort), then compressed (xz)
        pcat = subprocess.Popen(["%scat" % comp, col_ids_cache, cold_ids_cache], stdout=subprocess.PIPE)
        psort = subprocess.Popen(["sort", "-u"], stdin=pcat.stdout, stdout=subprocess.PIPE, universal_newlines=True)
        pcat.stdout.close()  # will raise end-of-pipe error when finished
        if comp:
            pcomp = subprocess.Popen(["xz", "-c"], stdin=psort.stdout, stdout=fout)
        else:
            # just print stdin to stdout
            pcomp = subprocess.Popen(["tee"], stdin=psort.stdout, stdout=fout)
        psort.stdout.close()
        try:
            logging.info("Running pipe to compute list of unique _ids")
            (out, err) = pcomp.communicate()  # run the pipe! (blocking)
            if err:
                raise Exception(err)
        except Exception as e:
            logging.error("Error while running pipe to export _ids: %s" % e)
            # make sure to clean empty or half-processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass  # ignore if the output file was never created
            raise
    else:
        logging.info("Copying cache _id file")
        try:
            shutil.copyfile(col_ids_cache, outfn)
        except Exception as e:
            logging.error("Error while exporting _ids: %s" % e)
            # make sure to clean empty or half-processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass  # ignore if the output file was never created
            raise
    logging.info("Done exporting _ids to '%s'" % outfn)
    return outfn
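

# A hedged usage sketch for export_ids; "mybuild_20210101" is a placeholder build/collection
# name. For a build with a cold collection, the function effectively runs a pipeline like:
#   xzcat hot_ids.xz cold_ids.xz | sort -u | xz -c > <DATA_EXPORT_FOLDER>/ids/<col_name>_ids.xz
def _example_export_ids():
    outfn = export_ids("mybuild_20210101")
    print("unique _ids exported to", outfn)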