Example #1
def find_ncbi_ids_from_gene2ensembl(ensembl_dict, gene2ensembl_file):
    """Input is gene2ensembl_file; maps NCBI gene ID to one Ensembl gene ID.

    'gene2ensembl' (useful columns in input file):

    col1: NCBI gene ID
    col2: Ensembl gene ID
    """
    print("step 3 start: find NCBI IDs from gene2ensembl file")
    # anyfile() opens the file whether it is plain text or compressed
    file_in = anyfile(gene2ensembl_file)
    try:
        next(file_in)
        for line in file_in:
            split_line = line.split("\t")
            ensembl_gene_id_from_gene2ensembl = split_line[2].strip()
            ncbi_gene_id_from_gene2ensembl = split_line[1].strip()

            if ensembl_gene_id_from_gene2ensembl in ensembl_dict:
                ensembl_dict[ensembl_gene_id_from_gene2ensembl]['data'][
                    'gene2ensembl'].append(ncbi_gene_id_from_gene2ensembl)
    finally:
        file_in.close()

    count = 0
    for key in ensembl_dict:
        if len(ensembl_dict[key]['data']['gene2ensembl']) == 1:
            count += 1

    print(
        "Total number of Ensembl gene IDs mapping uniquely with gene2ensembl: ",
        count)
    print("step 3 end")
    return ensembl_dict, count
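
A minimal usage sketch for the example above, assuming each value in ensembl_dict carries a ['data']['gene2ensembl'] list (as the lookups in the function suggest); the Ensembl IDs and file path below are placeholders:

ensembl_dict = {
    "ENSG00000139618": {"data": {"gene2ensembl": []}},
    "ENSG00000141510": {"data": {"gene2ensembl": []}},
}
# anyfile() also accepts a compressed download, e.g. "gene2ensembl.gz"
ensembl_dict, unique_count = find_ncbi_ids_from_gene2ensembl(ensembl_dict, "gene2ensembl.gz")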
Example #2
def data_generator(input_file, version):
    with anyfile(input_file) as file:
        file_reader = csv.reader(file, delimiter="\t")

        header = next(file_reader)
        assert len(header) == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(header))

        previous_row = None
        for row in file_reader:
            row = dict(zip(header, row))

            # use transposed matrix to have 1 line with N 187 columns
            current_row = DbnsfpReader.map_row_to_json(row, version=version)
            if previous_row and current_row:
                if current_row["_id"] == previous_row["_id"]:
                    aa = previous_row["dbnsfp"]["aa"]
                    if not isinstance(aa, list):
                        aa = [aa]
                    aa.append(current_row["dbnsfp"]["aa"])
                    previous_row["dbnsfp"]["aa"] = aa
                    if len(previous_row["dbnsfp"]["aa"]) > 1:
                        continue
                else:
                    yield previous_row

            previous_row = current_row

        if previous_row:
            yield previous_row
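
The duplicate-_id handling above folds consecutive rows sharing an _id into a single document by turning dbnsfp.aa into a list. A standalone sketch of just that merge step, using made-up documents rather than real dbNSFP rows:

def merge_aa(previous_doc, current_doc):
    # fold current_doc's 'aa' value into previous_doc's, promoting it to a list
    aa = previous_doc["dbnsfp"]["aa"]
    if not isinstance(aa, list):
        aa = [aa]
    aa.append(current_doc["dbnsfp"]["aa"])
    previous_doc["dbnsfp"]["aa"] = aa
    return previous_doc

doc_a = {"_id": "chr1:g.100A>T", "dbnsfp": {"aa": {"ref": "K", "alt": "N"}}}
doc_b = {"_id": "chr1:g.100A>T", "dbnsfp": {"aa": {"ref": "K", "alt": "T"}}}
assert len(merge_aa(doc_a, doc_b)["dbnsfp"]["aa"]) == 2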
Example #3
def data_generator(input_file, version, include_gnomad):
    open_file = anyfile(input_file)
    db_nsfp = csv.reader(open_file, delimiter="\t")
    index = next(db_nsfp)
    assert len(
        index) == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (
            VALID_COLUMN_NO, len(index))
    previous_row = None
    for row in db_nsfp:
        df = dict(zip(index, row))
        # use transpose matrix to have 1 row with N 187 columns
        current_row = _map_line_to_json(df,
                                        version=version,
                                        include_gnomad=include_gnomad)
        if previous_row and current_row:
            if current_row["_id"] == previous_row["_id"]:
                aa = previous_row["dbnsfp"]["aa"]
                if not isinstance(aa, list):
                    aa = [aa]
                aa.append(current_row["dbnsfp"]["aa"])
                previous_row["dbnsfp"]["aa"] = aa
                if len(previous_row["dbnsfp"]["aa"]) > 1:
                    continue
            else:
                yield previous_row
        previous_row = current_row
    if previous_row:
        yield previous_row
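
A hedged consumption sketch for the generator above; the file name and version string are placeholders, and VALID_COLUMN_NO plus _map_line_to_json are assumed to come from the surrounding parser module:

doc_count = 0
for doc in data_generator("dbNSFP_variant.chr1.gz", version="4.1a", include_gnomad=True):
    doc_count += 1  # e.g. hand each merged document to a downstream loader here
print("yielded %s documents" % doc_count)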
Example #4
def __init__(self, infile):
    self.infile = infile
    self.in_f = anyfile(self.infile)
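
All of these examples rely on anyfile to open plain or compressed inputs through a single call. A rough, hypothetical stand-in that dispatches on the file extension (an assumption for illustration, not the real anyfile implementation) could look like this:

import bz2
import gzip
import lzma
import os

def open_anyfile(path, mode="rt"):
    # pick an opener by extension; fall back to the builtin open() for plain files
    openers = {".gz": gzip.open, ".bz2": bz2.open, ".xz": lzma.open}
    ext = os.path.splitext(path)[1].lower()
    return openers.get(ext, open)(path, mode)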
Example #5
def export_ids(col_name):
    """
    Export all _ids from the collection named col_name.
    If col_name refers to a build where a cold_collection is defined,
    also extract its _ids and sort/uniq them to get the full list of _ids
    of the actual merged (cold+hot) collection.
    The output file is stored in DATA_EXPORT_FOLDER/ids,
    defaulting to <DATA_ARCHIVE_ROOT>/export/ids. The output filename is
    returned at the end, if successful.
    """
    # prepare output directory
    DATA_EXPORT_FOLDER = getattr(btconfig,"DATA_EXPORT_FOLDER",None)
    if not DATA_EXPORT_FOLDER:
        DATA_EXPORT_FOLDER = os.path.join(btconfig.DATA_ARCHIVE_ROOT,"export")
    ids_export_folder = os.path.join(DATA_EXPORT_FOLDER,"ids")
    if not os.path.exists(ids_export_folder):
        logging.debug("Creating export/ids folder: %s" % ids_export_folder)
        os.makedirs(ids_export_folder)
    build = get_src_build().find_one({"_id":col_name})
    cold = None
    if build:
        col = get_target_db()[col_name]
        if build.get("build_config",{}).get("cold_collection"):
            cold_name = build["build_config"]["cold_collection"]
            cold = get_target_db()[cold_name]
            logging.info("Found a cold collection '%s' associated to '%s'" % (cold_name,col_name))
    else:
        # it's a src
        col = get_src_db()[col_name]
    
    # first iterate over all _ids. This will potentially update the underlying _id cache
    # if it's not valid anymore, so we're sure to work with the latest data. If the cache
    # is valid, this will be pretty fast
    logging.info("Screening _ids in collection '%s'" % col.name)
    for _id in id_feeder(col,validate_only=True):
        pass
    # now accessing cache
    col_ids_cache = get_cache_filename(col.name)
    assert os.path.exists(col_ids_cache)
    logging.info("Now using cache file %s" % col_ids_cache)
    if cold:
        logging.info("Screening _ids in cold collection '%s'" % cold.name)
        for _id in id_feeder(cold,validate_only=True):
            pass
        # now accessing cache
        cold_ids_cache = get_cache_filename(cold.name)
        assert os.path.exists(cold_ids_cache)
        logging.info("Now using cache file %s" % cold_ids_cache)
    outfn = os.path.join(ids_export_folder,"%s_ids.xz" % col_name)
    # NOTE: we can't use anyfile to open the cache files and send the _ids through pipes
    # because that would load the _ids in memory (unless using hacks), so use cat (or the
    # matching decompressor, like gzcat/xzcat/...) to run the whole pipe on the shell
    if cold:
        fout = anyfile(outfn,"wb")
        colext = os.path.splitext(col_ids_cache)[1]
        coldext = os.path.splitext(cold_ids_cache)[1]
        assert colext == coldext, "Hot and cold _id caches are compressed differently (%s and %s); they should be the same" % (colext,coldext)
        comp = colext.replace(".","")
        supportedcomps = ["xz","gz",""] # uncompressed caches are allowed as well
        assert comp in supportedcomps, "Compression '%s' isn't supported (%s)" % (comp,supportedcomps)
        # _ids are sent to the pipe's input (sort -u), then compressed (xz)
        pcat = subprocess.Popen(["%scat" % comp, col_ids_cache, cold_ids_cache],stdout=subprocess.PIPE)
        psort = subprocess.Popen(["sort","-u"],stdin=pcat.stdout,stdout=subprocess.PIPE,universal_newlines=True)
        pcat.stdout.close() # close in the parent so pcat gets SIGPIPE if sort exits early
        if comp:
            pcomp = subprocess.Popen(["xz","-c"],stdin=psort.stdout,stdout=fout)
        else:
            # just print stdin to stdout
            pcomp = subprocess.Popen(["tee"],stdin=psort.stdout,stdout=fout)
        psort.stdout.close()
        try:
            logging.info("Running pipe to compute list of unique _ids")
            (out,err) = pcomp.communicate() # run the pipe! (blocking)
            if err:
                raise Exception(err)
        except Exception as e:
            logging.error("Error while running pipe to export _ids: %s" % e)
            # make sure to clean up empty or half-processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass
            raise
    else:
        logging.info("Copying cache _id file")
        try:
            shutil.copyfile(col_ids_cache,outfn)
        except Exception as e:
            logging.error("Error while exporting _ids: %s" % e)
            # make sure to clean up empty or half-processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass
            raise

    logging.info("Done exporting _ids to '%s'" % outfn)
    return outfn
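
A hedged usage sketch for export_ids; the build name below is a placeholder:

# With a cold collection and xz-compressed caches, the subprocess chain above
# behaves like the shell pipeline:
#   xzcat <hot_cache>.xz <cold_cache>.xz | sort -u | xz -c > <col_name>_ids.xz
outfn = export_ids("mygene_build_20240101")  # hypothetical build/collection name
print("exported _ids to", outfn)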