def extract_hamming_distance_feature(
    path_ribo_kmers=home_path +
    "/rrna/data/featureExtractors/ribo_kmers_from_genes_50_int.npy",
    path_ordered_gene_names=home_path +
    "/rrna/data/featureExtractors/Homo_sapiens.GRCh38.84_ordered_gene_names.npy"
):
    """Compute nearest-neighbour Hamming distances of gene chunks to k-mers.

    A 1-nearest-neighbour index (metric ``"hamming"``) is fitted on the array
    stored at ``path_ribo_kmers``.  For every gene name in
    ``path_ordered_gene_names`` that has gene features and is not yet in the
    cache, each chunk sequence is encoded as integers (A=1, C=2, G=3, T=4,
    N=5) and the distance to its nearest fitted k-mer is stored under the
    gene name.

    Results are cached in
    ``home_path + "/rrna/data/featureExtractors/distance_features.pkl"``;
    an existing cache is loaded first so interrupted runs can resume.

    Args:
        path_ribo_kmers: Path to a ``.npy`` file with the integer-encoded
            reference k-mers the index is fitted on.
        path_ordered_gene_names: Path to a ``.npy`` file with the gene names
            to process, in processing order.

    Returns:
        None.  The distances are written to the pickle cache.

    Raises:
        KeyError: If a chunk contains a character other than A, C, G, T, N.
    """
    fe = featureExtraction.FeatureExtractor()

    if not fe.load_gene_features():
        fe.generate_gene_features(reload=True)

    fe.generate_chunked_sequences()

    nn = sklearn.neighbors.NearestNeighbors(n_neighbors=1,
                                            metric="hamming",
                                            n_jobs=30)
    nn.fit(np.load(path_ribo_kmers))

    gene_names = np.load(path_ordered_gene_names)

    save_path = home_path + "/rrna/data/featureExtractors/distance_features.pkl"
    if os.path.exists(save_path):
        # Pickles are binary data: text mode breaks on Windows / Python 3.
        with open(save_path, "rb") as f:
            distances = pkl.load(f)
    else:
        distances = {}
    mapper = {"A": 1, "C": 2, "G": 3, "T": 4, "N": 5}

    n_genes = len(gene_names)
    unsaved = False  # True while `distances` holds results not yet on disk
    time_start = time.time()
    for igf_key, gf_key in enumerate(gene_names):
        print("%d of %d, time: %.3f" %
              (igf_key, n_genes, time.time() - time_start))

        if gf_key in distances or gf_key not in fe.genefeatures:
            continue

        gene_chunks = fe.genefeatures[gf_key].chunks
        print("next length: %d" % (len(gene_chunks)))

        # kneighbors() rejects an empty query set; skip such genes, as the
        # coverage extraction in this file does.
        if len(gene_chunks) == 0:
            continue

        # Builtin `int` replaces the `np.int` alias (removed in NumPy 1.24).
        chunks = [
            np.array([mapper[nuc] for nuc in seq], dtype=int)
            for seq in gene_chunks
        ]
        # kneighbors returns (distances, indices); only distances are kept.
        distances[gf_key] = nn.kneighbors(chunks)[0]
        unsaved = True

        # Periodic checkpoint so a crash loses only a few genes.
        if igf_key % 10 == 0:
            with open(save_path, "wb") as f:
                pkl.dump(distances, f)
            unsaved = False

    # Final save: without it, everything computed after the last checkpoint
    # would be dropped when the loop finishes.
    if unsaved:
        with open(save_path, "wb") as f:
            pkl.dump(distances, f)
def _extract_expression_features_thread(args):
    """Compute per-chunk coverage statistics for a list of genes.

    For every gene in ``gene_names`` that has gene features, at least one
    chunk, and no cached entry yet, the chunk-wise coverage is calculated
    from the BAM file.  The mean, standard deviation and max-min gap along
    axis 1 of ``chunk_coverage`` are stacked column-wise and stored under the
    gene name.  Results are cached as a pickle at ``save_path``; an existing
    cache is loaded first so interrupted runs can resume.

    Args:
        args: Indexable with three entries, packed into one argument
            (presumably for use with ``Pool.map`` - confirm against caller):
            ``args[0]`` sequence of gene names,
            ``args[1]`` path of the BAM file,
            ``args[2]`` path of the pickle cache.

    Returns:
        None.  The statistics are written to ``save_path``.
    """
    gene_names = args[0]
    bam_path = args[1]
    save_path = args[2]

    fe = featureExtraction.FeatureExtractor()

    if not fe.load_gene_features():
        fe.generate_gene_features(reload=True)

    fe.generate_chunked_sequences()

    if os.path.exists(save_path):
        # Pickles are binary data: text mode breaks on Windows / Python 3.
        with open(save_path, "rb") as f:
            coverage = pkl.load(f)
    else:
        coverage = {}

    unsaved = False  # True while `coverage` holds results not yet on disk
    pfile = pysam.AlignmentFile(bam_path, "rb")
    try:
        time_start = time.time()
        for igf_key, gf_key in enumerate(gene_names):
            print("%s %s" % (igf_key, time.time() - time_start))

            if gf_key in coverage or gf_key not in fe.genefeatures:
                continue

            gf = fe.genefeatures[gf_key]
            if len(gf.chunks) > 0:
                gf.calculate_chunkwise_coverage(pfile)
                try:
                    mean = np.mean(gf.chunk_coverage, axis=1)
                    std = np.std(gf.chunk_coverage, axis=1)
                    gap = np.max(gf.chunk_coverage, axis=1) - \
                        np.min(gf.chunk_coverage, axis=1)
                except IndexError:
                    # chunk_coverage has no axis 1: fall back to all-zero
                    # statistics, one entry per chunk.
                    mean = np.zeros(len(gf.chunks))
                    std = np.zeros(len(gf.chunks))
                    gap = np.zeros(len(gf.chunks))
                coverage[gf_key] = np.vstack([mean.T, std.T, gap.T]).T
                unsaved = True

            # Periodic checkpoint so a crash loses only a few genes.
            if igf_key % 10 == 0 and unsaved:
                with open(save_path, "wb") as f:
                    pkl.dump(coverage, f)
                unsaved = False
    finally:
        # Release the BAM handle even if coverage calculation fails.
        pfile.close()

    # Final save: without it, everything computed after the last checkpoint
    # would be dropped when the loop finishes.
    if unsaved:
        with open(save_path, "wb") as f:
            pkl.dump(coverage, f)
def extract_gene_kmers(
    bam_path,
    k=18,
    n_worker=10,
    gtf_path="/homes/gws/sdorkenw/rrna/data/annotations/Homo_sapiens.GRCh38.84.gtf",
    fasta_file_path="/homes/gws/sdorkenw/reference_genome_38/GRCh38_o.p3.genome.fa",
    save_path="/homes/gws/sdorkenw/rrna/data/featureExtractors/gene_features.pkl"
):
    """Run ``_extract_gene_kmers_thread`` once per gene and collect results.

    One job ``[bam_path, k, gene_feature]`` is created for every entry of the
    feature extractor's ``genefeatures``.  With ``n_worker > 1`` the jobs are
    dispatched to a process pool and progress is printed every 10 seconds;
    otherwise they are executed sequentially in this process.

    Args:
        bam_path: Forwarded to every job as its first element.
        k: Forwarded to every job as its second element.
        n_worker: Number of pool processes; values <= 1 disable the pool.
        gtf_path: Not used by the current implementation.
        fasta_file_path: Not used by the current implementation.
        save_path: Not used by the current implementation.

    Returns:
        dict mapping the first element of each job result to its second.
    """
    extractor = featureExtraction.FeatureExtractor()
    if not extractor.load_gene_features():
        extractor.generate_gene_features(reload=True)
    extractor.generate_chunked_sequences()

    job_args = [[bam_path, k, extractor.genefeatures[name]]
                for name in extractor.genefeatures.keys()]

    if n_worker > 1:
        pool = Pool(n_worker)
        pending = pool.map_async(_extract_gene_kmers_thread, job_args)
        pool.close()  # no further jobs will be submitted
        # Poll until every job has finished, reporting progress in between.
        while not pending.ready():
            status = " ".join(
                ["Waiting for", str(pending._number_left),
                 "tasks to complete..."])
            print(status)
            time.sleep(10)
        pool.join()
        outcomes = pending.get()
    else:
        outcomes = map(_extract_gene_kmers_thread, job_args)

    kmers_cov = dict((outcome[0], outcome[1]) for outcome in outcomes)
    return kmers_cov
def extract_ribo_kmers(
    k=18,
    gtf_path="/homes/gws/sdorkenw/rrna/data/ref_genomes/rrna_hg38.gtf",
    fasta_file_path="/homes/gws/sdorkenw/reference_genome_38/GRCh38_o.p3.genome.fa",
    save_path="/homes/gws/sdorkenw/rrna/data/featureExtractors/ribo_features.pkl"
):
    """Collect the distinct k-mers of all gene features.

    Every entry of the feature extractor's ``genefeatures`` is asked for its
    k-mers via ``calculate_kmers(k=k)``; the gene key is printed as progress.

    Args:
        k: k-mer length passed to ``calculate_kmers``.
        gtf_path: Not used by the current implementation.
        fasta_file_path: Not used by the current implementation.
        save_path: Not used by the current implementation.

    Returns:
        set containing every k-mer produced by any gene feature.
    """
    extractor = featureExtraction.FeatureExtractor()
    features_loaded = extractor.load_gene_features()
    if not features_loaded:
        extractor.generate_gene_features(reload=True)
    extractor.generate_chunked_sequences()

    collected = set()
    for name in extractor.genefeatures.keys():
        print(name)
        gene_feature = extractor.genefeatures[name]
        collected |= set(gene_feature.calculate_kmers(k=k))

    return collected
def extract_kmers_from_ribo_genes(
        k=50,
        ribo_gene_name_path="/homes/gws/sdorkenw/rrna/data/annotations/Homo_sapiens.GRCh38.84_rrna_gene_names.npy",
        as_integers=True):
    """Save the distinct k-mers of the listed genes to ``.npy`` files.

    The gene names stored at ``ribo_gene_name_path`` are looked up in the
    feature extractor's ``genefeatures``; names without an entry are ignored.
    The union of their k-mers is written to
    ``home_path + "/rrna/data/featureExtractors/ribo_kmers_from_genes_<k>.npy"``.
    With ``as_integers`` an integer-encoded copy (A=1, C=2, G=3, T=4, N=5;
    one row per k-mer, one column per position) is additionally written to
    ``..._<k>_int.npy``, with rows in the same order as the string file.

    Args:
        k: k-mer length passed to ``calculate_kmers``.
        ribo_gene_name_path: Path to a ``.npy`` file with the gene names.
        as_integers: Whether to also write the integer-encoded array.

    Returns:
        None.  The results are written to disk.

    Raises:
        KeyError: If a k-mer contains a character other than A, C, G, T, N.
    """
    ribo_names = np.load(ribo_gene_name_path)

    fe = featureExtraction.FeatureExtractor()

    if not fe.load_gene_features():
        fe.generate_gene_features(reload=True)

    fe.generate_chunked_sequences()

    ribo_kmers = set()
    for gf_key in ribo_names:
        print(gf_key)
        if gf_key in fe.genefeatures:
            # In-place update avoids re-copying the whole set for every gene.
            ribo_kmers.update(fe.genefeatures[gf_key].calculate_kmers(k=k))

    # Freeze one ordering so the string and the integer file share row order.
    kmer_list = list(ribo_kmers)

    np.save(
        home_path +
        "/rrna/data/featureExtractors/ribo_kmers_from_genes_%d.npy" % k,
        kmer_list)

    if as_integers:
        # Same encoding as extract_hamming_distance_feature, including "N",
        # so k-mers with ambiguous bases no longer raise KeyError.
        mapper = {"A": 1, "C": 2, "G": 3, "T": 4, "N": 5}
        n_kmers = len(kmer_list)
        ribo_kmers_integers = np.zeros([n_kmers, k])
        for i_kmer, kmer in enumerate(kmer_list):
            print("%d of %d" % (i_kmer, n_kmers))
            # Builtin `int` replaces the `np.int` alias (removed in
            # NumPy 1.24).
            ribo_kmers_integers[i_kmer] = np.array(
                [mapper[nuc] for nuc in kmer], dtype=int)

        np.save(
            home_path +
            "/rrna/data/featureExtractors/ribo_kmers_from_genes_%d_int.npy" %
            k, ribo_kmers_integers)
# coding: utf-8
# NOTE: get_ipython() is only available inside an IPython session, so this
# section cannot be executed by a plain Python interpreter. The `cd` magic
# switches to ~/rrna/src before the two imports below are resolved.
get_ipython().magic(u'cd ~/rrna/src')
import featureExtraction as featex
import generate_rnafold_features as rnafold

# rRNA: build a feature extractor from the mouse (GRCm38) genome FASTA and the
# rRNA-only GTF subset, generate gene features and chunked sequences, then
# run the parallel secondary-structure prediction on those chunks.
fe = featex.FeatureExtractor(
    fasta_file_path=
    "/projects/bio/rrna/data/ref_genomes/mouse/Mus_musculus.GRCm38.dna.primary_assembly.fa",
    gtf_file_path=
    "/projects/bio/rrna/data/annotations/mouse/gtf_subsets/Mus_musculus.GRCm38.85.rrna.gtf",
    save_path=
    "/projects/bio/rrna/data/featureExtractors/mouse/Mus_musculus.GRCm38.85_gene_features.pkl",
    chunk_save_path=
    "/projects/bio/rrna/data/featureExtractors/mouse/Mus_musculus.GRCm38.85_gene_chunks.pkl"
)
fe.generate_gene_features()
fe.generate_chunked_sequences()
# "rnafold_mouse_rrna" is passed as the second positional argument
# (presumably a run/output name - confirm in generate_rnafold_features);
# gene_list points at the .npy file of rRNA gene names, and the prediction
# is requested with 40 processes.
results = rnafold.predict_secondary_for_chunks_parallel(
    fe,
    "rnafold_mouse_rrna",
    gene_list=
    "/projects/bio/rrna/data/annotations/mouse/gtf_subsets/Mus_musculus.GRCm38.85.rrna.names.npy",
    n_processes=40)

# all but rRNA
fe = featex.FeatureExtractor(
    fasta_file_path=
    "/projects/bio/rrna/data/ref_genomes/mouse/Mus_musculus.GRCm38.dna.primary_assembly.fa",
    gtf_file_path=
    "/projects/bio/rrna/data/annotations/mouse/gtf_subsets/Mus_musculus.GRCm38.85.no_rrna.gtf",