Example #1
0
def load_transcript_strands(loc):
    """Build a mapping from column 14 to the integer in column 6 of
    ``transcript.txt.gz`` under *loc* (presumably transcript id -> strand
    — column semantics inferred from the function name, confirm upstream).
    """
    path = loc + 'transcript.txt.gz'
    strands = {}
    with fast_gzip_read(path) as handle:
        for row in tqdm(handle, total=count_lines(path)):
            fields = row.split('\t')
            strands[fields[14]] = int(fields[6])
    return strands
Example #2
0
def load_chromosome_and_region_names(loc):
    """Read ``seq_region.txt.gz`` under *loc* and return a dict mapping
    seq_region_id (int, column 0) to its name (column 1).
    """
    # file columns: seq_region_id, name, cord_system_fk
    path = loc + 'seq_region.txt.gz'
    regions = {}
    with fast_gzip_read(path) as handle:
        for row in tqdm(handle, total=count_lines(path)):
            fields = row.split('\t')
            regions[int(fields[0])] = fields[1]
    return regions
Example #3
0
def load_variation_sources(loc):
    """Read ``source.txt.gz`` under *loc* and return a dict mapping
    source_id (int, column 0) to source name (column 1).
    """
    # file columns: 'source_id', 'name', 'version', 'description', 'url',
    # 'type', 'somatic_status', 'data_types'
    path = loc + 'source.txt.gz'
    result = {}
    with fast_gzip_read(path) as handle:
        for row in tqdm(handle, total=count_lines(path)):
            fields = row.split('\t')
            result[int(fields[0])] = fields[1]
    # explicit collection kept from the original — presumably to release
    # buffers from the decompression step before returning
    gc.collect()
    return result
Example #4
0
def get_cds_positions(transcripts):
    """Return ``{refseq_id: (cds_start, cds_end)}`` for every row of
    ``ucsc/ref_gene.tsv.gz`` whose name (column 1) is in *transcripts*.

    Columns 6 and 7 of the UCSC refGene table are cdsStart and cdsEnd.
    """
    cds_positions = {}
    with fast_gzip_read('ucsc/ref_gene.tsv.gz') as f:
        next(f)  # skip the '#bin name chrom strand ...' header row
        for line in f:
            fields = line.split('\t')
            name = fields[1]
            if name in transcripts:
                cds_positions[name] = (int(fields[6]), int(fields[7]))
    return cds_positions
Example #5
0
 def __init__(self, filename=None):
     """Populate ``self.data``: a ref id -> list of associated ids mapping
     read from *filename* (falling back to the class-level ``self.filename``).

     Rows that do not split into exactly two fields are skipped, as are
     rows whose second field is the literal 'n/a'.

     Raises ValueError when no filename is available from either source.
     """
     mapping = defaultdict(list)
     source = filename or self.filename
     if not source:
         raise ValueError
     with fast_gzip_read(source, processes=6) as handle:
         first = next(handle)
         assert first == '#hg19.knownToRefSeq.value\thg19.knownToEnsembl.value\n'
         for raw in handle:
             parts = raw.strip().split('\t')
             if len(parts) != 2:
                 continue
             ref_id, other_id = parts
             if other_id != 'n/a':
                 mapping[ref_id].append(other_id)
     self.data = mapping
Example #6
0
def _get_all_zscores():
    """Collect every dpsi_zscore value from the SPIDEX file as a float list.

    Rows that fail to parse are reported via ``print`` and skipped.
    """
    from multiprocess import fast_gzip_read

    print('Counting...')
    total = count_spidex()
    print('Loading...')

    zscores = []
    with fast_gzip_read(SPIDEX_LOCATION) as f:
        next(f)  # consume the header row
        # NOTE(review): `headers` is resolved at module scope (the local
        # variable consumed above is not used) — confirm it is defined there.
        extract = itemgetter(headers.index('dpsi_zscore'))
        for line in tqdm(f, total=total - 1):
            try:
                fields = line.rstrip('\n').split('\t')
                zscores.append(float(extract(fields)))
            except Exception as exc:
                print(exc)

    return zscores
def import_expressed_genes(bdb,
                           tissues=GTEX_TISSUES,
                           path=DEFAULT_PATH,
                           suffix=DEFAULT_GENE_SUFFIX):
    """Fill *bdb* with per-gene records read from each tissue's gene file.

    For each gene_id (column 0) the record is columns 1-5:
    (gene_name, gene_chr, gene_start, gene_end, strand).
    When a gene_id was seen before, the new record must match the stored
    one (asserted); progress is reported with a single tqdm bar sized by
    ``count_all``.
    """
    print('Importing expressed genes:')

    total = count_all(tissues, path, suffix)

    with tqdm(total=total) as progress:
        for tissue in tissues:
            name = tissue + suffix
            print('Loading', name)

            with fast_gzip_read(os.path.join(path, name)) as handle:
                next(handle)  # skip header

                for raw in handle:
                    fields = raw.split()
                    gene_id, record = fields[0], fields[1:6]
                    if bdb[gene_id]:
                        # already imported from another tissue — must agree
                        assert bdb[gene_id] == record
                    else:
                        bdb[gene_id].extend(record)

                    progress.update(1)
def iterate_over_expression(tissues_list=GTEX_TISSUES,
                            path=DEFAULT_PATH,
                            suffix=DEFAULT_SUFFIX):
    """Yield ``(variant_id, tissue_name, slope, gene_id)`` tuples for every
    data row of every tissue expression file under *path*.

    Column positions are resolved from each file's own header line, so the
    files may order their columns differently.
    """
    for tissue_name in tissues_list:
        file_name = tissue_name + suffix
        print('Loading', file_name)

        with fast_gzip_read(os.path.join(path, file_name),
                            processes='all') as handle:

            columns = {name: idx
                       for idx, name in enumerate(next(handle).split())}

            slope_idx = columns['slope']
            gene_idx = columns['gene_id']
            variant_idx = columns['variant_id']

            for raw in handle:
                fields = raw.split()
                yield (fields[variant_idx], tissue_name,
                       fields[slope_idx], fields[gene_idx])