Beispiel #1
0
def get_gse_dict(srr_file, sep=SEP):
    """Read the SRR info table and build a {GSE: sorted SRR list} mapping.

    GSE groups whose SRRs span more than one organism are skipped with a
    warning.  Exits the process when the table is empty or when no GSE
    survives filtering.
    """
    print('读取SRR_INFO:%s' % srr_file)
    with open(srr_file, 'r', encoding='utf8') as handle:
        srr_info = pd.read_csv(handle, sep=sep)
    if len(srr_info) == 0:
        print('%s:文件里什么信息都没有!' % srr_file)
        exit()

    gse_dict = {}
    srr_count = 0
    for gse, data in srr_info.groupby('GSE'):
        if len(set(data['organism'])) > 1:
            print(add_color(f'{gse}中的SRR属于多个物种,将不予处理!', 'yellow'))
            continue
        # Keep each GSE's SRR accessions in sorted order.
        gse_dict[gse] = sorted(data['SRR'].tolist())
        srr_count += len(data['SRR'])

    print('读取到%d个GSE的%d个SRR信息' % (len(gse_dict), srr_count))
    if not gse_dict:
        exit()

    return gse_dict
Beispiel #2
0
def get_gse_summary(gse_organism_dict, gse_data):
    """Summarise GSE processing results: genes are rows, samples are columns.

    For every GSE directory under ``gse_data`` collects organism, sample
    count, gene count and cluster count into a DataFrame indexed by GSE.
    """
    print('开始统计GSE处理结果')
    summary = {}
    for gse in os.listdir(gse_data):
        gse_dir = os.path.join(gse_data, gse)
        if not os.path.isdir(gse_dir):
            continue
        matrix_path = os.path.join(gse_dir, 'matrix.csv')
        pca_path = os.path.join(gse_dir, 'pca.csv')

        # ---- organism of this GSE ----
        if gse in gse_organism_dict:
            organism = gse_organism_dict[gse]
        else:
            organism = 'None'
            print(add_color(f'不存在{gse}的物种信息', 'yellow'))

        # ---- gene and sample counts from the expression matrix ----
        if os.path.isfile(matrix_path):
            with open(matrix_path, 'r', encoding='utf8') as handle:
                header = handle.readline().strip().split(',')
                samples = len(header) - 1  # first column is the gene name
                genes = sum(1 for _ in handle)
        else:
            samples = genes = 'None'
            print(add_color(f'不存在{matrix_path}', 'yellow'))

        # ---- number of clusters from the PCA result ----
        if os.path.isfile(pca_path):
            with open(pca_path, 'r', encoding='utf8') as handle:
                pca = pd.read_csv(handle, index_col=0)
            clusters = len(set(pca['cluster'][1:]))
        else:
            clusters = 'None'
            print(add_color(f'不存在{pca_path}', 'yellow'))

        summary[gse] = {
            'organism': organism,
            'samples': samples,
            'genes': genes,
            'clusters': clusters
        }

    print('统计到了%d个GSE处理的结果' % len(summary))
    gse_summary = pd.DataFrame(summary).reindex(
        ['organism', 'samples', 'genes', 'clusters']).transpose()
    gse_summary.index.name = 'GSE'

    return gse_summary
def generate_script(srr_data, output_file):
    """Generate one worker shell script per distribution file plus a master script.

    Each script in ``srr_data/script`` runs the worker on one task file from
    ``srr_data/distribution``.  ``output_file`` is a master script that
    launches all of them in the background (through ``yhbatch`` when running
    on the Tianhe cluster).
    """
    print('正在批量生成处理脚本')
    script_header = '#!/bin/bash\n'
    script_path = os.path.abspath(os.path.join(srr_data, 'script'))
    distribution_path = os.path.abspath(os.path.join(srr_data, 'distribution'))

    # Pick the worker implementation matching the execution environment.
    worker_name = 'srr_worker_tianhe.py' if TIANHE else 'srr_worker.py'
    worker_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               worker_name)

    script_list = []
    for task_name in os.listdir(distribution_path):
        task = os.path.join(distribution_path, task_name)
        # One .sh script per distribution file, i.e. per task.
        file_path = os.path.join(script_path, 'Run_' + task_name + '.sh')
        with open(file_path, 'w') as f:
            f.write(script_header)
            f.write('%s %s %s' % (PYTHON_PATH, worker_file, task))
        # os.chmod instead of shelling out to `chmod 755` (safer, no subshell).
        os.chmod(file_path, 0o755)
        script_list.append(file_path)
    text = '所有脚本生成完毕:%s' % script_path
    print(add_color(text, 'green'))

    # Master script launching every worker script in the background.
    with open(output_file, 'w') as f:
        f.write(script_header)
        for script in script_list:
            if TIANHE:
                command = 'yhbatch -N 1 -n 1 -p %s %s' % (TIANHE_NODE_NAME,
                                                          script)
            else:
                command = script
            f.write(command + ' &\n')
    os.chmod(output_file, 0o755)
    text = '总脚本生成完毕:%s' % os.path.abspath(output_file)
    print(add_color(text, 'green'))
Beispiel #4
0
def integration_worker(sub_finished_srr, all_result_path):
    """Read the assigned SRR result files and return {srr: expression list}.

    Each file is a featureCounts output: two header lines followed by
    tab-separated rows whose last column is the read count.
    """
    pid = os.getpid()
    expression_by_srr = {}
    total = len(sub_finished_srr)
    for count, srr in enumerate(sub_finished_srr, 1):
        if count % 300 == 0 and PRINT_DETAIL:
            print(f'进程: {pid} 完成: {count}/{total}')
        try:
            with open(os.path.join(all_result_path, srr)) as handle:
                # Skip the two featureCounts header lines.
                handle.readline()
                handle.readline()
                counts = []
                for line in handle:
                    counts.append(int(line.strip().split('\t')[-1]))
            expression_by_srr[srr] = counts
        except Exception as err:
            print(add_color('处理错误: ' + srr, 'red'))
            print(add_color(str(err), 'red'))
    return expression_by_srr
Beispiel #5
0
def gse_handle(sub_gse_list, gse_data):
    """Process the given list of GSEs inside a single worker process."""

    pid = str(os.getpid())
    total = len(sub_gse_list)
    for count, gse in enumerate(sub_gse_list, 1):
        prefix = f'进程: {pid}\t任务: {count}/{total}\t'
        gse_file = os.path.abspath(os.path.join(gse_data, gse, 'matrix.csv'))
        if not os.path.exists(gse_file):
            print(add_color(prefix + '文件不存在: ' + gse_file, 'red'))
            continue
        size_text = '%.2fM' % (os.path.getsize(gse_file) / (10**6))
        print(add_color(prefix + '开始处理: ' + gse + '\t大小: ' + size_text,
                        'yellow'))
        try:
            # Run the full analysis pipeline on this expression matrix.
            project = Project(gse_file)
            project.compute_all_steps()
            print(add_color(prefix + '处理完毕: ' + gse, 'green'))
        except Exception as err:
            print(add_color(prefix + '处理错误: ' + gse, 'red'))
            print(add_color(str(err), 'red'))
    print(add_color(f'进程: {pid}已结束所有任务!', 'green'))
def check_fist():
    """Verify that configured software and data paths exist; exit on failure."""

    print('正在检查软件和数据路径是否有误')
    all_right = True
    # Check data files and executables from the configuration.
    checks = [
        ('GENOME', GENOME),
        ('GENOME_INDEX', GENOME_INDEX),
        ('FASTERQ_DUMP', FASTERQ_DUMP),
        ('STAR', STAR),
        ('FEATURE_COUNTS', FEATURE_COUNTS),
    ]
    for name, value in checks:
        if os.path.exists(value):
            print(add_color('√\t' + name, 'green'))
        else:
            print(add_color('×\t' + name, 'red'), end='\t')
            print(add_color('可修改:configure/' + name, 'red'))
            all_right = False

    # Python needs a separate check: run it and inspect the reported version.
    version = os.popen(PYTHON_PATH + ' -V').read().strip()
    if version == '' or version.split()[1].startswith('2'):
        print(add_color('×\tPYTHON_PATH', 'red'), end='\t')
        print(add_color('可修改:configure/PYTHON_PATH', 'red'))
        all_right = False
    else:
        print(add_color('√\tPYTHON_PATH', 'green'))

    if not all_right:
        exit()
Beispiel #7
0
def download_srr(srr_list, output_dir):
    """Download SRR files using a producer/consumer process model.

    One producer process feeds SRR accessions into a bounded queue while
    ``N_DOWNLOAD`` consumer processes pull from it and run ``wget_srr``.
    Finally reports which requested SRRs are still missing from
    ``output_dir``.  NOTE(review): the producer loop never exits on its
    own (see ``producer``), hence the final Ctrl+C hint.
    """

    print('开始下载')
    make_dir(output_dir)  # ensure the target directory exists

    # Start the producer feeding the shared task queue.
    queue_size = 100
    queue = Queue(queue_size)
    pro = Process(target=producer,
                  args=(srr_list, queue, PRINT_DETAIL, queue_size))
    pro.start()

    try:
        # Start the consumer processes.
        consumer_list = []
        new_consumer = partial(consumer, path=output_dir)
        for consumer_name in range(1, N_DOWNLOAD + 1):
            con = Process(target=new_consumer,
                          args=(consumer_name, queue, PRINT_DETAIL, wget_srr))
            con.start()
            consumer_list.append(con)
        # Wait for every consumer to finish.
        for con in consumer_list:
            con.join()
    except KeyboardInterrupt:
        print(add_color('如果是Windows下的话按两下Ctrl+C, 父进程和子进程全部马上结束', 'red'))
        print(add_color('如果是Linux下的话按两下Ctrl+C,等待子进程完成最后一个任务才会退出', 'red'))

    # Report SRRs that were requested but never appeared on disk.
    finished = {i.split('.')[0] for i in os.listdir(output_dir)}
    error = set(srr_list) - finished
    if error:
        print('%s个SRR没有下载' % add_color(len(error), 'red'))
        print(add_color(error, 'red'))
    else:
        print(add_color('所有SRR都被下载', 'green'))
    # Child processes must be stopped externally.
    print('按Ctrl+C退出')
Beispiel #8
0
def wget_srr(srr, print_detail, kwargs):
    """Download a single SRR file by running wget on the command line.

    Args:
        srr: SRR accession to download.
        print_detail: whether to print a per-file success/failure line.
        kwargs: dict with key 'path' — the download target directory.
    """
    path = kwargs['path']
    target = os.path.join(path, srr + '.sra')
    # Skip the download when the file already exists locally.
    if os.path.exists(target):
        return

    # Build the download URL: PREFIX/<first 6 chars>/<SRR>/<SRR>.sra
    url_split = [DOWNLOAD_PREFIX, srr[:6], srr, srr + '.sra']
    url = '/'.join(url_split)

    try:
        # -c enables resuming an interrupted download
        command_list = ['wget -c', str(url)]
        if not PRINT_WGET_DETAIL:
            command_list.append('-q')
        command_list.extend(['-P', str(path)])
        command = ' '.join(command_list)
        info = os.system(command)
        if info:
            # Bug fix: always delete the partially-downloaded file on
            # failure — previously this only happened when print_detail
            # was true, leaving corrupt files behind otherwise.
            remove_file(target)
            if print_detail:
                print(add_color('× ' + url, 'red'))
        elif print_detail:
            print(add_color('√ ' + url, 'green'))
    except KeyboardInterrupt:
        # This handler only fires on Windows, where the child process
        # catches KeyboardInterrupt first and then forwards it to the
        # parent.  On Linux the parent catches it first and the child
        # keeps running as an orphan; that case is handled in the
        # consumer function.

        # Delete the file whose download was interrupted midway.
        remove_file(target)
        raise  # bare raise preserves the original traceback
Beispiel #9
0
def final_work():
    """Post-processing: unload the STAR index and clean up stray files."""

    # ---- release the organism's STAR index from memory, if loaded ----
    if cache_load:
        command = ' '.join([
            star, '--genomeLoad', 'Remove', '--genomeDir',
            organism_genome_index
        ])
        os.system(command)
        print(add_color('release memory of %s Index' % organism, 'yellow'))

    # ---- remove files that appear unasked in the working directory ----
    for stray in ('./Aligned.out.sam', './Log.out', './Log.progress.out'):
        remove_file(stray)
    os.system('rm -r _STARtmp')
Beispiel #10
0
def get_cell_marker_dict(cell_marker, sep):
    """Load the cell-marker table and return marker genes per organism/cell type.

    Returns {organism: {(tissueType, cellName): set(gene_ids)}} plus an
    extra key 'all' mapping each organism to the union of all its marker
    genes.  Exits the process when a required column is missing or the
    table yields no organisms.
    """
    print('读取cell_marker:', cell_marker)
    with open(cell_marker, encoding='utf8') as f:
        cell_marker_data = pd.read_csv(f, sep=sep)

    # Validate required columns with an explicit check instead of assert,
    # which would be silently stripped under `python -O`.
    columns = cell_marker_data.columns
    for col_name in ['speciesType', 'tissueType', 'cellName', 'ensemblID']:
        if col_name not in columns:
            text = 'cell_marker must have column: ' + col_name
            print(add_color(text, 'red'))
            exit()

    cell_marker_dict = {}
    for organism, data in cell_marker_data.groupby('speciesType'):
        organism_cell_dict = {}
        for _, cell in data.iterrows():
            cell_type = (cell['tissueType'], cell['cellName'])
            # ensemblID holds a comma-separated gene list.
            gene_set = set(cell['ensemblID'].strip().split(','))
            organism_cell_dict[cell_type] = gene_set
        cell_marker_dict[organism] = organism_cell_dict

    print('读取到下面物种的marker基因信息: \n%s' % set(cell_marker_dict.keys()))
    if len(cell_marker_dict) == 0:
        exit()

    # Union of every cell type's markers per organism, stored under 'all'.
    sum_dict = {}
    for organism, cells in cell_marker_dict.items():
        organism_all_gene_set = set()
        for gene_set in cells.values():
            organism_all_gene_set.update(gene_set)
        sum_dict[organism] = organism_all_gene_set
    cell_marker_dict['all'] = sum_dict

    return cell_marker_dict
Beispiel #11
0
def producer(task, queue, print_detail, queue_size):
    """Feed tasks into the shared queue for consumer processes.

    Pops tasks from a private copy of ``task`` into ``queue``; once the
    list is exhausted it keeps putting the 'close' sentinel so every
    consumer eventually receives one.  NOTE: this loop never returns —
    the producer process must be terminated externally.
    """

    targets = task[:]
    n_task = len(targets)
    # n_put starts at -queue_size so the progress line only appears after
    # queue_size items have gone in — presumably to offset the queue's
    # internal buffering so the count tracks tasks actually taken by
    # consumers; TODO confirm against the consumer implementation.
    n_put = -queue_size
    n_rest = queue_size + len(task)

    while True:
        n_rest -= 1
        n_put += 1
        if targets:
            queue.put(targets.pop())

        else:
            # All tasks dispatched: send the termination sentinel.
            queue.put('close')

        # Print how many tasks were handed out and how many remain.
        if 0 < n_put <= n_task and print_detail:
            text = add_color('分发第%d个任务,剩余%d个任务' % (n_put, n_rest), 'green')
            print(text)
Beispiel #12
0
def srr_pool(srr_data):
    """Move every SRR-prefixed file under ``result`` into ``all_result``.

    Walks ``srr_data/result`` and gathers all files whose names start with
    'SRR' into ``srr_data/result/all_result``.  Exits when nothing ends up
    in the pool.
    """
    import shutil  # local import: the file-level import block is outside this view

    result_path = os.path.join(srr_data, 'result')
    all_result_path = os.path.join(result_path, 'all_result')
    make_dir(all_result_path)

    print('正在移动文件')
    abs_all_result_path = os.path.abspath(all_result_path)
    for dir_path, _, _ in os.walk(result_path):
        abs_dir_path = os.path.abspath(dir_path)
        if abs_dir_path == abs_all_result_path:
            continue
        file_list = [srr for srr in os.listdir(dir_path) if srr.startswith('SRR')]
        if file_list:
            print(f'{abs_dir_path}/*\t>\t{abs_all_result_path}/')
            # Bug fix: the original ran `mv dir/*`, which moved EVERY entry
            # in the directory (not just the SRR files the filter selected)
            # and broke on paths containing shell metacharacters.  Move only
            # the matched files, without a subshell.
            for srr in file_list:
                shutil.move(os.path.join(abs_dir_path, srr),
                            os.path.join(abs_all_result_path, srr))

    if len(os.listdir(all_result_path)) == 0:
        text = f'{result_path}中没有数据!'
        print(add_color(text, 'red'))
        exit()
Beispiel #13
0
def main_work(task_file):
    """Process every SRR listed in the task file, then clean up.

    NOTE(review): relies on module-level state (``task_list``,
    ``data_path``, ``organism``, ``temp_path``) presumably initialised by
    ``first_work(task_file)`` — confirm against the surrounding module.
    """

    first_work(task_file)
    # ==================== process every task in task_list ====================
    error = []
    for count, srr in enumerate(task_list, 1):
        srr_file = os.path.join(data_path, srr)
        file_size = round(os.path.getsize(srr_file) / 10**9, 2)
        text = 'start to handle: %s  --  %s  size: %sG (%d/%d)' % (
            organism, srr, str(file_size), count, len(task_list))
        print(add_color(text, 'yellow'))

        # srr_handle returns a (status, command) pair; truthy status means failure.
        foo, command = srr_handle(srr)
        if foo:
            text = 'error in %s --  %s\n command: %s' % (organism, srr,
                                                         command)
            print(add_color(text, 'red'))
            error.append(srr)
        else:
            text = 'success in : %s --  %s' % (organism, srr)
            print(add_color(text, 'green'))

        # Delete intermediate temp files promptly so they don't interfere
        # with the next step.
        if len(os.listdir(temp_path)) > 0:
            os.system('rm -r ' + os.path.join(temp_path, '*'))
    # ==================== report this process's results ====================
    text = 'task: %s finished' % task_file
    print(add_color(text, 'green'))

    if error:
        text = '%s do not handle correctly' % str(error)
        print(add_color(text, 'red'))
    else:
        text = 'all srr have finished correctly'
        print(add_color(text, 'green'))
    # ========================================
    final_work()
Beispiel #14
0
def gse_marker_handle(gse_data,
                      gse_organism_dict,
                      cell_marker_dict,
                      odds_ratio_threshold=2,
                      p_value_threshold=0.01,
                      method='greater'):
    """Assign cell types to clusters using Fisher's exact test on marker genes.

    For every GSE directory containing marker_genes.csv, tests the overlap
    between each cluster's marker genes and every known cell type's marker
    genes.  Overlaps passing both ``odds_ratio_threshold`` and
    ``p_value_threshold`` are written to cells_type.csv in the GSE directory.
    """
    assert method in {'two-sided', 'less', 'greater'}
    all_gse_data = os.listdir(gse_data)
    for count, gse in enumerate(all_gse_data, 1):
        print('========================================')
        gse_dir = os.path.join(gse_data, gse)
        marker_genes_file = os.path.join(gse_dir, 'marker_genes.csv')
        if os.path.isdir(gse_dir) and not os.path.isfile(marker_genes_file):
            # Directory exists but lacks marker_genes.csv: report and skip.
            text = f'不存在{marker_genes_file}'
            print(add_color(text, 'red'))
        else:
            if gse not in gse_organism_dict:
                text = f'GSE_info中没有{gse}的物种信息!'
                print(add_color(text, 'red'))
                continue

            organism = gse_organism_dict[gse].replace(' ', '_')
            if organism not in cell_marker_dict:
                text = f'{gse}: cell_marker中没有{organism}的marker基因信息!'
                print(add_color(text, 'red'))
                continue

            text = f'正在处理: {gse} {organism} ({count}/{len(all_gse_data)})'
            print(add_color(text, 'yellow'))
            with open(marker_genes_file, 'r', encoding='utf8') as f:
                marker_genes_data = pd.read_csv(f, sep=',')

            item_list = []
            all_marker = cell_marker_dict['all'][organism]  # all markers of this organism
            for cluster, data in marker_genes_data.groupby('cluster'):
                # This cluster's markers that are known markers at all.
                cluster_marker = set(data['gene']) & all_marker
                n_all_marker = len(all_marker)
                n_cluster_marker = len(cluster_marker)
                if n_cluster_marker == 0:
                    continue
                # Fraction of all markers captured by this cluster.
                cluster_marker_prop = n_cluster_marker / n_all_marker
                for cell_type, cell_type_marker in cell_marker_dict[
                        organism].items():
                    n_cell_type_marker = len(cell_type_marker)
                    # Hits expected by chance if the cluster's markers were
                    # drawn uniformly from all markers.
                    n_expected_hit = cluster_marker_prop * n_cell_type_marker
                    n_hit = len(cluster_marker & cell_type_marker)
                    odds_ratio = n_hit / n_expected_hit  # observed vs expected hits
                    if odds_ratio > odds_ratio_threshold:
                        # Build the 2x2 contingency table for Fisher's exact test.
                        n_non_hit_cell_type_marker = n_cell_type_marker - n_hit
                        # Bug fix: this margin must come from the CLUSTER's
                        # marker count; the original reused
                        # n_cell_type_marker, corrupting the table.
                        n_non_hit_cluster_marker = n_cluster_marker - n_hit
                        n_other_marker = n_all_marker - n_hit - n_non_hit_cell_type_marker - n_non_hit_cluster_marker
                        table = [[n_other_marker, n_non_hit_cell_type_marker],
                                 [n_non_hit_cluster_marker, n_hit]]
                        p_value = stats.fisher_exact(table, method)[1]
                        if p_value < p_value_threshold:
                            item = [
                                cluster, n_all_marker, n_cluster_marker,
                                n_cell_type_marker, n_hit, n_expected_hit,
                                odds_ratio, p_value, organism, cell_type[0],
                                cell_type[1]
                            ]
                            item_list.append(item)
            if item_list:
                item_data = pd.DataFrame(item_list)
                item_data.columns = [
                    'cluster', 'n_all_marker', 'n_cluster_marker',
                    'n_cell_type_marker', 'n_hit', 'n_expected_hit',
                    'odds_ratio', 'p_value', 'organism', 'tissueType',
                    'cellName'
                ]
                item_data.sort_values(by=['cluster', 'p_value'], inplace=True)
                cells_type_file = os.path.join(gse_dir, 'cells_type.csv')
                with open(cells_type_file, 'w', encoding='utf8') as f:
                    item_data.to_csv(f, index=False)
                text = f'处理完毕: {gse}'
                print(add_color(text, 'green'))
            else:
                text = f'没有cluster可以标记cell_type: {gse}'
                print(add_color(text, 'yellow'))

    text = '所有GSE都处理完毕!'
    print(add_color(text, 'green'))
Beispiel #15
0
def check(target):
    """Check one aspect of the environment: OS, CPU, memory, disk or network.

    Returns a dict of the gathered values ({} for targets that collect
    none, e.g. 'device' or an unknown target), or None when the check
    raises on the current platform.
    """
    try:
        print('====================')
        result = {}
        if target == 'operator_system':
            print('检查操作系统:')
            operator_system = platform.platform()
            result['operator_system'] = operator_system
            text = add_color(operator_system, 'green')
        elif target == 'cpu':
            print('检查CPU:')
            true_cpu = psutil.cpu_count(logical=False)
            logical_cpu = psutil.cpu_count(logical=True)
            result['true_cpu'] = true_cpu
            result['logical_cpu'] = logical_cpu
            text = '物理核数:%s  逻辑核数:%s' % (add_color(
                str(true_cpu), 'green'), add_color(str(logical_cpu), 'green'))
        elif target == 'memory':
            print('检查内存:')
            size = psutil.virtual_memory()
            free = round(size.free / 10**9, 3)
            used = round(size.used / 10**9, 3)
            result['used'] = used
            result['free'] = free
            text = '内存   free: %s  used: %s' % (add_color(
                str(free) + 'G', 'green'), add_color(str(used) + 'G', 'red'))
        elif target == 'device':
            print('检查硬盘:')
            print(add_color('在Linux下结果可能不准,在命令行中输入df -h查看硬盘', 'red'))
            lines = []
            for device in psutil.disk_partitions():
                size = psutil.disk_usage(device.device)
                free = add_color(
                    str(round(size.free / 10**9, 3)) + 'G', 'green')
                used = add_color(str(round(size.used / 10**9, 3)) + 'G', 'red')
                lines.append('%s   free: %s  used: %s' %
                             (device.device, free, used))
            text = '\n'.join(lines)
        elif target == 'network':
            print('检查网络:')
            print(add_color('Linux下如果不能自行停止请按Ctrl+C', 'red'))
            url = 'www.baidu.com'
            connect = os.system('ping %s' % url)
            result['connect'] = connect
            if connect:
                text = add_color('%s 连接失败' % url, 'red')
            else:
                text = add_color('%s 连接成功' % url, 'green')
        else:
            text = "target must be in {operator_system, cpu, memory, device, network}"

        print(text)
        return result
    except Exception:
        print(add_color('无法检查当前操作系统的%s' % target, 'red'))
        return None
Beispiel #16
0
def split_gse(gse_data, gse_organism_dict, organism_genes_dict, coding_data,
              ncoding_data):
    """Split each GSE expression matrix into coding and non-coding matrices.

    For every GSE directory under ``gse_data``, partitions the rows of
    matrix.csv by gene type (per ``organism_genes_dict``) and writes the
    sub-matrices under ``coding_data``/``ncoding_data`` respectively.
    """
    print('切分表达矩阵获得编码和非编码两个文件')
    make_dir(coding_data)
    make_dir(ncoding_data)
    all_gse_data = os.listdir(gse_data)
    for count, gse in enumerate(all_gse_data, 1):
        print('========================================')
        gse_dir = os.path.join(gse_data, gse)
        gse_file = os.path.join(gse_dir, 'matrix.csv')
        if not os.path.isdir(gse_dir):
            # Bug fix: stray non-directory entries previously fell into the
            # processing branch and crashed when opening matrix.csv.
            continue
        if not os.path.isfile(gse_file):
            # Directory exists but holds no matrix.csv: report and skip.
            text = f'不存在{gse_file}'
            print(add_color(text, 'red'))
            continue
        if gse not in gse_organism_dict:
            text = f'GSE_info中没有{gse}的物种信息!'
            print(add_color(text, 'red'))
            continue
        organism = gse_organism_dict[gse].replace(' ', '_')
        if organism not in organism_genes_dict:
            text = f'{gse}: GENE_info中没有{organism}的基因信息!'
            print(add_color(text, 'red'))
            continue

        file_size = '%.3fM' % (os.path.getsize(gse_file) / (10**6))
        text = f'正在处理: {gse} {organism} {file_size} ({count}/{len(all_gse_data)})'
        print(add_color(text, 'yellow'))
        coding = organism_genes_dict[organism]['coding']
        ncoding = organism_genes_dict[organism]['ncoding']
        with open(gse_file) as f:
            matrix_data = pd.read_csv(f, index_col=0)

        # Partition the matrix's row names by gene type.
        coding_genes = [gene for gene in matrix_data.index if gene in coding]
        ncoding_genes = [gene for gene in matrix_data.index if gene in ncoding]

        # ---- save the coding-gene matrix ----
        if coding_genes:
            print('找到%d个Coding genes' % len(coding_genes))
            coding_dir = os.path.join(coding_data, gse)
            coding_file = os.path.join(coding_dir, 'matrix.csv')
            make_dir(coding_dir)
            with open(coding_file, 'w') as f:
                matrix_data.loc[coding_genes, :].to_csv(f, sep=',')
        else:
            text = f'{gse_file}: 未发现Coding genes'
            print(add_color(text, 'yellow'))

        # ---- save the non-coding-gene matrix ----
        if ncoding_genes:
            print('找到%d个Non coding genes' % len(ncoding_genes))
            ncoding_dir = os.path.join(ncoding_data, gse)
            ncoding_file = os.path.join(ncoding_dir, 'matrix.csv')
            make_dir(ncoding_dir)
            with open(ncoding_file, 'w') as f:
                matrix_data.loc[ncoding_genes, :].to_csv(f, sep=',')
        else:
            text = f'{gse_file}: 未发现Non coding genes'
            print(add_color(text, 'yellow'))

        text = f'处理完毕: {gse}'
        print(add_color(text, 'green'))
def check_last(srr_data):
    """Interactively check OS, CPU, memory and disk before running the pipeline.

    Each section calls ``check(...)``, prints advice, and pauses for user
    confirmation via input().  Skipped entirely unless the user answers 'y'
    at the first prompt.
    """

    foo = input('是否需要检查环境:(y/n)')
    if foo != 'y':
        return
    # ---- operating system ----
    result = check('operator_system')
    if result is not None:
        if 'Windows' in result['operator_system']:
            print(add_color('Windows下将不能运行此程序生成的脚本,请切换的Linux环境下!', 'red'))
            foo = input('继续?(y/n)')
            if foo != 'y':
                exit()

    # ---- CPU ----
    result = check('cpu')
    if result is not None:
        # One worker per distribution file.
        n_work = len(os.listdir(os.path.join(srr_data, 'distribution')))
        if TIANHE:
            print('在天河二号上运行,预计需要%d个节点' % n_work)
            print('如果大于60节点,要用BIOJOB分区的节点进行计算')
        else:
            expect_cpu = CPU_PER_WORKER * n_work
            print('预计需要物理核数:%s' % str(expect_cpu))
            if result['true_cpu'] < expect_cpu:
                print('如果CPU核数不够任务将来回切换,开销很大')
                print('可修改:(configure/CPU_PER_WORKER)')
            input('\nproceed')

    # ---- memory ----
    result = check('memory')
    if result is not None:
        cache_load_organism = {i.replace(' ', '_') for i in CACHE_LOAD}
        if cache_load_organism:
            print('%s基因组STAR_Index保留在内存以加速处理' % str(cache_load_organism))
            print('每个哺乳动物的Index大约30G')
            # ~30G per cached mammalian index, plus 30G headroom.
            if result['free'] < len(cache_load_organism) * 30 + 30:
                print('如果内存不够将使用虚拟内存,速度会较慢')
                print('可修改:(configure/CACHE_LOAD)')
        input('\nproceed')

    # ---- disk ----
    check('device')
    print('检查最大的几个SRR文件')
    data_path = os.path.join(srr_data, 'data')
    result_path = os.path.join(srr_data, 'result')
    data_size = {}
    # SRRs that already have results anywhere under result/.
    finished_srr = {
        srr + '.sra'
        for walk in os.walk(result_path) for srr in walk[2]
        if srr.startswith('SRR')
    }
    all_srr = list(
        {srr
         for srr in os.listdir(data_path) if srr.endswith('.sra')} -
        finished_srr)
    for srr in all_srr:
        data_size[srr] = os.path.getsize(os.path.join(data_path, srr)) / (10**
                                                                          9)
    # Show the ten largest unprocessed files in data/ (finished SRRs were
    # excluded above).
    maxsize_data = sorted(data_size.keys(),
                          key=lambda x: data_size[x],
                          reverse=True)[:10]
    print('======================')
    for srr in maxsize_data:
        print('%s : %.2fG' % (srr, data_size[srr]))
    print('======================')
    print('临时文件fastq:srr转换为fastq格式后大小扩大约4倍')
    print('临时文件sam:fastq转换为sam格式后大小扩大1至3倍不等')
    print('最终文件feature:典型的人类的一个样本大概29M,小鼠为19M,其它物种较小')
    print('多个任务同时处理时将会有多个临时文件,请确保硬盘空间足够')
    input('\nproceed')
def distribute_srr(srr_dict, srr_data, print_detail=PRINT_DETAIL):
    """Distribute .sra files into per-worker task files under distribution/.

    Moves .sra files from ``srr_data`` into ``srr_data/data``, excludes
    SRRs that already have results, then writes task files each holding at
    most TASK_PER_WORKER SRRs of a single organism.
    """

    data_path = os.path.join(srr_data, 'data')
    result_path = os.path.join(srr_data, 'result')

    # Only files with the .sra suffix are handled.
    all_srr = [srr for srr in os.listdir(srr_data) if srr.endswith('.sra')]
    # Move them into the data directory.
    print('正在移动文件')
    for count, srr in enumerate(all_srr, 1):
        from_path = os.path.join(srr_data, srr)
        to_path = os.path.join(data_path, srr)
        shutil.move(from_path, to_path)
        if print_detail and count % 300 == 0:
            print('处理进度:%d/%d' % (count, len(all_srr)))

    # SRRs already present under result/ are finished; exclude them via a
    # set difference before distributing.
    finished_srr = {
        srr + '.sra'
        for walk in os.walk(result_path) for srr in walk[2]
        if srr.startswith('SRR')
    }
    all_srr = list(
        {srr
         for srr in os.listdir(data_path) if srr.endswith('.sra')} -
        finished_srr)
    print('正在分配%d个SRR文件' % len(all_srr))
    file_name_count = {}  # organism -> next task-file index
    file_list = {}  # organism -> SRRs accumulated for the current task file
    error = []  # SRRs with no organism info
    for count, srr in enumerate(all_srr, 1):

        if print_detail and count % 300 == 0:
            print('处理进度:%d/%d' % (count, len(all_srr)))

        srr_name = srr.split('.')[0]
        try:
            # Look up the SRR's organism; spaces become underscores.
            organism = srr_dict[srr_name].replace(' ', '_')
        except KeyError:
            # SRRs without organism info are excluded from further processing.
            error.append(srr_name)
            continue

        # Assign tasks to workers; each worker handles a single organism.
        if organism not in file_name_count:
            file_name_count[organism] = 1
            file_list[organism] = []
        if len(file_list[organism]) >= TASK_PER_WORKER:
            # Current batch is full: flush it to a distribution file before
            # appending, then start a new batch for the next worker.
            file_name = organism + '_' + str(file_name_count[organism])
            file_name = os.path.join(srr_data, 'distribution', file_name)
            with open(file_name, 'w') as f:
                f.write('\n'.join(file_list[organism]))

            file_name_count[organism] += 1  # bump the file-name counter
            file_list[organism] = []  # start a fresh batch
        file_list[organism].append(srr)

    # Flush the final (possibly short) batch for every organism.
    for organism, srr_list in file_list.items():
        if srr_list:
            file_name = organism + '_' + str(file_name_count[organism])
            file_name = os.path.join(srr_data, 'distribution', file_name)
            with open(file_name, 'w') as f:
                f.write('\n'.join(srr_list))

    if error:
        text = '以下SRR文件未找到所属物种,请将信息添加到SRR_info\n%s' % str(error)
        print(add_color(text, 'red'))
    text = '分配任务完毕,在%s' % os.path.abspath(
        os.path.join(srr_data, 'distribution'))
    print(add_color(text, 'green'))
Beispiel #19
0
def integrate_srr(gse_dict, srr_data, gse_data, print_detail=PRINT_DETAIL):
    """Integrate per-SRR featureCounts results into per-GSE expression matrices.

    For every GSE whose finished-SRR fraction reaches THRESHOLD, reads the
    counts in parallel and writes gse_data/<GSE>/matrix.csv, optionally
    normalised to RPKM or TPM depending on the VALUE setting.
    """

    result_path = os.path.join(srr_data, 'result')
    all_result_path = os.path.join(result_path, 'all_result')
    all_result_srr = set(os.listdir(all_result_path))

    for gse_count, (gse, srr_list) in enumerate(gse_dict.items(), 1):
        print(f'===================={gse_count}/{len(gse_dict)}====================')
        finished_srr = [srr for srr in srr_list if srr in all_result_srr]

        # ============ report completion status ============
        completion = f'({len(finished_srr)}/{len(srr_list)})'
        if len(finished_srr) == 0:
            text = f'{gse}中的SRR完全没有被处理{completion}!'
            print(add_color(text, 'red'))
        else:
            # Report which SRRs of this GSE are missing, if any.
            error_srr = set(srr_list) - set(finished_srr)
            if len(error_srr) == 0:
                text = f'{gse}中的SRR处理完全{completion}!'
                print(add_color(text, 'green'))
            else:
                text = f'{gse}中的SRR缺失{completion}!'
                print(add_color(text, 'yellow'))
                if print_detail:
                    print(add_color(error_srr, 'yellow'))

            # ============ integrate GSEs whose completion passes THRESHOLD ============
            # Only GSEs whose finished fraction reaches THRESHOLD are merged
            # into an expression matrix.
            if len(finished_srr) / len(srr_list) < THRESHOLD:
                text = f'{gse}中SRR缺失过多,不予整合!'
                print(add_color(text, 'yellow'))
            else:
                gse_dir = os.path.join(gse_data, gse)
                make_dir(gse_dir)
                matrix_file = os.path.join(gse_dir, 'matrix.csv')

                # If a matrix.csv already exists, check whether its header
                # covers exactly the finished SRRs to avoid redoing work.
                if os.path.exists(matrix_file):
                    with open(matrix_file, 'r', encoding='utf8') as f:
                        matrix_srr = [srr for srr in f.readline().strip().split(',') if srr]
                        if set(matrix_srr) == set(finished_srr):
                            print('已找到整合好的matrix.csv文件!')
                            continue

                # Read the gene list (and lengths) from a single sample file;
                # per the original author, featureCounts emits the same gene
                # order and count in every file.
                with open(os.path.join(all_result_path, finished_srr[0]), 'r', encoding='utf8') as f:
                    # Skip the two featureCounts header lines.
                    f.readline()
                    f.readline()
                    genes_length = []
                    genes_list = []
                    for line in f:
                        genes_length.append(int(line.strip().split('\t')[-2]))
                        genes_list.append(line.strip().split('\t')[0])

                # Integrate the SRRs into the GSE matrix with multiple processes.
                print('开启%d个进程整合SRR数据!' % N_INTEGRATION)
                # NOTE(review): this distribute_srr takes (list, n_worker=...) —
                # presumably a task-splitting helper from this module, not the
                # file-moving distribute_srr; confirm against the module.
                n_worker, srr_per_worker = distribute_srr(finished_srr, n_worker=N_INTEGRATION)
                pool = Pool(processes=n_worker)
                new_integration_worker = partial(integration_worker, all_result_path=all_result_path)
                result = pool.map(new_integration_worker, srr_per_worker)
                gse_data_dict = {key: every_dict[key] for every_dict in result for key in every_dict}
                gse_matrix = pd.DataFrame(gse_data_dict, index=genes_list)  # expression matrix: genes x samples
                if VALUE == 'RPKM':
                    cells_numi = gse_matrix.sum(axis=0)
                    gse_matrix = gse_matrix.div(cells_numi, axis=1).div(genes_length, axis=0) * 10**9
                elif VALUE == 'TPM':
                    foo = gse_matrix.div(genes_length, axis=0) * 1000
                    foo_numi = foo.sum(axis=0)
                    gse_matrix = foo.div(foo_numi) * 10**6
                print('整合完毕,保存数据中!')

                with open(matrix_file, 'w', encoding='utf8') as f:
                    # TPM/RPKM magnitudes bottom out around three decimal
                    # places, so three decimals are kept when saving.
                    gse_matrix.to_csv(f, sep=',', header=True, index=True, float_format='%.3f')
                    text = '保存成功:%s' % os.path.abspath(matrix_file)
                    print(add_color(text, 'green'))