def get_gse_dict(srr_file, sep=SEP):
    """Read SRR_info and build a {GSE: sorted SRR list} mapping.

    GSEs whose SRRs span more than one organism are skipped with a warning.
    Exits the program when the file is empty or nothing usable was read.
    """
    print('读取SRR_INFO:%s' % srr_file)
    with open(srr_file, 'r', encoding='utf8') as f:
        srr_info = pd.read_csv(f, sep=sep)
    if srr_info.shape[0] == 0:
        print('%s:文件里什么信息都没有!' % srr_file)
        exit()
    gse_dict = {}
    srr_count = 0
    for gse, data in srr_info.groupby('GSE'):
        if len(set(data['organism'])) > 1:
            # Mixed-organism GSEs are not processed.
            text = f'{gse}中的SRR属于多个物种,将不予处理!'
            print(add_color(text, 'yellow'))
            continue
        # Keep each GSE's SRR accessions in sorted order.
        gse_dict[gse] = sorted(data['SRR'].tolist())
        srr_count += len(data['SRR'])
    print('读取到%d个GSE的%d个SRR信息' % (len(gse_dict), srr_count))
    if len(gse_dict) == 0:
        exit()
    return gse_dict
def get_gse_summary(gse_organism_dict, gse_data):
    """Summarize the processing result of every GSE directory under gse_data.

    Returns a DataFrame indexed by GSE with columns
    organism / samples / genes / clusters; any piece that cannot be
    determined is recorded as the string 'None'.
    """
    print('开始统计GSE处理结果')
    summary = {}
    for gse in os.listdir(gse_data):
        gse_dir = os.path.join(gse_data, gse)
        if not os.path.isdir(gse_dir):
            continue
        gse_file = os.path.join(gse_dir, 'matrix.csv')
        gse_pca_file = os.path.join(gse_dir, 'pca.csv')
        # ---- organism of this GSE ----
        if gse in gse_organism_dict:
            organism = gse_organism_dict[gse]
        else:
            organism = 'None'
            print(add_color(f'不存在{gse}的物种信息', 'yellow'))
        # ---- sample and gene counts from the expression matrix ----
        if os.path.isfile(gse_file):
            with open(gse_file, 'r', encoding='utf8') as f:
                # First CSV column is the gene name, the rest are samples.
                samples = len(f.readline().strip().split(',')) - 1
                genes = sum(1 for _ in f)
        else:
            samples = genes = 'None'
            print(add_color(f'不存在{gse_file}', 'yellow'))
        # ---- cluster count from the PCA result ----
        if os.path.isfile(gse_pca_file):
            with open(gse_pca_file, 'r', encoding='utf8') as f:
                pca = pd.read_csv(f, index_col=0)
            # Skips the first cluster entry — presumably a
            # header/placeholder row; verify against the pca.csv writer.
            clusters = len(set(pca['cluster'][1:]))
        else:
            clusters = 'None'
            print(add_color(f'不存在{gse_pca_file}', 'yellow'))
        summary[gse] = {
            'organism': organism,
            'samples': samples,
            'genes': genes,
            'clusters': clusters
        }
    print('统计到了%d个GSE处理的结果' % len(summary))
    table = pd.DataFrame(summary).reindex(
        ['organism', 'samples', 'genes', 'clusters']).transpose()
    table.index.name = 'GSE'
    return table
def generate_script(srr_data, output_file):
    """Generate one runnable shell script per distribution file, plus a
    master script (output_file) that launches all of them in the background.
    """
    print('正在批量生成处理脚本')
    header = '#!/bin/bash\n'
    script_dir = os.path.abspath(os.path.join(srr_data, 'script'))
    distribution_dir = os.path.abspath(os.path.join(srr_data, 'distribution'))
    here = os.path.dirname(os.path.abspath(__file__))
    # Tianhe-2 uses a dedicated worker entry point.
    worker_name = 'srr_worker_tianhe.py' if TIANHE else 'srr_worker.py'
    worker_file = os.path.join(here, worker_name)
    generated = []
    for task_name in os.listdir(distribution_dir):
        task = os.path.join(distribution_dir, task_name)
        # One .sh wraps one distribution file, i.e. one task.
        script_file = os.path.join(script_dir, 'Run_' + task_name + '.sh')
        with open(script_file, 'w') as f:
            f.write(header)
            f.write('%s %s %s' % (PYTHON_PATH, worker_file, task))
        os.system('chmod 755 ' + script_file)
        generated.append(script_file)
    text = '所有脚本生成完毕:%s' % os.path.abspath(os.path.join(srr_data, 'script'))
    print(add_color(text, 'green'))
    # Master script: launch every per-task script in the background.
    with open(output_file, 'w') as f:
        f.write(header)
        for script in generated:
            if TIANHE:
                command = 'yhbatch -N 1 -n 1 -p %s %s' % (TIANHE_NODE_NAME, script)
            else:
                command = script
            f.write(command + ' &\n')
    os.system('chmod 755 ' + output_file)
    text = '总脚本生成完毕:%s' % os.path.abspath(output_file)
    print(add_color(text, 'green'))
def integration_worker(sub_finished_srr, all_result_path):
    """Read featureCounts output for a sub-list of SRRs in one process.

    Returns {srr: [count, ...]} with one integer per gene line; SRRs whose
    files cannot be parsed are reported and skipped.
    """
    pid = os.getpid()
    collected = {}
    total = len(sub_finished_srr)
    for idx, srr in enumerate(sub_finished_srr, 1):
        # Periodic progress line when detailed output is enabled.
        if idx % 300 == 0 and PRINT_DETAIL:
            print(f'进程: {pid} 完成: {idx}/{total}')
        try:
            feature_file = os.path.join(all_result_path, srr)
            with open(feature_file) as f:
                f.readline()  # skip the two header lines
                f.readline()
                counts = []
                for line in f:
                    # The last tab-separated field is the read count.
                    counts.append(int(line.strip().split('\t')[-1]))
            collected[srr] = counts
        except Exception as e:
            print(add_color('处理错误: ' + srr, 'red'))
            print(add_color(str(e), 'red'))
    return collected
def gse_handle(sub_gse_list, gse_data):
    """Process every GSE in sub_gse_list inside a single worker process."""
    pid = str(os.getpid())
    total = len(sub_gse_list)
    for n, gse in enumerate(sub_gse_list, 1):
        prefix = f'进程: {pid}\t任务: {n}/{total}\t'
        gse_file = os.path.abspath(os.path.join(gse_data, gse, 'matrix.csv'))
        if not os.path.exists(gse_file):
            print(add_color(prefix + '文件不存在: ' + gse_file, 'red'))
            continue
        file_size = '%.2fM' % (os.path.getsize(gse_file) / (10**6))
        print(add_color(prefix + '开始处理: ' + gse + '\t大小: ' + file_size,
                        'yellow'))
        try:
            # Project encapsulates the whole per-GSE analysis pipeline.
            project = Project(gse_file)
            project.compute_all_steps()
            print(add_color(prefix + '处理完毕: ' + gse, 'green'))
        except Exception as e:
            print(add_color(prefix + '处理错误: ' + gse, 'red'))
            print(add_color(str(e), 'red'))
    print(add_color(f'进程: {pid}已结束所有任务!', 'green'))
def check_fist():
    """Verify that the configured software/data paths exist and that the
    configured Python is a usable (non-2.x) interpreter; exit on failure.

    NOTE(review): the name looks like a typo for check_first, but it is the
    public entry point, so it is kept unchanged.
    """
    print('正在检查软件和数据路径是否有误')
    ok = True
    names = ['GENOME', 'GENOME_INDEX', 'FASTERQ_DUMP', 'STAR', 'FEATURE_COUNTS']
    values = [GENOME, GENOME_INDEX, FASTERQ_DUMP, STAR, FEATURE_COUNTS]
    for name, value in zip(names, values):
        if os.path.exists(value):
            print(add_color('√\t' + name, 'green'))
        else:
            print(add_color('×\t' + name, 'red'), end='\t')
            print(add_color('可修改:configure/' + name, 'red'))
            ok = False
    # Python is checked separately: run `python -V` and parse the version.
    version = os.popen(PYTHON_PATH + ' -V').read().strip()
    if version == '' or version.split()[1].startswith('2'):
        print(add_color('×\tPYTHON_PATH', 'red'), end='\t')
        print(add_color('可修改:configure/PYTHON_PATH', 'red'))
        ok = False
    else:
        print(add_color('√\tPYTHON_PATH', 'green'))
    if not ok:
        exit()
def download_srr(srr_list, output_dir):
    """Download SRR files with a producer/consumer process pool.

    Args:
        srr_list: SRR accessions to download.
        output_dir: directory that receives the <SRR>.sra files.

    Side effects: spawns one producer and N_DOWNLOAD consumer processes and
    prints a report of SRRs that were not downloaded. The producer process
    never terminates on its own (see producer) — the final message asks the
    user to press Ctrl+C.
    """
    print('开始下载')
    make_dir(output_dir)  # ensure the target directory exists
    # Start the producer, which feeds SRR accessions into a bounded queue.
    queue_size = 100
    queue = Queue(queue_size)
    pro = Process(target=producer,
                  args=(srr_list, queue, PRINT_DETAIL, queue_size))
    pro.start()
    try:
        # Start the consumer pool; each consumer pulls accessions from the
        # queue and runs wget_srr with output_dir bound via partial.
        consumer_list = []
        new_consumer = partial(consumer, path=output_dir)
        for consumer_name in range(1, N_DOWNLOAD + 1):
            con = Process(target=new_consumer,
                          args=(consumer_name, queue, PRINT_DETAIL, wget_srr))
            con.start()
            consumer_list.append(con)
        # Wait until every consumer has finished.
        for con in consumer_list:
            con.join()
    except KeyboardInterrupt:
        print(add_color('如果是Windows下的话按两下Ctrl+C, 父进程和子进程全部马上结束', 'red'))
        print(add_color('如果是Linux下的话按两下Ctrl+C,等待子进程完成最后一个任务才会退出', 'red'))
    # Error report: anything requested but absent from output_dir failed.
    finished = {i.split('.')[0] for i in os.listdir(output_dir)}
    error = set(srr_list) - finished
    if error:
        print('%s个SRR没有下载' % add_color(len(error), 'red'))
        print(add_color(error, 'red'))
    else:
        print(add_color('所有SRR都被下载', 'green'))
    # The producer child process must be stopped externally.
    print('按Ctrl+C退出')
def wget_srr(srr, print_detail, kwargs):
    """Download a single SRR archive with wget.

    Args:
        srr: SRR accession, e.g. 'SRR1234567'.
        print_detail: when truthy, print a per-file success/failure line.
        kwargs: dict with key 'path' — the download directory.

    Skips the download when <path>/<srr>.sra already exists. On wget failure
    the partial file is removed; on KeyboardInterrupt the partial file is
    removed and the interrupt is re-raised for the caller to handle.
    """
    path = kwargs['path']
    # Already downloaded (or a previous -c resume completed): nothing to do.
    if os.path.exists(os.path.join(path, srr + '.sra')):
        return
    # NCBI layout: <prefix>/<first 6 chars>/<SRR>/<SRR>.sra
    url_split = [DOWNLOAD_PREFIX, srr[:6], srr, srr + '.sra']
    url = '/'.join(url_split)
    try:
        # -c resumes interrupted downloads; -q silences wget unless the
        # configuration asks for its full output.
        command_list = ['wget -c', str(url)]
        if not PRINT_WGET_DETAIL:
            command_list.append('-q')
        command_list.extend(['-P', str(path)])
        command = ' '.join(command_list)
        info = os.system(command)
        if print_detail:
            if info:
                text = add_color('× ' + url, 'red')
                # wget failed: drop the partial file so a retry starts clean.
                remove_file(os.path.join(path, srr + '.sra'))
            else:
                text = add_color('√ ' + url, 'green')
            print(text)
    except KeyboardInterrupt:
        # This branch only fires on Windows, where the child process receives
        # KeyboardInterrupt before the parent; on Linux the parent gets it
        # first and the orphaned child keeps running (handled in consumer).
        # Remove the half-downloaded file before propagating.
        remove_file(os.path.join(path, srr + '.sra'))
        # Bug fix: bare raise re-raises the caught exception and preserves
        # its traceback, instead of constructing a new KeyboardInterrupt.
        raise
def final_work():
    """Post-run cleanup for a worker process.

    Relies on module globals (cache_load, star, organism_genome_index,
    organism) presumably initialised by first_work — confirm in the
    surrounding module.
    """
    # Unload this organism's STAR index from shared memory if it was kept
    # resident for speed.
    if cache_load:
        os.system(' '.join([
            star, '--genomeLoad', 'Remove', '--genomeDir',
            organism_genome_index
        ]))
        print(add_color('release memory of %s Index' % organism, 'yellow'))
    # STAR/featureCounts drop these files in the CWD for no good reason;
    # remove the litter.
    for leftover in ('./Aligned.out.sam', './Log.out', './Log.progress.out'):
        remove_file(leftover)
    os.system('rm -r _STARtmp')
def get_cell_marker_dict(cell_marker, sep):
    """Load the cell_marker table and build per-organism marker-gene sets.

    Args:
        cell_marker: path to the cell_marker table.
        sep: field separator for pandas.read_csv.

    Returns:
        {organism: {(tissueType, cellName): set(gene ids)}} plus an extra
        key 'all' mapping each organism to the union of all its marker
        genes. Exits the program on a malformed or empty file.
    """
    print('读取cell_marker:', cell_marker)
    with open(cell_marker, encoding='utf8') as f:
        cell_marker_data = pd.read_csv(f, sep=sep)
    # Validate the required columns with an explicit check: the previous
    # try/assert pattern is silently stripped when Python runs with -O.
    columns = cell_marker_data.columns
    for col_name in ['speciesType', 'tissueType', 'cellName', 'ensemblID']:
        if col_name not in columns:
            text = 'cell_marker must have column: ' + col_name
            print(add_color(text, 'red'))
            exit()
    cell_marker_dict = {}
    for organism, data in cell_marker_data.groupby('speciesType'):
        organism_cell_dict = {}
        for _, cell in data.iterrows():
            cell_type = (cell['tissueType'], cell['cellName'])
            # ensemblID holds a comma-separated gene list.
            gene_set = set(cell['ensemblID'].strip().split(','))
            organism_cell_dict[cell_type] = gene_set
        cell_marker_dict[organism] = organism_cell_dict
    print('读取到下面物种的marker基因信息: \n%s' % set(cell_marker_dict.keys()))
    if len(cell_marker_dict) == 0:
        exit()
    # Per-organism union of every cell type's markers, stored under 'all'.
    sum_dict = {}
    for organism in cell_marker_dict:
        organism_all_gene_set = set()
        for cell_type, gene_set in cell_marker_dict[organism].items():
            organism_all_gene_set.update(gene_set)
        sum_dict[organism] = organism_all_gene_set
    cell_marker_dict['all'] = sum_dict
    return cell_marker_dict
def producer(task, queue, print_detail, queue_size):
    """Feed tasks into a multiprocess queue shared with the consumers.

    Once the task list is exhausted it keeps putting the 'close' sentinel so
    that every consumer eventually receives one. NOTE(review): the loop never
    breaks, so this process ends up blocked on a queue full of sentinels and
    must be terminated externally (download_srr tells the user to Ctrl+C).
    """
    targets = task[:]  # copy; pop() below consumes it from the tail
    n_task = len(targets)
    # Counters start offset by one queue's worth — presumably so the
    # progress line reports tasks handed to consumers rather than merely
    # buffered in the queue; TODO confirm.
    n_put = -queue_size
    n_rest = queue_size + len(task)
    while True:
        n_rest -= 1
        n_put += 1
        if targets:
            queue.put(targets.pop())
        else:
            # Task list drained: send the termination sentinel to consumers.
            queue.put('close')
        # Progress: number handed out and number remaining.
        if 0 < n_put <= n_task and print_detail:
            text = add_color('分发第%d个任务,剩余%d个任务' % (n_put, n_rest), 'green')
            print(text)
def srr_pool(srr_data):
    """Move every SRR-prefixed result file under result/ into result/all_result.

    Walks the result tree and shell-moves the contents of any directory that
    contains SRR* files. Exits the program when all_result ends up empty.
    """
    result_path = os.path.join(srr_data, 'result')
    all_result_path = os.path.join(result_path, 'all_result')
    make_dir(all_result_path)
    print('正在移动文件')
    for dir_path, _, _ in os.walk(result_path):
        abs_all_result_path = os.path.abspath(all_result_path)
        abs_dir_path = os.path.abspath(dir_path)
        # Skip the destination directory itself.
        if abs_dir_path != abs_all_result_path:
            file_list = [srr for srr in os.listdir(dir_path)
                         if srr.startswith('SRR')]
            if len(file_list) > 0:
                print(f'{abs_dir_path}/*\t>\t{abs_all_result_path}/')
                # NOTE(review): `mv dir/*` moves everything in the directory,
                # not only the SRR* entries checked above — confirm non-SRR
                # files are never expected alongside them.
                command = f'mv {abs_dir_path}/* {abs_all_result_path}/'
                os.system(command)
    if len(os.listdir(all_result_path)) == 0:
        text = f'{result_path}中没有数据!'
        print(add_color(text, 'red'))
        exit()
def main_work(task_file):
    """Process one task list (a distribution file of SRR accessions).

    Relies on module globals (task_list, data_path, temp_path, organism)
    that are presumably initialised by first_work(task_file) — confirm in
    the surrounding module.
    """
    first_work(task_file)
    # ==================== process every task in task_list ====================
    error = []
    for count, srr in enumerate(task_list, 1):
        srr_file = os.path.join(data_path, srr)
        file_size = round(os.path.getsize(srr_file) / 10**9, 2)  # bytes -> GB
        text = 'start to handle: %s -- %s size: %sG (%d/%d)' % (
            organism, srr, str(file_size), count, len(task_list))
        print(add_color(text, 'yellow'))
        # srr_handle returns (status, command); a truthy status means failure.
        foo, command = srr_handle(srr)
        if foo:
            text = 'error in %s -- %s\n command: %s' % (organism, srr, command)
            print(add_color(text, 'red'))
            error.append(srr)
        else:
            text = 'success in : %s -- %s' % (organism, srr)
            print(add_color(text, 'green'))
        # Delete intermediate files right away so they cannot interfere with
        # the next SRR (or fill the disk).
        if len(os.listdir(temp_path)) > 0:
            os.system('rm -r ' + os.path.join(temp_path, '*'))
    # ==================== per-task report ====================
    text = 'task: %s finished' % task_file
    print(add_color(text, 'green'))
    if error:
        text = '%s do not handle correctly' % str(error)
        print(add_color(text, 'red'))
    else:
        text = 'all srr have finished correctly'
        print(add_color(text, 'green'))
    # ========================================
    final_work()
def gse_marker_handle(gse_data,
                      gse_organism_dict,
                      cell_marker_dict,
                      odds_ratio_threshold=2,
                      p_value_threshold=0.01,
                      method='greater'):
    """Annotate each cluster of every GSE with candidate cell types using
    Fisher's exact test on marker-gene overlaps.

    Args:
        gse_data: directory containing one sub-directory per GSE.
        gse_organism_dict: {GSE: organism} mapping.
        cell_marker_dict: output of get_cell_marker_dict (includes 'all').
        odds_ratio_threshold: minimum observed/expected hit ratio to test.
        p_value_threshold: significance cutoff for reported cell types.
        method: alternative hypothesis passed to fisher_exact.

    Writes cells_type.csv into each GSE directory that yields at least one
    significant (cluster, cell type) pair.
    """
    assert method in {'two-sided', 'less', 'greater'}
    all_gse_data = os.listdir(gse_data)
    for count, gse in enumerate(all_gse_data, 1):
        print('========================================')
        gse_dir = os.path.join(gse_data, gse)
        marker_genes_file = os.path.join(gse_dir, 'marker_genes.csv')
        if os.path.isdir(gse_dir) and not os.path.isfile(marker_genes_file):
            # Directory exists but marker_genes.csv was never produced.
            text = f'不存在{marker_genes_file}'
            print(add_color(text, 'red'))
        else:
            if gse not in gse_organism_dict:
                text = f'GSE_info中没有{gse}的物种信息!'
                print(add_color(text, 'red'))
                continue
            organism = gse_organism_dict[gse].replace(' ', '_')
            if organism not in cell_marker_dict:
                text = f'{gse}: cell_marker中没有{organism}的marker基因信息!'
                print(add_color(text, 'red'))
                continue
            text = f'正在处理: {gse} {organism} ({count}/{len(all_gse_data)})'
            print(add_color(text, 'yellow'))
            with open(marker_genes_file, 'r', encoding='utf8') as f:
                marker_genes_data = pd.read_csv(f, sep=',')
            item_list = []
            all_marker = cell_marker_dict['all'][organism]  # all markers of this organism
            n_all_marker = len(all_marker)
            for cluster, data in marker_genes_data.groupby('cluster'):
                # This cluster's markers, restricted to known markers.
                cluster_marker = set(data['gene']) & all_marker
                n_cluster_marker = len(cluster_marker)
                if n_cluster_marker == 0:
                    continue
                # Fraction of all markers captured by this cluster.
                cluster_marker_prop = n_cluster_marker / n_all_marker
                for cell_type, cell_type_marker in cell_marker_dict[
                        organism].items():
                    n_cell_type_marker = len(cell_type_marker)
                    if n_cell_type_marker == 0:
                        # A cell type with no markers can never be hit;
                        # also avoids a zero division below.
                        continue
                    # Hits expected if the cluster sampled markers uniformly.
                    n_expected_hit = cluster_marker_prop * n_cell_type_marker
                    n_hit = len(cluster_marker & cell_type_marker)
                    odds_ratio = n_hit / n_expected_hit
                    if odds_ratio > odds_ratio_threshold:
                        # 2x2 contingency table over the organism's markers:
                        # rows = outside/inside this cell type,
                        # cols = outside/inside this cluster.
                        n_non_hit_cell_type_marker = n_cell_type_marker - n_hit
                        # Bug fix: this entry previously reused
                        # n_cell_type_marker; it must count the cluster's
                        # markers that missed the cell type.
                        n_non_hit_cluster_marker = n_cluster_marker - n_hit
                        n_other_marker = (n_all_marker - n_hit -
                                          n_non_hit_cell_type_marker -
                                          n_non_hit_cluster_marker)
                        table = [[n_other_marker, n_non_hit_cell_type_marker],
                                 [n_non_hit_cluster_marker, n_hit]]
                        p_value = stats.fisher_exact(table, method)[1]
                        if p_value < p_value_threshold:
                            item = [
                                cluster, n_all_marker, n_cluster_marker,
                                n_cell_type_marker, n_hit, n_expected_hit,
                                odds_ratio, p_value, organism, cell_type[0],
                                cell_type[1]
                            ]
                            item_list.append(item)
            if item_list:
                item_data = pd.DataFrame(item_list)
                item_data.columns = [
                    'cluster', 'n_all_marker', 'n_cluster_marker',
                    'n_cell_type_marker', 'n_hit', 'n_expected_hit',
                    'odds_ratio', 'p_value', 'organism', 'tissueType',
                    'cellName'
                ]
                item_data.sort_values(by=['cluster', 'p_value'], inplace=True)
                cells_type_file = os.path.join(gse_dir, 'cells_type.csv')
                with open(cells_type_file, 'w', encoding='utf8') as f:
                    item_data.to_csv(f, index=False)
                text = f'处理完毕: {gse}'
                print(add_color(text, 'green'))
            else:
                text = f'没有cluster可以标记cell_type: {gse}'
                print(add_color(text, 'yellow'))
    text = '所有GSE都处理完毕!'
    print(add_color(text, 'green'))
def check(target):
    """Probe one aspect of the host: OS, CPU, memory, disks, or network.

    Args:
        target: one of 'operator_system', 'cpu', 'memory', 'device',
            'network'.

    Returns:
        A dict of the measured values (empty for 'device' and for an
        unknown target), or None when probing raises on this platform.
    """
    try:
        print('====================')
        result = {}
        if target == 'operator_system':
            print('检查操作系统:')
            operator_system = platform.platform()
            text = add_color(operator_system, 'green')
            result['operator_system'] = operator_system
        elif target == 'cpu':
            print('检查CPU:')
            true_cpu = psutil.cpu_count(logical=False)
            logical_cpu = psutil.cpu_count(logical=True)
            text = '物理核数:%s 逻辑核数:%s' % (add_color(
                str(true_cpu), 'green'), add_color(str(logical_cpu), 'green'))
            result['true_cpu'] = true_cpu
            result['logical_cpu'] = logical_cpu
        elif target == 'memory':
            print('检查内存:')
            size = psutil.virtual_memory()
            free = round(size.free / 10**9, 3)  # bytes -> GB
            used = round(size.used / 10**9, 3)
            text = '内存 free: %s used: %s' % (add_color(
                str(free) + 'G', 'green'), add_color(str(used) + 'G', 'red'))
            result['used'] = used
            result['free'] = free
        elif target == 'device':
            print('检查硬盘:')
            print(add_color('在Linux下结果可能不准,在命令行中输入df -h查看硬盘', 'red'))
            all_devices = psutil.disk_partitions()
            text_list = []
            for device in all_devices:
                # NOTE(review): psutil.disk_usage is normally given the
                # mountpoint, not the device node — confirm device.device
                # is intended here (may explain the inaccuracy warning).
                size = psutil.disk_usage(device.device)
                free = add_color(
                    str(round(size.free / 10**9, 3)) + 'G', 'green')
                used = add_color(str(round(size.used / 10**9, 3)) + 'G', 'red')
                text_list.append('%s free: %s used: %s' %
                                 (device.device, free, used))
            text = '\n'.join(text_list)
        elif target == 'network':
            print('检查网络:')
            print(add_color('Linux下如果不能自行停止请按Ctrl+C', 'red'))
            url = 'www.baidu.com'
            # os.system returns non-zero when ping fails.
            connect = os.system('ping %s' % url)
            if connect:
                text = add_color('%s 连接失败' % url, 'red')
            else:
                text = add_color('%s 连接成功' % url, 'green')
            result['connect'] = connect
        else:
            text = "target must be in {operator_system, cpu, memory, device, network}"
        print(text)
        return result
    except Exception:
        text = '无法检查当前操作系统的%s' % target
        print(add_color(text, 'red'))
        return None
def split_gse(gse_data, gse_organism_dict, organism_genes_dict, coding_data,
              ncoding_data):
    """Split each GSE expression matrix into coding and non-coding matrices.

    Args:
        gse_data: directory with one sub-directory per GSE (matrix.csv inside).
        gse_organism_dict: {GSE: organism}.
        organism_genes_dict: {organism: {'coding': ..., 'ncoding': ...}} gene
            collections (defined elsewhere — the membership tests below
            assume fast `in`; verify these are sets for large gene lists).
        coding_data / ncoding_data: output roots, one sub-directory per GSE.
    """
    print('切分表达矩阵获得编码和非编码两个文件')
    make_dir(coding_data)
    make_dir(ncoding_data)
    all_gse_data = os.listdir(gse_data)
    for count, gse in enumerate(all_gse_data, 1):
        print('========================================')
        gse_dir = os.path.join(gse_data, gse)
        gse_file = os.path.join(gse_dir, 'matrix.csv')
        if os.path.isdir(gse_dir) and not os.path.isfile(gse_file):
            # Directory exists but the matrix was never produced.
            text = f'不存在{gse_file}'
            print(add_color(text, 'red'))
        else:
            if gse not in gse_organism_dict:
                text = f'GSE_info中没有{gse}的物种信息!'
                print(add_color(text, 'red'))
                continue
            organism = gse_organism_dict[gse].replace(' ', '_')
            if organism not in organism_genes_dict:
                text = f'{gse}: GENE_info中没有{organism}的基因信息!'
                print(add_color(text, 'red'))
                continue
            file_size = '%.3fM' % (os.path.getsize(gse_file) / (10**6))
            text = f'正在处理: {gse} {organism} {file_size} ({count}/{len(all_gse_data)})'
            print(add_color(text, 'yellow'))
            coding = organism_genes_dict[organism]['coding']
            ncoding = organism_genes_dict[organism]['ncoding']
            with open(gse_file) as f:
                matrix_data = pd.read_csv(f, index_col=0)
            # Partition the matrix rows by gene class.
            coding_genes = [
                gene for gene in matrix_data.index if gene in coding
            ]
            ncoding_genes = [
                gene for gene in matrix_data.index if gene in ncoding
            ]
            # Write the coding-gene matrix, if any.
            if coding_genes:
                print('找到%d个Coding genes' % len(coding_genes))
                coding_dir = os.path.join(coding_data, gse)
                coding_file = os.path.join(coding_dir, 'matrix.csv')
                make_dir(coding_dir)
                with open(coding_file, 'w') as f:
                    foo = matrix_data.loc[coding_genes, :]
                    foo.to_csv(f, sep=',')
            else:
                text = f'{gse_file}: 未发现Coding genes'
                print(add_color(text, 'yellow'))
            # Write the non-coding-gene matrix, if any.
            if ncoding_genes:
                print('找到%d个Non coding genes' % len(ncoding_genes))
                ncoding_dir = os.path.join(ncoding_data, gse)
                ncoding_file = os.path.join(ncoding_dir, 'matrix.csv')
                make_dir(ncoding_dir)
                with open(ncoding_file, 'w') as f:
                    foo = matrix_data.loc[ncoding_genes, :]
                    foo.to_csv(f, sep=',')
            else:
                text = f'{gse_file}: 未发现Non coding genes'
                print(add_color(text, 'yellow'))
            text = f'处理完毕: {gse}'
            print(add_color(text, 'green'))
def check_last(srr_data):
    """Interactively review the environment before launching the pipeline.

    Optionally checks OS, CPU count vs. expected workers, free memory vs.
    cached STAR indices, disks, and lists the largest unprocessed SRR files
    so the user can judge disk headroom. Purely advisory; may exit() at the
    user's request.
    """
    foo = input('是否需要检查环境:(y/n)')
    if foo != 'y':
        return
    # ---- operating system ----
    result = check('operator_system')
    if result is not None:
        if 'Windows' in result['operator_system']:
            print(add_color('Windows下将不能运行此程序生成的脚本,请切换的Linux环境下!', 'red'))
            foo = input('继续?(y/n)')
            if foo != 'y':
                exit()
    # ---- CPU ----
    result = check('cpu')
    if result is not None:
        n_work = len(os.listdir(os.path.join(srr_data, 'distribution')))
        if TIANHE:
            print('在天河二号上运行,预计需要%d个节点' % n_work)
            print('如果大于60节点,要用BIOJOB分区的节点进行计算')
        else:
            expect_cpu = CPU_PER_WORKER * n_work
            print('预计需要物理核数:%s' % str(expect_cpu))
            if result['true_cpu'] < expect_cpu:
                print('如果CPU核数不够任务将来回切换,开销很大')
                print('可修改:(configure/CPU_PER_WORKER)')
    input('\nproceed')
    # ---- memory ----
    result = check('memory')
    if result is not None:
        cache_load_organism = {i.replace(' ', '_') for i in CACHE_LOAD}
        if cache_load_organism:
            print('%s基因组STAR_Index保留在内存以加速处理' % str(cache_load_organism))
            print('每个哺乳动物的Index大约30G')
            if result['free'] < len(cache_load_organism) * 30 + 30:
                print('如果内存不够将使用虚拟内存,速度会较慢')
                print('可修改:(configure/CACHE_LOAD)')
    input('\nproceed')
    # ---- disks ----
    check('device')
    print('检查最大的几个SRR文件')
    data_path = os.path.join(srr_data, 'data')
    result_path = os.path.join(srr_data, 'result')
    data_size = {}
    # SRRs already processed (present anywhere under result/) are excluded.
    finished_srr = {
        srr + '.sra'
        for walk in os.walk(result_path) for srr in walk[2]
        if srr.startswith('SRR')
    }
    all_srr = list(
        {srr for srr in os.listdir(data_path) if srr.endswith('.sra')} -
        finished_srr)
    for srr in all_srr:
        data_size[srr] = os.path.getsize(os.path.join(data_path, srr)) / (10**
                                                                          9)
    # Show the ten largest pending files in data/.
    maxsize_data = sorted(data_size.keys(),
                          key=lambda x: data_size[x],
                          reverse=True)[:10]
    print('======================')
    for srr in maxsize_data:
        print('%s : %.2fG' % (srr, data_size[srr]))
    print('======================')
    print('临时文件fastq:srr转换为fastq格式后大小扩大约4倍')
    print('临时文件sam:fastq转换为sam格式后大小扩大1至3倍不等')
    print('最终文件feature:典型的人类的一个样本大概29M,小鼠为19M,其它物种较小')
    print('多个任务同时处理时将会有多个临时文件,请确保硬盘空间足够')
    input('\nproceed')
def distribute_srr(srr_dict, srr_data, print_detail=PRINT_DETAIL):
    """Assign downloaded SRR files to workers, one distribution file each.

    Moves *.sra from srr_data into srr_data/data, skips SRRs already present
    under result/, groups the rest by organism (from srr_dict) and writes
    batches of at most TASK_PER_WORKER accessions to files named
    <organism>_<n> under srr_data/distribution.
    """
    data_path = os.path.join(srr_data, 'data')
    result_path = os.path.join(srr_data, 'result')
    # Only files with the .sra suffix are handled.
    all_srr = [srr for srr in os.listdir(srr_data) if srr.endswith('.sra')]
    # Move the downloads into data/.
    print('正在移动文件')
    for count, srr in enumerate(all_srr, 1):
        from_path = os.path.join(srr_data, srr)
        to_path = os.path.join(data_path, srr)
        shutil.move(from_path, to_path)
        if print_detail and count % 300 == 0:
            print('处理进度:%d/%d' % (count, len(all_srr)))
    # SRRs already processed (present under result/) are excluded via a
    # set difference.
    finished_srr = {
        srr + '.sra'
        for walk in os.walk(result_path) for srr in walk[2]
        if srr.startswith('SRR')
    }
    all_srr = list(
        {srr for srr in os.listdir(data_path) if srr.endswith('.sra')} -
        finished_srr)
    print('正在分配%d个SRR文件' % len(all_srr))
    file_name_count = {}  # per-organism counter for distribution file names
    file_list = {}        # per-organism batch currently being filled
    error = []            # SRRs with no organism entry in srr_dict
    for count, srr in enumerate(all_srr, 1):
        if print_detail and count % 300 == 0:
            print('处理进度:%d/%d' % (count, len(all_srr)))
        srr_name = srr.split('.')[0]
        try:
            # Organism from SRR_info; spaces become underscores for file
            # names.
            organism = srr_dict[srr_name].replace(' ', '_')
        except KeyError:
            # Unknown organism: this SRR gets no further processing.
            error.append(srr_name)
            continue
        # Each worker receives tasks from a single organism only.
        if organism not in file_name_count:
            file_name_count[organism] = 1
            file_list[organism] = []
        if len(file_list[organism]) >= TASK_PER_WORKER:
            # Batch full: flush it to a distribution file and start the next.
            file_name = organism + '_' + str(file_name_count[organism])
            file_name = os.path.join(srr_data, 'distribution', file_name)
            with open(file_name, 'w') as f:
                f.write('\n'.join(file_list[organism]))
            file_name_count[organism] += 1  # next file index
            file_list[organism] = []        # reset the batch
        file_list[organism].append(srr)
    # Flush the final, possibly short, batch of every organism.
    for organism, srr_list in file_list.items():
        if srr_list:
            file_name = organism + '_' + str(file_name_count[organism])
            file_name = os.path.join(srr_data, 'distribution', file_name)
            with open(file_name, 'w') as f:
                f.write('\n'.join(srr_list))
    if error:
        text = '以下SRR文件未找到所属物种,请将信息添加到SRR_info\n%s' % str(error)
        print(add_color(text, 'red'))
    text = '分配任务完毕,在%s' % os.path.abspath(
        os.path.join(srr_data, 'distribution'))
    print(add_color(text, 'green'))
def integrate_srr(gse_dict, srr_data, gse_data, print_detail=PRINT_DETAIL):
    """Integrate per-SRR featureCounts outputs into per-GSE expression matrices.

    For each GSE whose fraction of processed SRRs reaches THRESHOLD, reads
    the gene list from one result file, fans the per-SRR reads out over
    N_INTEGRATION processes, optionally converts counts to RPKM/TPM
    (per VALUE), and writes <gse_data>/<GSE>/matrix.csv.
    """
    result_path = os.path.join(srr_data, 'result')
    all_result_path = os.path.join(result_path, 'all_result')
    all_result_srr = set(os.listdir(all_result_path))
    for gse_count, (gse, srr_list) in enumerate(gse_dict.items(), 1):
        print(f'===================={gse_count}/{len(gse_dict)}====================')
        finished_srr = [srr for srr in srr_list if srr in all_result_srr]
        # ---- per-GSE completion report ----
        completion = f'({len(finished_srr)}/{len(srr_list)})'
        if len(finished_srr) == 0:
            text = f'{gse}中的SRR完全没有被处理{completion}!'
            print(add_color(text, 'red'))
        else:
            # Report missing SRRs, if any.
            error_srr = set(srr_list) - set(finished_srr)
            if len(error_srr) == 0:
                text = f'{gse}中的SRR处理完全{completion}!'
                print(add_color(text, 'green'))
            else:
                text = f'{gse}中的SRR缺失{completion}!'
                print(add_color(text, 'yellow'))
                if print_detail:
                    print(add_color(error_srr, 'yellow'))
            # ---- integrate only GSEs whose completion reaches THRESHOLD ----
            if len(finished_srr) / len(srr_list) < THRESHOLD:
                text = f'{gse}中SRR缺失过多,不予整合!'
                print(add_color(text, 'yellow'))
            else:
                gse_dir = os.path.join(gse_data, gse)
                make_dir(gse_dir)
                matrix_file = os.path.join(gse_dir, 'matrix.csv')
                # If a matrix.csv already exists and its sample columns match
                # the finished SRRs, skip re-integration.
                if os.path.exists(matrix_file):
                    with open(matrix_file, 'r', encoding='utf8') as f:
                        matrix_srr = [srr for srr in
                                      f.readline().strip().split(',') if srr]
                    if set(matrix_srr) == set(finished_srr):
                        print('已找到整合好的matrix.csv文件!')
                        continue
                # Read gene names/lengths once from an arbitrary result file;
                # featureCounts emits genes in the same order and count for
                # every SRR (verified by the original author).
                with open(os.path.join(all_result_path, finished_srr[0]),
                          'r',
                          encoding='utf8') as f:
                    f.readline()  # skip the two header lines
                    f.readline()
                    genes_length = []
                    genes_list = []
                    for line in f:
                        genes_length.append(int(line.strip().split('\t')[-2]))
                        genes_list.append(line.strip().split('\t')[0])
                # Fan the SRR reads out over a process pool.
                print('开启%d个进程整合SRR数据!' % N_INTEGRATION)
                # NOTE(review): this distribute_srr takes (list, n_worker=..)
                # and returns a pair — a different helper from the same-named
                # task distributor; confirm the right one is in scope.
                n_worker, srr_per_worker = distribute_srr(
                    finished_srr, n_worker=N_INTEGRATION)
                pool = Pool(processes=n_worker)
                new_integration_worker = partial(
                    integration_worker, all_result_path=all_result_path)
                result = pool.map(new_integration_worker, srr_per_worker)
                # Merge the per-process dicts into one {srr: counts} mapping.
                gse_data_dict = {key: every_dict[key]
                                 for every_dict in result
                                 for key in every_dict}
                # Build the expression matrix: rows = genes, cols = SRRs.
                gse_matrix = pd.DataFrame(gse_data_dict, index=genes_list)
                # Optional normalisation of the raw counts.
                if VALUE == 'RPKM':
                    cells_numi = gse_matrix.sum(axis=0)
                    gse_matrix = gse_matrix.div(cells_numi, axis=1).div(
                        genes_length, axis=0) * 10**9
                elif VALUE == 'TPM':
                    foo = gse_matrix.div(genes_length, axis=0) * 1000
                    foo_numi = foo.sum(axis=0)
                    gse_matrix = foo.div(foo_numi) * 10**6
                print('整合完毕,保存数据中!')
                with open(matrix_file, 'w', encoding='utf8') as f:
                    # Three decimals suffice for TPM/RPKM magnitudes.
                    gse_matrix.to_csv(f,
                                      sep=',',
                                      header=True,
                                      index=True,
                                      float_format='%.3f')
                text = '保存成功:%s' % os.path.abspath(matrix_file)
                print(add_color(text, 'green'))