def _process_pathway(cls, download_file, stage_output_file, section, source, is_public, config=None):
    '''Convert a pathway gene-set file (e.g. kegg, reactome, go) to JSON and load it.

    Input file format (tab separated, one pathway per line):
        pathway name, pathway url, then any number of entrez ids, e.g.

        REACTOME_RNA_POL_I_TRANSCRIPTION_TERMINATION
        http://www.broadinstitute.org/gsea/msigdb/cards/REACTOME_RNA_POL_I_TRANSCRIPTION_TERMINATION
        1022 2068 2071 25885 284119 2965 2966 2967 2968 4331

    The entrez ids are converted to ensembl ids through
    Gene._entrez_ensembl_lookup; ids missing from the returned lookup are
    dropped from the pathway's gene set.

    The JSON document ({"docs": [...]}) is written next to the stage output
    file, using the stage output path with ".out" replaced by ".json".
    Blank input lines are ignored.

    Args:
        download_file: path of the tab separated pathway file to read.
        stage_output_file: stage output path the JSON file name is derived from.
        section: configuration section passed to the lookup and the mapping loader.
        source: value stored under "source" on every pathway document.
        is_public: value stored under "is_public" on every pathway document.
        config: optional configuration passed to the entrez/ensembl lookup.
    '''
    json_target_file_path = stage_output_file.replace(".out", ".json")

    # Parse the download once instead of re-opening it for counting, for
    # collecting ids and for writing.
    with open(download_file, encoding='utf-8') as csvfile:
        all_rows = list(csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE))
    logger.debug('Number of lines in the file %s', len(all_rows))

    # csv.reader yields [] for blank lines; indexing those would raise.
    rows = [row for row in all_rows if row]

    # Resolve every entrez id in a single lookup call.
    gene_sets = []
    for row in rows:
        gene_sets.extend(row[2:])
    ens_look_up = Gene._entrez_ensembl_lookup(gene_sets, section, config)

    count = 0
    # The context manager guarantees the JSON is flushed and closed before
    # the mappings are loaded below, and also when a row fails to parse.
    with open(json_target_file_path, mode='w', encoding='utf-8') as json_target_file:
        json_target_file.write('{"docs":[\n')
        for row in rows:
            path_object = {
                "pathway_name": row[0],
                "pathway_url": row[1],
                "gene_sets": [ens_look_up[entrez] for entrez in row[2:]
                              if entrez in ens_look_up],
                "source": source,
                "is_public": is_public,
            }
            # Separator goes before every document except the first, so the
            # output is valid JSON regardless of how many rows were written.
            if count:
                json_target_file.write(',\n')
            json_target_file.write(json.dumps(path_object))
            count += 1
        if count:
            json_target_file.write('\n')
        json_target_file.write('\n]}')

    logger.debug("No. genes to load %s", count)
    logger.debug("Json written to %s", json_target_file_path)
    logger.debug("Load mappings")
    status = cls._load_pathway_mappings(section)
    print(status)
def get_ensemb_ids(self, entrez_list):
    '''Look up ensembl ids for the given entrez ids.

    Uses a fixed gene history section (index 'genes_hg38_v0.0.2', index type
    'gene_history') and returns whatever Gene._entrez_ensembl_lookup returns
    for entrez_list.
    '''
    section = {
        'index': 'genes_hg38_v0.0.2',
        'index_type': 'gene_history',
    }
    # The same section object is also exposed to the lookup via the config.
    config = {'GENE_HISTORY': section}
    return Gene._entrez_ensembl_lookup(entrez_list, section, config)
def _process_bioplex(cls, download_file, stage_output_file, section, config):
    '''Convert a bioplex interaction file to ensembl id pairs and process them.

    The interactors are read from the GeneA and GeneB columns, converted to
    ensembl ids through Gene._entrez_ensembl_lookup and written to the
    temporary stage output file, which is then handed to
    cls._process_interaction_out_file.

    Input file format (tab separated, with header):
        GeneA GeneB UniprotA UniprotB SymbolA SymbolB pW pNI pInt
        100 728378 P00813 A5A3E0 ADA POTEF 2.38086E-09 0.000331856 0.999668142
        100 345651 P00813 Q562R1 ADA ACTBL2 9.79E-18 0.211914437 0.788085563

    Output file format (tab separated, with header):
        interactorA interactorB
        ENSG00000196839 ENSG00000196604
        ENSG00000196839 ENSG00000169067

    A pair is written only when both ids are present in the lookup;
    otherwise both ids of the pair are recorded as unmapped and logged.

    Args:
        download_file: path of the tab separated bioplex file to read.
        stage_output_file: path of the interaction file to write.
        section: configuration section passed to the lookup and to the
            interaction file processing.
        config: configuration passed to the entrez/ensembl lookup.
    '''
    # Parse the download once; the pairs feed both the lookup and the output.
    with open(download_file, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        pairs = [(row['GeneA'], row['GeneB']) for row in reader]

    gene_sets = [gene for pair in pairs for gene in pair]
    ens_look_up = Gene._entrez_ensembl_lookup(gene_sets, section, config)

    mapped_counter = 0
    unmapped_ids = []
    # The context manager closes the stage file even if a row fails, and
    # before the file is processed further below.
    with open(stage_output_file, 'w') as stage_output_file_handler:
        stage_output_file_handler.write('interactorA' + '\t' + 'interactorB\n')
        for interactor_a, interactor_b in pairs:
            if interactor_a in ens_look_up and interactor_b in ens_look_up:
                stage_output_file_handler.write(
                    ens_look_up[interactor_a] + '\t' + ens_look_up[interactor_b] + '\n')
                mapped_counter += 1
            else:
                # Both ids are recorded even when only one of them is unmapped.
                unmapped_ids.append(interactor_a)
                unmapped_ids.append(interactor_b)

    logger.debug("\n".join(unmapped_ids))
    # Note: "Mapped" counts pairs while "Unmapped" counts individual ids.
    logger.debug("Mapped {} Unmapped {} " . format(mapped_counter, len(unmapped_ids)))
    cls._process_interaction_out_file(stage_output_file, section, False)