] d.pop(0) # remove the header data = set(d) print "%d known genes found (using gene symbols)" % (len(data), ) # read the gene symbols file_symbols = os.path.join(options.output_directory, 'synonyms.txt') loci = symbols.generate_loci(file_symbols) genes = symbols.read_genes_symbols(file_symbols) d = [] for g in data: ens = symbols.ensembl(g.upper(), genes, loci) if ens: d.extend(ens) data = [line + '\n' for line in d] data = sorted(set(data)) print "%d known genes found (after conversion to Ensembl ids)" % ( len(data), ) file(os.path.join(options.output_directory, 'oncogenes_more.txt'), 'w').writelines(data) if os.path.exists(tmp_file): os.remove(tmp_file)
] file(os.path.join(options.output_directory, 'version.txt'), 'a').writelines(txt) # # read the gene symbols file_symbols = os.path.join(options.output_directory, 'synonyms.txt') loci = symbols.generate_loci(file_symbols) genes = symbols.read_genes_symbols(file_symbols) d = [] for (g1, g2) in data: if g1.upper() != g2.upper(): ens1 = symbols.ensembl(g1.upper(), genes, loci) ens2 = symbols.ensembl(g2.upper(), genes, loci) if ens1 and ens2: for e1 in ens1: for e2 in ens2: if e1 != e2: d.append([e1, e2]) data = ['\t'.join(sorted(line)) + '\n' for line in d] data = sorted(set(data)) print "%d known fusion genes found" % (len(data), ) if not options.skip_filter_overlap: ens2hugo = dict([ tuple(line.rstrip('\r\n').split('\t')) for line in file(
# read the gene symbols file_symbols = os.path.join(options.output_directory, 'synonyms.txt') genes = symbols.read_genes_symbols(file_symbols) banned = set() loci = symbols.generate_loci(file_symbols) #for v in symbols.locus.values(): for v in loci.values(): if v: n = len(v) if n > 1: for i in xrange(n - 1): for j in xrange(i + 1, n): if v[i].upper() != v[j].upper(): ens1 = symbols.ensembl( v[i].upper(), genes, loci) ens2 = symbols.ensembl( v[j].upper(), genes, loci) if ens1 and ens2: for e1 in ens1: for e2 in ens2: if e1 != e2: (e1, e2) = ( e2, e1 ) if e2 < e1 else (e1, e2) banned.add((e1, e2)) d = [] for (g1, g2) in fusions: if (g1.upper() == g2.upper()
# save the database version (appended to 'version.txt'); 'with' closes and
# flushes the file deterministically instead of relying on garbage collection
txt = ['Non-cancer tissues and cells (Babiceanu et al. Nucl. Acids Res. 2016) database version: %s\n' % (today.strftime("%Y-%m-%d"),)]
with open(os.path.join(options.output_directory, 'version.txt'), 'a') as version_file:
    version_file.writelines(txt)

#
# read the gene symbols
file_symbols = os.path.join(options.output_directory, 'synonyms.txt')
loci = symbols.generate_loci(file_symbols)
genes = symbols.read_genes_symbols(file_symbols)

# Translate every (symbol, symbol) pair into all (Ensembl id, Ensembl id)
# combinations; pairs whose two symbols are equal (case-insensitively) or
# where either symbol cannot be resolved are skipped.
d = []
for (g1, g2) in data:
    if g1.upper() == g2.upper():
        continue
    ens1 = symbols.ensembl(g1.upper(), genes, loci)
    ens2 = symbols.ensembl(g2.upper(), genes, loci)
    if ens1 and ens2:
        for e1 in ens1:
            for e2 in ens2:
                if e1 != e2:
                    d.append([e1, e2])

# one tab-separated line per pair, ids ordered inside the pair so that
# A-B and B-A collapse into a single entry
data = sorted(set('\t'.join(sorted(line)) + '\n' for line in d))
print("%d known fusion genes found" % (len(data),))

if not options.skip_filter_overlap:
    # Map first column -> second column of 'genes_symbols.txt' (presumably
    # Ensembl id -> gene symbol, judging by the variable name -- confirm).
    # Every non-empty line must hold exactly two tab-separated fields,
    # otherwise dict() raises ValueError.
    with open(os.path.join(options.output_directory, 'genes_symbols.txt'), 'r') as symbols_file:
        ens2hugo = dict([tuple(line.rstrip('\r\n').split('\t'))
                         for line in symbols_file.readlines()
                         if line.rstrip('\r\n')])
# read the gene symbols
file_symbols = os.path.join(options.output_directory, 'synonyms.txt')
genes = symbols.read_genes_symbols(file_symbols)

# Build the set of "banned" Ensembl id pairs: for every locus that lists two
# or more symbols, every combination of ids resolved from two different
# symbols of that locus is banned. Pairs are stored as (smaller, larger).
banned = set()
loci = symbols.generate_loci(file_symbols)
for v in loci.values():
    if not v:
        continue
    n = len(v)
    for i in xrange(n - 1):
        for j in xrange(i + 1, n):
            if v[i].upper() == v[j].upper():
                continue
            ens1 = symbols.ensembl(v[i].upper(), genes, loci)
            ens2 = symbols.ensembl(v[j].upper(), genes, loci)
            if not (ens1 and ens2):
                continue
            for e1 in ens1:
                for e2 in ens2:
                    if e1 != e2:
                        # Build the ordered pair without rebinding e1/e2:
                        # reassigning e1 here would leak the swapped value
                        # into the remaining iterations over ens2 and pair
                        # the wrong ids.
                        banned.add((e2, e1) if e2 < e1 else (e1, e2))

d = []
for (g1, g2) in fusions:
    # Skip pairs made of the same symbol, and pairs where both symbols end
    # in '@' and share their first two characters (case-insensitively).
    if (g1.upper() == g2.upper() or
            ((g1.endswith('@') and g2.endswith('@')) and
             g1.upper()[:2] == g2.upper()[:2])):
        print("%s-%s skipped!" % (g1, g2))
        continue
    ens1 = symbols.ensembl(g1, genes, loci)
] data = enlarge.get(options.organism.lower(),[]) if data: #file_symbols = os.path.join(options.output_directory,'genes_symbols.txt') file_symbols = os.path.join(options.output_directory,'synonyms.txt') loci = symbols.generate_loci(file_symbols) genes = symbols.read_genes_symbols(file_symbols) d = [] for g in data: ens = symbols.ensembl(g.upper(),genes,loci) if ens: for e in ens: d.append(e) else: print " - Original:",g data = [line + '\n' for line in d] data = sorted(set(data)) print "%d genes to be enlarged and covered" % (len(data),) file(os.path.join(options.output_directory,'enlarge.txt'),'w').writelines(data) #
# Turn the user-supplied gene pairs in 'mygenes' into sorted, unique,
# tab-separated lines of Ensembl id pairs ('data' stays empty when there are
# no pairs).
data = []
if mygenes:
    # both symbol tables are expected next to the output file
    file_symbols1 = os.path.join(os.path.dirname(options.output), 'genes_symbols.txt')
    file_symbols2 = os.path.join(os.path.dirname(options.output), 'synonyms.txt')
    loci1 = symbols.generate_loci(file_symbols1)
    loci2 = symbols.generate_loci(file_symbols2)
    genes1 = symbols.read_genes_symbols(file_symbols1)
    genes2 = symbols.read_genes_symbols(file_symbols2)

    d = []
    for (g1, g2) in mygenes:
        # ignore incomplete pairs and pairs made of the same symbol
        if not (g1 and g2):
            continue
        if g1.upper() == g2.upper():
            continue

        # first try genes_symbols.txt ...
        ens1 = symbols.ensembl(g1.upper(), genes1, loci1)
        ens2 = symbols.ensembl(g2.upper(), genes1, loci1)
        # ... and fall back to synonyms.txt for whatever was not found
        if not ens1:
            ens1 = symbols.ensembl(g1.upper(), genes2, loci2)
        if not ens2:
            ens2 = symbols.ensembl(g2.upper(), genes2, loci2)

        if not (ens1 and ens2):
            continue
        # every combination of two distinct, non-empty ids
        d.extend([e1, e2] for e1 in ens1 for e2 in ens2
                 if e1 and e2 and e1 != e2)

    # ids are ordered inside each pair so A-B and B-A collapse to one line
    data = sorted(set('\t'.join(sorted(line)) + '\n' for line in d))
# Resolve the gene pairs in 'mygenes' to Ensembl id pairs and keep them as
# sorted, unique, tab-separated lines in 'data'.
# Both symbol tables are expected next to the output file.
file_symbols1 = os.path.join(os.path.dirname(options.output), 'genes_symbols.txt')
file_symbols2 = os.path.join(os.path.dirname(options.output), 'synonyms.txt')
loci1 = symbols.generate_loci(file_symbols1)
loci2 = symbols.generate_loci(file_symbols2)
genes1 = symbols.read_genes_symbols(file_symbols1)
genes2 = symbols.read_genes_symbols(file_symbols2)

d = []
for (g1, g2) in mygenes:
    # skip incomplete pairs and pairs naming the same symbol twice
    if not g1 or not g2 or g1.upper() == g2.upper():
        continue

    # primary lookup in genes_symbols.txt
    ens1 = symbols.ensembl(g1.upper(), genes1, loci1)
    ens2 = symbols.ensembl(g2.upper(), genes1, loci1)
    # synonyms.txt is consulted only for a symbol the primary lookup missed
    if not ens1:
        ens1 = symbols.ensembl(g1.upper(), genes2, loci2)
    if not ens2:
        ens2 = symbols.ensembl(g2.upper(), genes2, loci2)

    if ens1 and ens2:
        # all combinations of two distinct, non-empty ids
        d.extend([e1, e2] for e1 in ens1 for e2 in ens2
                 if e1 and e2 and e1 != e2)

# order the ids inside each pair, drop duplicates, then sort the lines
data = set('\t'.join(sorted(line)) + '\n' for line in d)
data = sorted(data)