def msgf2seq_file(filepath, fasta_file, msb_psms):
    """Convert an MSGF '_best'-format results file to sequest format,
    keeping only PSMs present in msb_psms.

    filepath: tab-separated MSGF input; its extension selects the search
        type via the module-level `searches` mapping.
    fasta_file: fasta file used to build the protein<->gene id mappings.
    msb_psms: set of 'spectid_peptidesequence' keys to keep.
    Returns the path of the file written.
    """
    def parse_spec_pep_row(r):
        # Build the spec_pep key from the _best file format: the first two
        # dot-separated fields of column 0 plus the peptide in column 4.
        return '_'.join(r[0].split('.')[:2] + [r[4]])
    usedir, fin = os.path.split(filepath)
    # The sample filename comes from the first item of the third line.
    base = next(it.islice(ut.load_tab_file(filepath), 2, 3))[0].split('.')[0]
    in_gen = ut.load_tab_file(filepath)
    # builtin next() instead of gen.next(): works on python 2.6+ and 3.
    next(in_gen); next(in_gen)  # skip the 2 header lines
    p2g = seqs.prots2genes(fasta_file)
    g2p = ut.dict_inverse(p2g)
    fout = os.path.join(usedir,
            '.'.join([base, fin.split('.')[-1], 'sequestformat']))
    search = searches[filepath.split('.')[-1]]
    # single-argument parenthesized prints keep this runnable on py2 and py3
    print("Converting/filtering; Search: %s" % search)
    output = (msgfbest2sequest_line(r, p2g, g2p, search) for r in in_gen
              if parse_spec_pep_row(r) in msb_psms)
    print("Writing %s" % fout)
    ut.write_tab_file(output, fout)
    return fout
def process_raw_wan( f_source, f_dest=None, first_col_element=1,
        first_data_col=1, end_description_col=True, first_data_row=1 ):
    """Clean one of cuihong's raw tab-separated files and write a *_proc copy.

    Always keeps the first column as the variable name; processes that
    column by splitting on '|' and keeping first_col_element. For the data,
    keeps columns [first_data_col:] (with the trailing description column
    moved next to the name column when end_description_col is True).

    NOTE(review): an identical definition of process_raw_wan appears again
    later in this file and shadows this one.
    """
    rows = []
    for raw in open(f_source):
        stripped = raw.strip()
        if stripped != "":
            rows.append(stripped.split("\t"))
    # Column manipulation first.
    if end_description_col:
        rows = [[r[0], r[-1]] + r[first_data_col:-1] for r in rows]
    else:
        rows = [[r[0]] + r[first_data_col:] for r in rows]
    # Variable-name manipulation: split the id in all but the header row,
    # skipping anything between the header and first_data_row.
    if first_col_element is not None:
        fixed = [rows[0]]
        for r in rows[first_data_row:]:
            fixed.append([r[0].split("|")[first_col_element]] + r[1:])
        rows = fixed
    # Derive the output name from the source when not given.
    if f_dest is None:
        root, ext = os.path.splitext(f_source)
        f_dest = root + "_proc" + ext
    ut.write_tab_file(rows, f_dest)
def process_raw_wan(f_source, f_dest=None, first_col_element=1,
        first_data_col=1, end_description_col=True, first_data_row=1):
    """Clean one of cuihong's raw tab files, handling the variations seen
    in them, and write the result to f_dest (default: source + '_proc').

    Column 0 is always kept as the variable name and is split on '|'
    keeping first_col_element (None to skip); data columns are
    [first_data_col:], with the trailing description column moved next to
    the name column when end_description_col is True.
    """
    data = [ln.strip().split('\t') for ln in open(f_source)
            if ln.strip() != '']
    # Column manipulation first.
    if end_description_col:
        # move the trailing description column next to the name column
        data = [[row[0], row[-1]] + row[first_data_col:-1] for row in data]
    else:
        data = [[row[0]] + row[first_data_col:] for row in data]
    # Gene-name manipulation in every data row; the header (and anything
    # before first_data_row) is left untouched.
    if first_col_element is not None:
        header, body = data[0], data[first_data_row:]
        data = [header] + [[row[0].split('|')[first_col_element]] + row[1:]
                for row in body]
    if f_dest is None:
        root, ext = os.path.splitext(f_source)
        f_dest = root + '_proc' + ext
    ut.write_tab_file(data, f_dest)
def elut_p2g(fname, p2g, suffix='_fix'):
    """Rewrite an elution file, mapping the protein id in column 0 to its
    gene id via p2g; lines starting with '#' pass through unchanged.
    Writes the result to fname+suffix.
    """
    def convert(rows):
        for row in rows:
            if row[0][0] == '#':
                yield row
            else:
                yield [p2g[row[0]]] + list(row[1:])
    ut.write_tab_file(convert(ut.load_tab_file(fname)), fname + suffix)
def cuihong_fasta_to_clean(fname, outname):
    """Drop the reverse/shuffled entries ('rm' instead of 'sp') and
    anything else whose header doesn't start with '>sp' or '>tr'; keep
    only the uniprot identifier in the headers that remain.
    """
    entries = _load_prots_to_lol(fname)
    kept = []
    for entry in entries:
        if entry[0][:3] in ('>sp', '>tr'):
            kept.append(['>' + entry[0].split('|')[1]] + entry[1:])
    # flatten to one item per output line
    flat = [item for entry in kept for item in entry]
    ut.write_tab_file(flat, outname, islist=True)
def multi_identities(input_fname, out_dir):
    """For each config row (desc, prots_fname, source_fasta, odict, target)
    in input_fname, compute all pairwise identities for the listed proteins
    and write one output file per row into out_dir.
    """
    for desc, prots_fname, source_fasta, odict, target in \
            ut.load_lol(input_fname):
        # single %-formatted argument keeps this print py2/py3 compatible
        print("%s, proteins: %s\n source: %s\n odict: %s\ntarget: %s"
                % (desc, prots_fname, source_fasta, odict, target))
        prots = ut.load_list(prots_fname)
        sims = all_identities(prots, odict, source_fasta, target)
        out_fname = os.path.join(out_dir,
                ut.shortname(target).split('.')[0] + "_" + desc + ".txt")
        ut.write_tab_file(sims, out_fname, islist=True)
def mq2elut(fname, quant='iBAQ'):
    """Extract the per-sample quant columns from a MaxQuant table and write
    a spcount-style elution file (fname + '_mq_<quant>').

    Matches eg 'iBAQ WAN...' headers but not 'iBAQ L WAN...'.
    """
    rows = list(ut.load_tab_file(fname))
    header = rows[0]
    pattern = '^%s\s\w{2}' % quant
    keep = [i for i, col in enumerate(header) if re.match(pattern, col)]
    # For now just use the "majority protein" (column 1): first id before
    # any space, with its leading character dropped.
    prots = [row[1].split()[0][1:] for row in rows[1:]]
    out = [[header[0]] + [header[i] for i in keep]]
    for prot, row in zip(prots, rows[1:]):
        out.append([prot] + [row[i] for i in keep])
    ut.write_tab_file(out, ut.pre_ext(fname, '_mq_%s' % quant))
def orth_pid2geneid(fname, p2g):
    """Rewrite an orthology file, converting protein ids to gene ids in the
    space-separated id/score lists of columns 2 and 3; writes fname+'_fix'.

    NOTE(review): the row with index n==1 is passed through untouched --
    presumably a header row in this format; confirm against the inputs.
    """
    def convert(rows):
        def map_ids(tokens):
            # tokens alternate protein-id, score: map only the even slots
            return ' '.join(p2g[tok] if i % 2 == 0 else tok
                    for i, tok in enumerate(tokens))
        for n, row in enumerate(rows):
            if n == 1:
                yield row
            else:
                out = list(row[:2])
                out.append(map_ids(row[2].split()))
                out.append(map_ids(row[3].split()))
                yield out
    ut.write_tab_file(convert(ut.load_tab_file(fname)), fname + '_fix')
def write_elution(elut, fname, forR=False):
    """Write an elution to fname in the spcount format:
    #ProtID, TotalCount, then one column per fraction.
    With forR=True, transpose and omit the first-column header (R-friendly).

    NOTE(review): a second, identical definition of write_elution appears
    later in this file and shadows this one.
    """
    # Drop proteins whose rows are all zero.
    keep = np.sum(np.array(elut.mat), axis=1) > 0
    mat = np.array(elut.mat[keep, :])
    prot_ids = list(np.array(elut.prots)[keep])
    if forR:
        # R layout: fractions as rows, proteins as the header
        header = prot_ids
        body = [[elut.fractions[j]] + mat[:, j].tolist()
                for j in range(len(elut.fractions))]
    else:
        header = "#ProtID TotalCount".split() + elut.fractions
        body = [[prot_ids[i], np.sum(mat[i, :])] + mat[i, :].tolist()
                for i in range(len(prot_ids))]
    ut.write_tab_file([header] + body, fname)
def export_idconvert(ppis, dict_cxlabels, fname):
    """Write a node-id conversion table for the given ppis: one row per
    interaction endpoint as [nodeid, ENSGID, complexid, ComplexLabel].
    Each complex's label is emitted only the first time that complex id is
    seen; later rows get an empty label. Output goes to fname with the
    'pfx_convert' pre-extension.
    """
    seen = set()
    rows = [['nodeid', 'ENSGID', 'complexid', 'ComplexLabel']]
    for ppi in ppis:
        for side in (0, 1):
            combid = ppi[side]
            parts = combid.split('_')
            cxid = parts[0]
            # rejoin in case '_' appears in the id itself, eg for Sp
            pid = '_'.join(parts[1:])
            if cxid in seen:
                label = ''
            else:
                label = dict_cxlabels[cxid]
                seen.add(cxid)
            rows.append([combid, pid, cxid, label])
    ut.write_tab_file(rows, ut.pre_ext(fname, 'pfx_convert'))
def write_elution(elut, fname, forR=False):
    """Write an elution matrix to fname in spcount format
    (#ProtID, TotalCount, one column per fraction). forR=True writes the
    transposed, R-friendly layout with no header on the first column.

    NOTE(review): duplicates an earlier write_elution in this file; this
    later definition is the one that takes effect.
    """
    # First eliminate empty protein rows.
    nonzero = np.sum(np.array(elut.mat), axis=1) > 0
    arr = np.array(elut.mat[nonzero, :])
    prots = list(np.array(elut.prots)[nonzero])
    if not forR:
        header = "#ProtID TotalCount".split() + elut.fractions
        data = []
        for i in range(len(prots)):
            row = arr[i, :]
            data.append([prots[i], np.sum(row)] + row.tolist())
    else:
        # R: no column header for the first column, and transpose.
        header = prots
        data = []
        for i in range(len(elut.fractions)):
            data.append([elut.fractions[i]] + arr[:, i].tolist())
    ut.write_tab_file([header] + data, fname)
def munge_original(fdata, column_inds, fnames, fout, first_names=1):
    """Keep the selected score columns, replace 'NA' with '?', and drop
    rows whose selected values are all missing. The two id columns are
    kept automatically; column_inds start at 0 for the scores. The same
    columns are pulled from fnames and written to fout + '_names' as a
    record of what was kept.
    """
    wanted = set(column_inds)  # O(1) membership tests inside the loop
    blank = ['?'] * len(column_inds)
    kept = []
    for row in ut.load_tab_file(fdata):
        scores = [row[i + 2] if row[i + 2] != 'NA' else '?'
                for i in range(len(row)) if i in wanted]
        if scores != blank:
            kept.append(list(row[:2]) + scores)
    ut.write_tab_file(kept, fout)
    name_rows = list(ut.load_tab_file(fnames))[first_names:]
    names = [row for i, row in enumerate(name_rows) if i in wanted]
    ut.write_tab_file(names, ut.pre_ext(fout, '_names'))
def transpose(d, fin, fout):
    """Transpose a tab file for R: drops a trailing '#' comment line,
    removes the column-0 header (R doesn't like it), and drops a second
    Total/Descr column when present.

    d: directory whose parent contains the project 'utils' module.
    """
    sys.path.append(d + '/..')
    import utils as ut
    lines = list(ut.load_tab_file(fin))
    # ignore comments, such as the last line in spcount output
    if lines[-1][0].startswith('#'):
        lines = lines[:-1]
        print("skipping last line")
    cols = ut.zip_exact(*lines)  # zip messes up if these files aren't neat
    # _After_ zipping, get rid of the column-1 header--R doesn't like it.
    col0list = list(cols[0])
    print(col0list[0][0])
    # make sure we're removing what we should be
    assert (col0list[0][0] == '#' or col0list[0] == 'Locus')
    # del by index: list.remove() matches by VALUE and could delete the
    # wrong element if the same value occurs earlier in the list.
    del col0list[0]
    cols[0] = tuple(col0list)
    col2title = cols[1][0].lower()
    # get rid of the total/descr column
    if col2title.find('total') > -1 or col2title.find('descr') > -1:
        del cols[1]
        print("removing second column--extraneous")
    ut.write_tab_file(cols, fout)
def exported_diff(cy_basefile, cy_difffile, col_header, diff_ppis=None,
        justids=False):
    """Write a copy of cy_basefile with one extra boolean column
    (col_header) marking whether each interaction is also found in
    cy_difffile -- or in diff_ppis when cy_difffile is passed as None.
    With justids=True only the first two id columns are kept.
    """
    def as_pair(row):
        # cy_ ids look like '<cxid>_<geneid>'; compare on the gene ids
        return (row[0].split('_')[1], row[1].split('_')[1])
    if cy_difffile is None:
        diff = pd.PairDict(diff_ppis)
    else:
        diff = pd.PairDict([as_pair(p)
                for p in ut.load_lot(cy_difffile)[1:]])
    header = ut.load_lol(cy_basefile)[0]
    rows = ut.load_lol(cy_basefile)[1:]
    if justids:
        rows = [r[:2] for r in rows]
        header = header[:2]
    marked = [r + [diff.contains(as_pair(r))] for r in rows]
    ut.write_tab_file(marked, ut.pre_ext(cy_basefile, col_header),
            header=header + [col_header])
def ensembl_prots_to_genes(fname, bar_split=None, second_split=False,
        only_geneid_on_line=False, pid_replace=False):
    """Take a protein sequence file and keep only the longest sequence for
    each gene, writing fname+'_longest'. Designed for ensembl fasta
    sequence downloads, so that inparanoid orthology can be run on one
    sequence per gene, which is cleaner to understand.

    bar_split: use 1 for Dd, Sp; leave out for standard ensembl.
        for Sp, use second_split=True.
    pid_replace: rewrite each header line to '>geneid ... protein:protid'.
        only works the first time--don't try it again after replacement.
    """
    genes_dict = _longest_seqs_dep(fname, bar_split,
            only_geneid_on_line=only_geneid_on_line)
    if pid_replace:
        for geneid, lines in genes_dict.items():
            items = lines[0].split(' ')
            protid = items[0].strip().strip('>')
            items[0] = '>' + geneid
            items.append('protein:' + protid)
            lines[0] = ' '.join(items)
    # Flatten with a comprehension instead of reduce(operator.add, ...):
    # same output, linear instead of quadratic, and yields [] rather than
    # raising TypeError when genes_dict is empty.
    genes_list = [line for g, lines in genes_dict.items() for line in lines]
    ut.write_tab_file(genes_list, fname + '_longest', islist=True)
def write_combined(fnames):
    """Merge the given files, keeping lines unique by column, and write
    the result next to the first file with a '.combined' extension.
    Returns the output path.
    """
    merged = keep_unique_lines_by_column(fnames)
    base = fnames[0].split('.')[:-2]
    fout = '.'.join(base) + '.combined'
    ut.write_tab_file(merged, fout)
    return fout
def export_cxs(tested, fname, negmult):
    """Write one (id1, id2, rescaled score) row per tested pair to fname;
    scores are rescaled with negmult via ut.rescale."""
    rows = []
    for t in tested:
        rows.append((t[0], t[1], ut.rescale(float(t[2]), negmult)))
    ut.write_tab_file(rows, fname)
def export_ints(tested, fname, negmult, header):
    """Write the header row, then one row per tested interaction:
    [id1, id2, rescaled score, any extra fields]."""
    def as_row(t):
        return [t[0], t[1], ut.rescale(float(t[2]), negmult)] + list(t[3:])
    ut.write_tab_file([header] + [as_row(t) for t in tested], fname)