def trim_columns(sequences, opts, tmp_dir):
    """
    Trim alignment columns with trimAl and store the result in ``sequences``.

    The sequences are written to ``<tmp_dir>/aligned.fasta`` (item 'aa_ali'),
    trimAl is run with ``-colnumbering`` and its standard output is parsed as
    a ', '-separated list of the column numbers to keep. In every other
    column, each run of three capital letters is either removed together with
    the '---' gaps (``opts.nogap``) or replaced by 'NNN'.

    :param sequences: dictionary of sequences; the 'codon' item of each entry
       is replaced by a tuple holding one element per alignment column
    :param opts: options object; ``trimcol``, ``gaptreshold``,
       ``similarity``, ``aa`` and ``nogap`` are read
    :param tmp_dir: directory where trimAl input and output files are written

    :raises SystemExit: if trimAl writes 'ERROR' on its standard error
    """
    aali_path = tmp_dir + '/aligned.fasta'
    trimcl_path = tmp_dir + '/trimmed.fasta'
    write_rfasta(sequences, aali_path, what='aa_ali')

    cmds = [BINARIES['trimal']['bin'], '-in', aali_path, '-out', trimcl_path]
    if opts.trimcol == 'specific':
        cmds += ['-gt', str(opts.gaptreshold), '-st', str(opts.similarity)]
    else:
        cmds.append('-' + opts.trimcol)
    cmds.append('-colnumbering')

    # universal_newlines: communicate() returns text (not bytes) on Python 3
    proc = Popen(cmds, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    keeplist, err = proc.communicate()
    LOG.append('')
    if 'ERROR' in err:
        exit('ERROR: trimming columns:\n' + err)

    # set: constant-time membership test for each column
    keep = set(str(keeplist).strip().split(', '))
    # NOTE(review): get_alignment presumably returns one list per column,
    # rows ordered like sorted(sequences) -- confirm against its definition
    algt = get_alignment(sequences, typ=('aa_ali' if opts.aa else 'codon'))
    codon_re = compil('[A-Z]{3}')
    gap_re = compil('---')  # compiled once instead of once per element
    for num, col in enumerate(algt):
        if str(num) in keep:
            continue
        if opts.nogap:
            algt[num] = [gap_re.sub('', codon_re.sub('', x)) for x in col]
        else:
            algt[num] = [codon_re.sub('NNN', x) for x in col]
    # transpose columns back into one tuple per sequence
    for key, seq in zip(sorted(sequences), zip(*algt)):
        sequences[key]['codon'] = seq
def load_impmodel_from_cmm(f_name, rand_init=None, radius=None):
    '''
    Loads an IMPmodel object using an cmm file of the form:

    ::

        <marker_set name="1">
          <marker id="1" x="7347.50964739" y="-7743.92836303" z="-8283.39749204" r="0.00990099009901" g="0" b="0.990099009901" radius="500.0" note="1"/>
          <marker id="2" x="7647.90254377" y="-7308.1816344" z="-7387.75932893" r="0.019801980198" g="0" b="0.980198019802" radius="500.0" note="2"/>
          <link id1="1" id2="2" r="1" g="1" b="1" radius="250.0"/>
        </marker_set>

    :params f_name: path where to find the file
    :params None rand_init: IMP random initial number used to generate the
       model. If not given, the second to last dot-separated field of the
       file name is used when it is an integer
    :param None radius: radius of each particle. If not given, the radius of
       the first marker found in the file is used

    :return: IMPmodel
    '''
    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # file name does not carry a usable number
            rand_init = None
    # coordinates have to be lists: they are filled with append() below
    model = IMPmodel((('x', []), ('y', []), ('z', []),
                      ('rand_init', rand_init), ('index', 0),
                      ('objfun', 0), ('radius', radius)))
    expr = compil(
        ' x="([0-9.-]+)" y="([0-9.-]+)" z="([0-9.-]+)".* radius="([0-9.]+)"')
    with open(f_name) as handle:
        markers = expr.findall(handle.read())
    for xxx, yyy, zzz, marker_radius in markers:
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
        if not model['radius']:
            model['radius'] = float(marker_radius)
    return model
def load_impmodel_from_xyz_OLD(f_name, rand_init=None, radius=None,
                               chromosome='UNKNOWN', start=0, resolution=1):
    """
    Loads an IMPmodel object using an xyz file of the form:

    ::

          p1           1      44.847     412.828    -162.673
          p2           2     -55.574     396.869    -129.782

    :params f_name: path where to find the file
    :params None rand_init: IMP random initial number used to generate the
       model. If not given, the second to last dot-separated field of the
       file name is used when it is an integer
    :param None radius: radius of each particle
    :param 'UNKNOWN' chromosome: stored under model['description']
    :param 0 start: stored under model['description']
    :param 1 resolution: stored under model['description']

    :return: IMPmodel
    """
    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # file name does not carry a usable number
            rand_init = None
    model = IMPmodel((('x', []), ('y', []), ('z', []),
                      ('rand_init', rand_init), ('objfun', None),
                      ('radius', radius)))
    # raw string: '\s' is a regex class, not a string escape
    expr = compil(r'p[0-9]+\s+[0-9]+\s+([0-9.-]+)\s+([0-9.-]+)\s+([0-9.-]+)')
    with open(f_name) as handle:
        coords = expr.findall(handle.read())
    for xxx, yyy, zzz in coords:
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
    model['description'] = {'chromosome': chromosome,
                            'start': start,
                            'resolution': resolution}
    return model
def write(self, outfile=None, item='seq', reverse=False, width=60,
          descr=False):
    """
    Write sequence object to file in fasta format

    :argument None outfile: path to outfile, if None than, print to stdout
    :argument seq item: what to put in place of sequence
    :argument False reverse: wether to reverse or not the sequence
    :argument 60 width: number of sites per line when printing sequence
    :argument False descr: put description of sequence also, not recommended
       if you are not sure how the aligner will read it.
    """
    out = open(outfile, 'w') if outfile else stdout
    # a line break is inserted after each full run of `width` letters or '-'
    wsub = compil('([A-Za-z-]{' + str(width) + '})')
    try:
        for elt in self:
            if descr:
                out.write('>%s |%s\n' % (elt, self[elt]['descr']))
            else:
                out.write('>%s\n' % (elt))
            seq = self[elt][item]
            if reverse:
                seq = seq[::-1]
            if not isinstance(seq, str):
                # item stored as a sequence of chunks (e.g. list of codons)
                seq = ''.join(seq)
            # NOTE: when the last run is exactly `width` long the substitution
            # already ends with a newline, so an empty line follows the record
            out.write('%s\n' % (wsub.sub('\\1\n', seq)))
    finally:
        # never close stdout, only a file opened here
        if outfile:
            out.close()
def load_impmodel_from_cmm(f_name, rand_init=None, radius=None):
    '''
    Loads an IMPmodel object using an cmm file of the form:

    ::

        <marker_set name="1">
          <marker id="1" x="7347.50964739" y="-7743.92836303" z="-8283.39749204" r="0.00990099009901" g="0" b="0.990099009901" radius="500.0" note="1"/>
          <marker id="2" x="7647.90254377" y="-7308.1816344" z="-7387.75932893" r="0.019801980198" g="0" b="0.980198019802" radius="500.0" note="2"/>
          <link id1="1" id2="2" r="1" g="1" b="1" radius="250.0"/>
        </marker_set>

    :params f_name: path where to find the file
    :params None rand_init: IMP random initial number used to generate the
       model. If not given, the second to last dot-separated field of the
       file name is used when it is an integer
    :param None radius: radius of each particle. If not given, the radius of
       the first marker found in the file is used

    :return: IMPmodel
    '''
    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # file name does not carry a usable number
            rand_init = None
    model = IMPmodel((('x', []), ('y', []), ('z', []),
                      ('rand_init', rand_init), ('index', 0),
                      ('objfun', 0), ('radius', radius)))
    expr = compil(
        ' x="([0-9.-]+)" y="([0-9.-]+)" z="([0-9.-]+)".* radius="([0-9.]+)"')
    # context manager: the file handle is released as soon as it is read
    with open(f_name) as handle:
        markers = expr.findall(handle.read())
    for xxx, yyy, zzz, marker_radius in markers:
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
        if not model['radius']:
            model['radius'] = float(marker_radius)
    return model
def read_fasta(infile):
    '''
    read file in fasta format and yield each sequence

    Each header line (starting with '>') is split at its first blank or tab:
    what comes before is the name, the remainder (if any) the description.
    Blanks and tabs found inside sequence lines are removed.

    :param infile: path to the fasta file

    :yields: a dictionary with keys 'name', 'descr' (None when the header
       holds no description) and 'seq'

    :raises SystemExit: if a header is not followed by any sequence, or if
       no sequence at all is found in the file
    '''
    blank_re = compil('[ \t]')
    nam = None
    descr = None
    chunks = []  # pieces of the current sequence, joined once per record
    with open(infile) as handle:
        for line in handle:
            line = line.strip()
            if not line.startswith('>'):
                chunks.append(blank_re.sub('', line))
                continue
            # new header: flush the record read so far, if any
            if nam is not None:
                seq = ''.join(chunks)
                if not seq:
                    exit('ERROR: no sequence for ' + str(nam))
                yield {'name': nam, 'descr': descr, 'seq': seq}
            items = blank_re.split(line, maxsplit=1)
            nam = items[0].lstrip('>')
            descr = items[1] if len(items) == 2 else None
            chunks = []
    seq = ''.join(chunks)
    if not seq:
        if nam is not None:
            exit('ERROR: no sequence for ' + str(nam))
        # neither a header nor any residue was read
        exit('ERROR: no sequence found in ' + str(infile))
    yield {'name': nam, 'descr': descr, 'seq': seq}
def write(self, outfile=None, item='seq', reverse=False, width=60,
          descr=False):
    """
    Write sequence object to file in fasta format

    :argument None outfile: path to outfile, if None than, print to stdout
    :argument seq item: what to put in place of sequence
    :argument False reverse: wether to reverse or not the sequence
    :argument 60 width: number of sites per line when printing sequence
    :argument False descr: put description of sequence also, not recommended
       if you are not sure how the aligner will read it.
    """
    if outfile:
        out = open(outfile, 'w')
    else:
        out = stdout
    # a line break is inserted after each full run of `width` letters or '-'
    wsub = compil('([A-Za-z-]{' + str(width) + '})')
    try:
        for elt in self:
            header = '>%s\n' % (elt)
            if descr:
                header = '>%s |%s\n' % (elt, self[elt]['descr'])
            out.write(header)
            seq = self[elt][item][::-1] if reverse else self[elt][item]
            # items may be stored as a sequence of chunks rather than a string
            seq = seq if isinstance(seq, str) else ''.join(seq)
            # NOTE: when the last run is exactly `width` long the substitution
            # already ends with a newline, so an empty line follows the record
            out.write('%s\n' % (wsub.sub('\\1\n', seq)))
    finally:
        # only close what was opened here, never stdout
        if outfile:
            out.close()
def trim_columns(sequences, opts, tmp_dir):
    """
    Trim alignment columns with trimAl and store the result in ``sequences``.

    The sequences are written to ``<tmp_dir>/aligned.fasta`` (item 'aa_ali'),
    trimAl is run with ``-colnumbering`` and its standard output is parsed as
    a ', '-separated list of the column numbers to keep. In every other
    column, each run of three capital letters is either removed together with
    the '---' gaps (``opts.nogap``) or replaced by 'NNN'.

    :param sequences: dictionary of sequences; the 'codon' item of each entry
       is replaced by a tuple holding one element per alignment column
    :param opts: options object; ``trimcol``, ``gaptreshold``,
       ``similarity`` and ``nogap`` are read
    :param tmp_dir: directory where trimAl input and output files are written

    :raises SystemExit: if trimAl writes 'ERROR' on its standard error
    """
    aali_path = tmp_dir + '/aligned.fasta'
    write_rfasta(sequences, aali_path, what='aa_ali')
    trimcl_path = tmp_dir + '/trimmed.fasta'

    if opts.trimcol == 'specific':
        criteria = ['-gt', str(opts.gaptreshold),
                    '-st', str(opts.similarity)]
    else:
        criteria = ['-' + opts.trimcol]
    cmds = ([BINARIES['trimal']['bin'], '-in', aali_path, '-out', trimcl_path]
            + criteria + ['-colnumbering'])

    # universal_newlines: communicate() returns text (not bytes) on Python 3
    proc = Popen(cmds, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    keeplist, err = proc.communicate()
    LOG.append('')
    if 'ERROR' in err:
        exit('ERROR: trimming columns:\n' + err)

    # set: constant-time membership test for each column
    keep = set(str(keeplist).strip().split(', '))
    # NOTE(review): get_alignment presumably returns one list per column,
    # rows ordered like sorted(sequences) -- confirm against its definition
    algt = get_alignment(sequences)
    codon_re = compil('[A-Z]{3}')
    gap_re = compil('---')  # compiled once instead of once per element
    for num, col in enumerate(algt):
        if str(num) in keep:
            continue
        if opts.nogap:
            algt[num] = [gap_re.sub('', codon_re.sub('', x)) for x in col]
        else:
            algt[num] = [codon_re.sub('NNN', x) for x in col]
    # transpose columns back into one tuple per sequence
    for key, seq in zip(sorted(sequences), zip(*algt)):
        sequences[key]['codon'] = seq
def load_impmodel_from_xyz(f_name, rand_init=None, radius=None):
    """
    Loads an IMPmodel object using an xyz file of the form:

    ::

      # ID              : some identifier
      # SPECIES         : None
      # CELL TYPE       : None
      # EXPERIMENT TYPE : Hi-C
      # RESOLUTION      : 10000
      # ASSEMBLY        : None
      # CHROMOSOME      : 19
      # START           : 1
      # END             : 50
      1  19:1-10000        44.847     412.828    -162.673
      2  19:10001-20000   -55.574     396.869    -129.782

    Header lines ('# key : value') are stored, with lower-cased keys, under
    model['description'].

    :params f_name: path where to find the file
    :params None rand_init: IMP random initial number used to generate the
       model. If not given, the second to last dot-separated field of the
       file name is used when it is an integer
    :param None radius: radius of each particle

    :return: IMPmodel
    """
    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # file name does not carry a usable number
            rand_init = None
    model = IMPmodel(
        (('x', []), ('y', []), ('z', []), ('rand_init', rand_init),
         ('index', 0), ('objfun', 0), ('radius', radius)))
    # raw string: '\s' is a regex class, not a string escape
    expr = compil(
        r'[0-9]+\s[A-Za-z0-9_ ]+:[0-9]+-[0-9]+'
        r'\s+([0-9.-]+)\s+([0-9.-]+)\s+([0-9.-]+)')
    # read once: the same lines feed the header and the coordinates parsing
    with open(f_name) as handle:
        lines = handle.readlines()
    model['description'] = {}
    for line in lines:
        if not line.startswith('# '):
            continue
        # split at the first ':' only, the value may itself contain colons
        key, sep, val = line.strip('# ').partition(':')
        if not sep:
            # comment line without 'key : value' structure
            continue
        model['description'][key.strip().lower()] = val.strip()
    for xxx, yyy, zzz in expr.findall(''.join(lines)):
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
    return model
def load_impmodel_from_xyz(f_name, rand_init=None, radius=None):
    """
    Loads an IMPmodel object using an xyz file of the form:

    ::

      # ID              : some identifier
      # SPECIES         : None
      # CELL TYPE       : None
      # EXPERIMENT TYPE : Hi-C
      # RESOLUTION      : 10000
      # ASSEMBLY        : None
      # CHROMOSOME      : 19
      # START           : 1
      # END             : 50
      1  19:1-10000        44.847     412.828    -162.673
      2  19:10001-20000   -55.574     396.869    -129.782

    Header lines ('# key : value') are stored, with lower-cased keys, under
    model['description'].

    :params f_name: path where to find the file
    :params None rand_init: IMP random initial number used to generate the
       model. If not given, the second to last dot-separated field of the
       file name is used when it is an integer
    :param None radius: radius of each particle

    :return: IMPmodel
    """
    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # file name does not carry a usable number
            rand_init = None
    model = IMPmodel((('x', []), ('y', []), ('z', []),
                      ('rand_init', rand_init), ('index', 0),
                      ('objfun', 0), ('radius', radius)))
    # raw string: '\s' is a regex class, not a string escape
    expr = compil(r'[0-9]+\s[A-Za-z0-9_ ]+:[0-9]+-[0-9]+'
                  r'\s+([0-9.-]+)\s+([0-9.-]+)\s+([0-9.-]+)')
    # single read: header and coordinates are parsed from the same lines
    with open(f_name) as handle:
        lines = handle.readlines()
    description = {}
    for line in lines:
        if line.startswith('# '):
            # only the first ':' separates key and value
            key, sep, val = line.strip('# ').partition(':')
            if sep:
                description[key.strip().lower()] = val.strip()
    model['description'] = description
    for xxx, yyy, zzz in expr.findall(''.join(lines)):
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
    return model
def _run_external(cmd, fail_msg):
    '''
    Run an external command and return what it wrote on standard output.

    :param cmd: list with the program and its arguments
    :param fail_msg: message passed to exit() when the command returns a
       non-zero status

    :return: standard output of the command, as text
    '''
    # stderr is piped too, and universal_newlines gives text on Python 3
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    out, err = proc.communicate()
    if proc.returncode != 0:
        stderr.write(err + out + '\n')
        exit(fail_msg)
    return out


def main():
    '''
    main function when called by command line.

    Steps, in order: translate the input sequences and write them, align
    them with muscle (or just copy them when opts.input_ali is set),
    optionally trim sequences with trimal, map the alignment to codons,
    optionally trim columns with trimal, optionally print the map, write the
    codon alignment, remove temporary files and print the log if asked.
    '''
    opts = get_options()
    log = '\n\n'
    gencode = _set_code(opts.code)
    seqs = {}
    for seq in read_fasta(opts.fastafile):
        seq['trseq'] = translate(seq['seq'], gencode, stop=opts.remove_stop)
        seqs[seq['name']] = seq
    log += ' ' + str(len(seqs)) + ' sequences\n\n'

    prot_path = opts.outfile + '_prot'
    aali_path = opts.outfile + '_aa_ali'
    ali_path = opts.outfile + '_ali'
    trimsq_path = opts.outfile + '_trimseq'
    trimcl_path = opts.outfile + '_trimcol'
    score_path = opts.outfile + '_score'
    map_path = opts.outfile + '_map'
    todel = [prot_path]

    write_fasta(seqs, prot_path, clean=True, typ='trseq')
    if opts.only_translate:
        exit()

    ###########
    # ALIGN
    if not opts.input_ali:
        muscle_cmd = [opts.muscle_bin, '-quiet', '-noanchors',
                      '-maxiters', '999',
                      '-maxhours', '24 ',
                      '-maxtrees', '100',
                      '-in', prot_path,
                      '-out', aali_path]
        if opts.score:
            muscle_cmd += ['-scorefile', score_path]
        _run_external(muscle_cmd, '\nERROR: runninge muscle')
        log += ' Muscle command line: \n' + ' '.join(muscle_cmd) + '\n\n'
    else:
        _run_external(['cp', prot_path, aali_path],
                      '\nERROR: when skipping muscle.')

    ###########
    # TRIM SEQS
    if opts.trimseq != False:
        todel.append(trimsq_path)
        trimseq_cmd = [opts.trimal_bin, '-in', aali_path,
                       '-out', trimsq_path,
                       '-resoverlap', str(opts.trimseq[1]),
                       '-seqoverlap', str(opts.trimseq[2]),
                       '-cons', '100']
        _run_external(trimseq_cmd, '\nERROR: running trimal')
        for seq in read_fasta(trimsq_path):
            seqs[seq['name']]['ali'] = seq['seq']
        # sequences absent from trimal output have been trimmed out
        trimmed = [name for name in seqs if 'ali' not in seqs[name]]
        if trimmed:
            if not opts.quiet:
                stderr.write('WARNING: trimmed sequences: \n\t' +
                             '\n\t'.join(trimmed) + '\n')
            log += '->trimmed sequences: \n\t' + \
                   '\n\t'.join(trimmed) + '\n'
        else:
            log += '->no trimmed sequences\n'
        for name in trimmed:
            del seqs[name]
        # logged from the list actually run, before aali_path is redirected
        log += ' Trimal (sequences) command line: \n' + \
               ' '.join(trimseq_cmd) + '\n\n'
        aali_path = trimsq_path
    else:
        for seq in read_fasta(aali_path):
            seqs[seq['name']]['ali'] = seq['seq']

    ###########
    # CODON MAP
    seqs = map2codons(seqs, opts.input_ali)

    ###########
    # TRIM COLS
    if opts.trimcol != 'None':
        todel.append(trimcl_path)
        trimcol_cmd = [opts.trimal_bin, '-in', aali_path,
                       '-out', trimcl_path]
        if opts.trimcol == 'specific':
            trimcol_cmd += ['-gt', str(opts.gaptreshold),
                            '-st', str(opts.similarity)]
        else:
            trimcol_cmd.append('-' + opts.trimcol)
        trimcol_cmd.append('-colnumbering')
        keeplist = _run_external(trimcol_cmd, 'ERROR: trimming columns.')
        log += ' Trimal (columns) command line: \n' + \
               ' '.join(trimcol_cmd) + '\n'
        # trimal output is read as a ', '-separated list of kept columns
        keep = set(str(keeplist).strip().split(', '))
        # NOTE(review): get_alignment presumably returns one list per
        # column, rows ordered like sorted(seqs) -- confirm
        algt = get_alignment(seqs)
        codon_re = compil('[A-Z]{3}')
        gap_re = compil('---')
        for num, col in enumerate(algt):
            if str(num) in keep:
                continue
            if opts.nogap:
                # drop the column: remove both codons and gaps
                algt[num] = [gap_re.sub('', codon_re.sub('', x))
                             for x in col]
            else:
                # mask the column
                algt[num] = [codon_re.sub('NNN', x) for x in col]
        for key, seq in zip(sorted(seqs), zip(*algt)):
            seqs[key]['codons'] = ''.join(seq)

    ###########
    # SEQ MAP
    if opts.printmap:
        _printmap(seqs, map_path, opts.pymap)

    write_fasta(seqs, ali_path, clean=opts.clean, typ='codons')

    # wait for the clean-up so temporary files are gone when main returns
    Popen(['rm', '-f'] + todel).wait()

    if opts.print_log:
        print(log)