def generate_hmm_(opts):
    """Build an HMM from the reference MSA and return the path to it.

    The alignment named by ``opts.REFMSA`` is re-written in Stockholm
    format to a scratch file, which is then passed to ``HMMER.build``.
    Records are first read through a ``Verifier`` using ``DNAAlphabet``;
    when the encoder is not ``DNAEncoder`` each record is passed through
    ``translate`` before being written.  If that pass raises
    ``VerifyError`` the source is switched to ``AminoAlphabet`` and the
    records are written untranslated instead.

    Args:
        opts: options object; the attributes ``ENCODER``, ``REFMSA``,
            ``HMMER_ALIGN_BIN`` and ``HMMER_BUILD_BIN`` are read.

    Returns:
        Path of the temporary file handed to ``HMMER.build`` as its
        output.  This function does not delete it on success.

    Raises:
        RuntimeError: the encoder is ``DNAEncoder`` but reading the
            reference with ``DNAAlphabet`` raised ``VerifyError``.
    """
    fd, tmphmm = mkstemp()
    close(fd)
    fd, tmpaln = mkstemp()
    close(fd)

    is_dna = opts.ENCODER == DNAEncoder

    built = False
    try:
        with open(opts.REFMSA) as msa_fh:
            with open(tmpaln, 'w') as aln_fh:
                msa_fmt = seqfile_format(opts.REFMSA)
                source = Verifier(SeqIO.parse(msa_fh, msa_fmt), DNAAlphabet)
                try:
                    SeqIO.write(
                        (record if is_dna else translate(record)
                         for record in source),
                        aln_fh,
                        'stockholm')
                except VerifyError:
                    if is_dna:
                        raise RuntimeError(
                            "DNA encoding incompatible with protein reference MSA"
                        )
                    source.set_alphabet(AminoAlphabet)
                    # Start the scratch file over: rewind AND truncate so
                    # nothing written by the failed DNA pass can survive
                    # past the end of the new output.
                    aln_fh.seek(0)
                    aln_fh.truncate()
                    SeqIO.write(source, aln_fh, 'stockholm')

        hmmer = HMMER(opts.HMMER_ALIGN_BIN, opts.HMMER_BUILD_BIN)
        hmmer.build(
            tmphmm,
            tmpaln,
            alphabet=HMMER.DNA if is_dna else HMMER.AMINO)
        built = True
    finally:
        # The alignment is only an intermediate; always discard it.
        if exists(tmpaln):
            remove(tmpaln)
        # On failure the caller never learns the HMM path, so remove it
        # here rather than leaking a temp file.
        if not built and exists(tmphmm):
            remove(tmphmm)

    return tmphmm
def seqrecords():
    """Yield the records parsed from ``seq_fh``.

    ``ARGS`` and ``seq_fh`` come from the enclosing scope.  Records are
    read through a ``Verifier`` using ``DNAAlphabet``.  With a
    ``DNAEncoder`` they are yielded unchanged, otherwise each one is
    passed through ``translate`` first.  If ``VerifyError`` is raised the
    source is switched to ``AminoAlphabet`` and the records it then
    produces are yielded as-is.

    Raises:
        RuntimeError: ``VerifyError`` was raised while the encoder is
            ``DNAEncoder``.
    """
    wants_dna = ARGS.ENCODER == DNAEncoder
    fmt = seqfile_format(ARGS.SEQUENCES)
    checked = Verifier(SeqIO.parse(seq_fh, fmt), DNAAlphabet)
    try:
        for rec in checked:
            if wants_dna:
                yield rec
            else:
                yield translate(rec)
    except VerifyError:
        if wants_dna:
            raise RuntimeError(
                "your model specifies a DNA encoding "
                "which is incompatible with protein sequences"
            )
        # Not DNA: re-verify against the amino-acid alphabet instead.
        checked.set_alphabet(AminoAlphabet)
        for rec in checked:
            yield rec
def __call__(self, string):
    """Return the first sequence record of the file named by *string*.

    The file is parsed with the format reported by ``seqfile_format``
    and read through a ``Verifier`` using ``DNAAlphabet``.  When
    ``self.is_dna`` is false the record is passed through ``translate``.
    If ``VerifyError`` is raised the source is switched to
    ``AminoAlphabet`` and the first record it then yields is returned
    untranslated.

    Args:
        string: path of the sequence file.

    Returns:
        The first record read from the file.

    Raises:
        ArgumentTypeError: ``self.is_dna`` is true but ``VerifyError``
            was raised, or any other ordinary error occurred while
            opening or parsing the file.
    """
    try:
        with open(string) as h:
            source = Verifier(
                SeqIO.parse(h, seqfile_format(string)),
                DNAAlphabet)
            try:
                seq = next(iter(source))
                if not self.is_dna:
                    seq = translate(seq)
            except VerifyError:
                if self.is_dna:
                    raise ArgumentTypeError(
                        "DNA encoding incompatible with protein reference")
                source.set_alphabet(AminoAlphabet)
                seq = next(iter(source))
        return seq
    except ArgumentTypeError:
        # Already the right type with a specific message; pass it on.
        raise
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and
        # SystemExit must propagate instead of being reported as a bad
        # input file.
        raise ArgumentTypeError(
            "invalid FASTA file '{0:s}'".format(string))
def __call__(self, string):
    """Return the first sequence record of the file named by *string*.

    The file is parsed with the format reported by ``seqfile_format``
    and read through a ``Verifier`` using ``DNAAlphabet``.  When
    ``self.is_dna`` is false the record is passed through ``translate``.
    If ``VerifyError`` is raised the source is switched to
    ``AminoAlphabet`` and the first record it then yields is returned
    untranslated.

    Args:
        string: path of the sequence file.

    Returns:
        The first record read from the file.

    Raises:
        ArgumentTypeError: ``self.is_dna`` is true but ``VerifyError``
            was raised, or any other ordinary error occurred while
            opening or parsing the file.
    """
    try:
        with open(string) as h:
            source = Verifier(
                SeqIO.parse(h, seqfile_format(string)),
                DNAAlphabet)
            try:
                seq = next(iter(source))
                if not self.is_dna:
                    seq = translate(seq)
            except VerifyError:
                if self.is_dna:
                    raise ArgumentTypeError(
                        "DNA encoding incompatible with protein reference")
                source.set_alphabet(AminoAlphabet)
                seq = next(iter(source))
        return seq
    except ArgumentTypeError:
        # Already the right type with a specific message; pass it on.
        raise
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and
        # SystemExit must propagate instead of being reported as a bad
        # input file.
        raise ArgumentTypeError(
            "invalid FASTA file '{0:s}'".format(string))
def generate_hmm_(opts):
    """Build an HMM from the reference MSA and return the path to it.

    The alignment named by ``opts.REFMSA`` is re-written in Stockholm
    format to a scratch file, which is then passed to ``HMMER.build``.
    Records are first read through a ``Verifier`` using ``DNAAlphabet``;
    when the encoder is not ``DNAEncoder`` each record is passed through
    ``translate`` before being written.  If that pass raises
    ``VerifyError`` the source is switched to ``AminoAlphabet`` and the
    records are written untranslated instead.

    Args:
        opts: options object; the attributes ``ENCODER``, ``REFMSA``,
            ``HMMER_ALIGN_BIN`` and ``HMMER_BUILD_BIN`` are read.

    Returns:
        Path of the temporary file handed to ``HMMER.build`` as its
        output.  This function does not delete it on success.

    Raises:
        RuntimeError: the encoder is ``DNAEncoder`` but reading the
            reference with ``DNAAlphabet`` raised ``VerifyError``.
    """
    fd, tmphmm = mkstemp()
    close(fd)
    fd, tmpaln = mkstemp()
    close(fd)

    is_dna = opts.ENCODER == DNAEncoder

    built = False
    try:
        with open(opts.REFMSA) as msa_fh:
            with open(tmpaln, 'w') as aln_fh:
                msa_fmt = seqfile_format(opts.REFMSA)
                source = Verifier(SeqIO.parse(msa_fh, msa_fmt), DNAAlphabet)
                try:
                    SeqIO.write(
                        (record if is_dna else translate(record)
                         for record in source),
                        aln_fh,
                        'stockholm')
                except VerifyError:
                    if is_dna:
                        raise RuntimeError(
                            "DNA encoding incompatible with protein reference MSA"
                        )
                    source.set_alphabet(AminoAlphabet)
                    # Start the scratch file over: rewind AND truncate so
                    # nothing written by the failed DNA pass can survive
                    # past the end of the new output.
                    aln_fh.seek(0)
                    aln_fh.truncate()
                    SeqIO.write(source, aln_fh, 'stockholm')

        hmmer = HMMER(opts.HMMER_ALIGN_BIN, opts.HMMER_BUILD_BIN)
        hmmer.build(
            tmphmm,
            tmpaln,
            alphabet=HMMER.DNA if is_dna else HMMER.AMINO)
        built = True
    finally:
        # The alignment is only an intermediate; always discard it.
        if exists(tmpaln):
            remove(tmpaln)
        # On failure the caller never learns the HMM path, so remove it
        # here rather than leaking a temp file.
        if not built and exists(tmphmm):
            remove(tmphmm)

    return tmphmm
def seqrecords(self, antibodies, clonal=False):
    """Load sequence records with neutralization values from the database.

    The requested antibody names are first expanded with every name
    listed in the ``ALT_IDS`` column of their ``ANTIBODY`` rows.  One
    record is then built per row of a query joining ``SEQUENCE``,
    ``GENO_REPORT`` and ``NEUT``; rows with no parseable numeric value
    are skipped with a warning.

    Args:
        antibodies: iterable of antibody names to query.
        clonal: when true, restrict ``SEQUENCE`` to ``IS_CLONAL = 1``.

    Returns:
        A ``(records, clonal, antibodies)`` tuple, where ``records`` is
        a list and ``antibodies`` is the sorted tuple of requested names
        plus their alternates.
    """
    conn = connect(self.__filename)
    # try/finally so the connection is released even when a query or
    # the alphabet verification raises.
    try:
        cur = conn.cursor()

        antibodies_ = set(antibodies)
        ab_clause = ' or '.join(['ANTIBODY = ?'] * len(antibodies_))

        # Collect alternates from EVERY matching row, not only the first:
        # each requested antibody may carry its own ALT_IDS, and no
        # matching row at all simply means there are no alternates.
        equivalencies = set()
        alt_rows = cur.execute(
            'select distinct ALT_IDS from ANTIBODY where %s' % ab_clause,
            tuple(antibodies_)).fetchall()
        for alt_row in alt_rows:
            equivalencies.update((alt_row[0] or '').split(','))
        equivalencies.discard('')

        if len(equivalencies):
            antibodies_ |= equivalencies
            ab_clause = ' or '.join(['ANTIBODY = ?'] * len(antibodies_))

        antibodies__ = tuple(sorted(antibodies_))

        # Only fixed text and '?' placeholders are interpolated into the
        # statement; the antibody names themselves are bound parameters.
        stmt = dedent('''\
        select distinct SG.NO as NO, SG.ID as ID, SG.SEQ as SEQ, SG.SUBTYPE as SUBTYPE, ? as AB, N.VALUE as VALUE from
        (select NO, S.ID as ID, SUBTYPE, SEQ from
        (select SEQUENCE_NO as NO, SEQUENCE_ID as ID, RAW_SEQ as SEQ from SEQUENCE {0:s} group by ID) as S left join
        (select SEQUENCE_ID as ID, SUBTYPE from GENO_REPORT group by ID) as G
        on S.ID = G.ID
        ) as SG join
        (select SEQUENCE_ID as ID, ANTIBODY as AB, group_concat(TYPE || ':' || VALUE, ',') as VALUE from NEUT where ({1:s}) group by ID) as N
        on SG.ID = N.ID order by SG.ID;
        '''.format('where IS_CLONAL = 1' if clonal else '', ab_clause))

        # The first placeholder is the '+'-joined label selected as AB;
        # the remaining ones fill the antibody clause.
        params = ('+'.join(antibodies__),) + antibodies__
        cur.execute(stmt, params)

        def records():
            # Number of times each sequence id has been emitted so far.
            ids = {}
            for row in cur:
                _no, sid, seq, subtype, ab, values = row[:6]
                # VALUE is a comma-joined list of 'TYPE:VALUE' pairs.
                values_ = {}
                for kv in values.split(','):
                    k, v = kv.split(':')
                    try:
                        # Strip censoring markers such as '<' or '>'.
                        v_ = float(v.strip().lstrip('<>'))
                    except ValueError:
                        continue
                    if k not in values_:
                        values_[k] = []
                    values_[k].append(v_)
                if len(values_) == 0:
                    warn("skipping sequence '%s', invalid values '%s'" % (sid, values))
                    continue
                record = SeqRecord(
                    Seq(OrfList(seq, include_stops=False)[0], DNAAlphabet),
                    id=sid,
                    description=json_dumps({
                        'subtype': '' if subtype is None else subtype,
                        'ab': ab,
                        'values': values_
                    }),
                    annotations={'antibody': values_, 'subtype': subtype}
                )
                # Repeated ids get a '-<count>' suffix appended.
                if sid in ids:
                    record.id += str(-ids[sid])
                    ids[sid] += 1
                else:
                    ids[sid] = 1
                yield record

        source = Verifier(records(), DNAAlphabet)
        try:
            seqrecords = list(source)
        except VerifyError:
            source.set_alphabet(AminoAlphabet)
            seqrecords = list(source)

        return seqrecords, clonal, antibodies__
    finally:
        conn.close()
def seqrecords(self, antibodies, clonal=False):
    """Load FASTA records and attach IC50 values read from the CSV file.

    Records are read from ``self.__fastafile`` and values from
    ``self.__csvfile``.  A trailing ``_<digits>`` or ``-<digits>`` on
    record ids and CSV accessions is normalized to ``_<digits>`` so the
    two files can be matched.  Records without any parseable value are
    dropped with a warning.

    Args:
        antibodies: sequence holding at most one antibody name, which
            must be a column header of the CSV file.
        clonal: must be false.

    Returns:
        A ``(records, clonal, antibodies)`` tuple; ``records`` is the
        list of records that received at least one value.

    Raises:
        ValueError: ``clonal`` is true, more than one antibody is given,
            the CSV file has no header, or an antibody is not a column.
    """
    if clonal:
        raise ValueError(
            'clonal property is not available with Monogram datasets')

    if len(antibodies) > 1:
        raise ValueError(
            'only one antibody can be interrogated with Monogram datasets')

    with open(self.__fastafile) as h:
        source = Verifier(SeqIO.parse(h, 'fasta'), DNAAlphabet)
        try:
            seqrecords = list(source)
        except VerifyError:
            source.set_alphabet(AminoAlphabet)
            seqrecords = list(source)

    underdash = re_compile(r'[_-](\d+)$')
    for r in seqrecords:
        r.id = underdash.sub(r'_\1', r.id)

    ic50s = dict((r.id, []) for r in seqrecords)

    with open(self.__csvfile) as fh:
        sample = fh.read(MonogramData.__sample_len)
        sniffer = csv_sniffer()
        dialect = sniffer.sniff(sample)
        if not sniffer.has_header(sample):
            raise ValueError(MonogramData.__no_header_msg)
        fh.seek(0)
        reader = csv_reader(fh, dialect)
        columns = None
        for row in reader:
            if columns is None:
                # First row is the header: map column name -> index.
                columns = dict((v.strip(), j) for j, v in enumerate(row))
                missing = set(antibodies) - set(columns.keys())
                if len(missing):
                    raise ValueError(
                        "antibodies ('%s') not found!" % "', '".join(missing))
                continue
            if not row:
                # csv.reader yields [] for blank lines; nothing to match.
                continue
            acc = underdash.sub(r'_\1', row[0])
            if acc not in ic50s:
                continue
            try:
                cln_ic50s = [
                    float(row[columns[ab]].strip().lstrip('<>'))
                    for ab in antibodies
                    if ab in columns and columns[ab] < len(row)
                ]
            except ValueError:
                # Unparseable cell: best-effort, leave this row out.
                continue
            ic50s[acc].extend(cln_ic50s)

    kept = []
    for r in seqrecords:
        if r.id not in ic50s or len(ic50s[r.id]) == 0:
            warn("skipping sequence '%s', VALUE not found" % r.id)
            continue
        values = {'IC50': ic50s[r.id]}
        r.description = json_dumps({
            'ab': antibodies[0],
            'values': values
        })
        r.annotations['antibody'] = values
        kept.append(r)

    return kept, clonal, antibodies
def seqrecords(self, antibodies, clonal=False):
    """Load sequence records with neutralization values from the database.

    The requested antibody names are first expanded with every name
    listed in the ``ALT_IDS`` column of their ``ANTIBODY`` rows.  One
    record is then built per row of a query joining ``SEQUENCE``,
    ``GENO_REPORT`` and ``NEUT``; rows with no parseable numeric value
    are skipped with a warning.

    Args:
        antibodies: iterable of antibody names to query.
        clonal: when true, restrict ``SEQUENCE`` to ``IS_CLONAL = 1``.

    Returns:
        A ``(records, clonal, antibodies)`` tuple, where ``records`` is
        a list and ``antibodies`` is the sorted tuple of requested names
        plus their alternates.
    """
    conn = connect(self.__filename)
    # try/finally so the connection is released even when a query or
    # the alphabet verification raises.
    try:
        cur = conn.cursor()

        antibodies_ = set(antibodies)
        ab_clause = ' or '.join(['ANTIBODY = ?'] * len(antibodies_))

        # Collect alternates from EVERY matching row, not only the first:
        # each requested antibody may carry its own ALT_IDS, and no
        # matching row at all simply means there are no alternates.
        equivalencies = set()
        alt_rows = cur.execute(
            'select distinct ALT_IDS from ANTIBODY where %s' % ab_clause,
            tuple(antibodies_)).fetchall()
        for alt_row in alt_rows:
            equivalencies.update((alt_row[0] or '').split(','))
        equivalencies.discard('')

        if len(equivalencies):
            antibodies_ |= equivalencies
            ab_clause = ' or '.join(['ANTIBODY = ?'] * len(antibodies_))

        antibodies__ = tuple(sorted(antibodies_))

        # Only fixed text and '?' placeholders are interpolated into the
        # statement; the antibody names themselves are bound parameters.
        stmt = dedent('''\
        select distinct SG.NO as NO, SG.ID as ID, SG.SEQ as SEQ, SG.SUBTYPE as SUBTYPE, ? as AB, N.VALUE as VALUE from
        (select NO, S.ID as ID, SUBTYPE, SEQ from
        (select SEQUENCE_NO as NO, SEQUENCE_ID as ID, RAW_SEQ as SEQ from SEQUENCE {0:s} group by ID) as S left join
        (select SEQUENCE_ID as ID, SUBTYPE from GENO_REPORT group by ID) as G
        on S.ID = G.ID
        ) as SG join
        (select SEQUENCE_ID as ID, ANTIBODY as AB, group_concat(TYPE || ':' || VALUE, ',') as VALUE from NEUT where ({1:s}) group by ID) as N
        on SG.ID = N.ID order by SG.ID;
        '''.format('where IS_CLONAL = 1' if clonal else '', ab_clause))

        # The first placeholder is the '+'-joined label selected as AB;
        # the remaining ones fill the antibody clause.
        params = ('+'.join(antibodies__),) + antibodies__
        cur.execute(stmt, params)

        def records():
            # Number of times each sequence id has been emitted so far.
            ids = {}
            for row in cur:
                _no, sid, seq, subtype, ab, values = row[:6]
                # VALUE is a comma-joined list of 'TYPE:VALUE' pairs.
                values_ = {}
                for kv in values.split(','):
                    k, v = kv.split(':')
                    try:
                        # Strip censoring markers such as '<' or '>'.
                        v_ = float(v.strip().lstrip('<>'))
                    except ValueError:
                        continue
                    if k not in values_:
                        values_[k] = []
                    values_[k].append(v_)
                if len(values_) == 0:
                    warn("skipping sequence '%s', invalid values '%s'" % (sid, values))
                    continue
                record = SeqRecord(
                    Seq(OrfList(seq, include_stops=False)[0], DNAAlphabet),
                    id=sid,
                    description=json_dumps({
                        'subtype': '' if subtype is None else subtype,
                        'ab': ab,
                        'values': values_
                    }),
                    annotations={'antibody': values_, 'subtype': subtype}
                )
                # Repeated ids get a '-<count>' suffix appended.
                if sid in ids:
                    record.id += str(-ids[sid])
                    ids[sid] += 1
                else:
                    ids[sid] = 1
                yield record

        source = Verifier(records(), DNAAlphabet)
        try:
            seqrecords = list(source)
        except VerifyError:
            source.set_alphabet(AminoAlphabet)
            seqrecords = list(source)

        return seqrecords, clonal, antibodies__
    finally:
        conn.close()
def seqrecords(self, antibodies, clonal=False):
    """Load FASTA records and attach IC50 values read from the CSV file.

    Records are read from ``self.__fastafile`` and values from
    ``self.__csvfile``.  A trailing ``_<digits>`` or ``-<digits>`` on
    record ids and CSV accessions is normalized to ``_<digits>`` so the
    two files can be matched.  Records without any parseable value are
    dropped with a warning.

    Args:
        antibodies: sequence holding at most one antibody name, which
            must be a column header of the CSV file.
        clonal: must be false.

    Returns:
        A ``(records, clonal, antibodies)`` tuple; ``records`` is the
        list of records that received at least one value.

    Raises:
        ValueError: ``clonal`` is true, more than one antibody is given,
            the CSV file has no header, or an antibody is not a column.
    """
    if clonal:
        raise ValueError(
            'clonal property is not available with Monogram datasets')

    if len(antibodies) > 1:
        raise ValueError(
            'only one antibody can be interrogated with Monogram datasets')

    with open(self.__fastafile) as h:
        source = Verifier(SeqIO.parse(h, 'fasta'), DNAAlphabet)
        try:
            seqrecords = list(source)
        except VerifyError:
            source.set_alphabet(AminoAlphabet)
            seqrecords = list(source)

    underdash = re_compile(r'[_-](\d+)$')
    for r in seqrecords:
        r.id = underdash.sub(r'_\1', r.id)

    ic50s = dict((r.id, []) for r in seqrecords)

    with open(self.__csvfile) as fh:
        sample = fh.read(MonogramData.__sample_len)
        sniffer = csv_sniffer()
        dialect = sniffer.sniff(sample)
        if not sniffer.has_header(sample):
            raise ValueError(MonogramData.__no_header_msg)
        fh.seek(0)
        reader = csv_reader(fh, dialect)
        columns = None
        for row in reader:
            if columns is None:
                # First row is the header: map column name -> index.
                columns = dict((v.strip(), j) for j, v in enumerate(row))
                missing = set(antibodies) - set(columns.keys())
                if len(missing):
                    raise ValueError(
                        "antibodies ('%s') not found!" % "', '".join(missing))
                continue
            if not row:
                # csv.reader yields [] for blank lines; nothing to match.
                continue
            acc = underdash.sub(r'_\1', row[0])
            if acc not in ic50s:
                continue
            try:
                cln_ic50s = [
                    float(row[columns[ab]].strip().lstrip('<>'))
                    for ab in antibodies
                    if ab in columns and columns[ab] < len(row)
                ]
            except ValueError:
                # Unparseable cell: best-effort, leave this row out.
                continue
            ic50s[acc].extend(cln_ic50s)

    kept = []
    for r in seqrecords:
        if r.id not in ic50s or len(ic50s[r.id]) == 0:
            warn("skipping sequence '%s', VALUE not found" % r.id)
            continue
        values = {'IC50': ic50s[r.id]}
        r.description = json_dumps({
            'ab': antibodies[0],
            'values': values
        })
        r.annotations['antibody'] = values
        kept.append(r)

    return kept, clonal, antibodies