def parse_team_stats(self, homeaway='home'):
    # Parse one team's stats, relative to a single game
    team_stats = {}
    batting = self.boxscore.find('batting', team_flag=homeaway)
    pitching = self.boxscore.find('pitching', team_flag=homeaway)
    team_stats['game_id'] = self.game_id
    team_stats['team_id'] = try_int(self.boxscore.get(homeaway + '_id'))
    team_stats['at_home'] = (homeaway == 'home')
    games_back_text = self.linescore.get(homeaway + '_games_back')
    games_back_wildcard_text = self.linescore.get(
        homeaway + '_games_back_wildcard')
    # If a team is 0 games back, they'll be listed as '-'. The
    # games_back_wildcard is sometimes '-', sometimes missing in this case.
    # If they're 0 games back, set both to 0.
    if games_back_text == '-':
        team_stats['games_back'] = 0
        team_stats['games_back_wildcard'] = 0
    elif games_back_wildcard_text == '-':
        team_stats['games_back_wildcard'] = 0
        team_stats['games_back'] = try_float(games_back_text)
    else:
        team_stats['games_back'] = try_float(games_back_text)
        team_stats['games_back_wildcard'] = try_float(
            games_back_wildcard_text)
    wins = try_int(self.boxscore.get(homeaway + '_wins', 0))
    losses = try_int(self.boxscore.get(homeaway + '_loss', 0))
    team_stats['wins'] = wins
    team_stats['losses'] = losses
    team_stats['winrate'] = \
        0 if (wins + losses) == 0 else wins / (wins + losses)
    team_stats['avg'] = try_float(batting.get('avg'))
    team_stats['at_bats'] = try_int(batting.get('ab'))
    team_stats['runs'] = try_int(batting.get('r'))
    team_stats['hits'] = try_int(batting.get('h'))
    team_stats['doubles'] = try_int(batting.get('d'))
    team_stats['triples'] = try_int(batting.get('t'))
    team_stats['home_runs'] = try_int(batting.get('hr'))
    team_stats['rbis'] = try_int(batting.get('rbi'))
    team_stats['walks'] = try_int(batting.get('bb'))
    team_stats['putouts'] = try_int(batting.get('po'))
    team_stats['da'] = try_int(batting.get('da'))
    team_stats['strikeouts'] = try_int(batting.get('so'))
    team_stats['left_on_base'] = try_int(batting.get('lob'))
    team_stats['era'] = try_float(pitching.get('era'))
    # Drop None values and add a TeamStats object to the to_load list
    team_stats = dict(
        (k, v) for k, v in team_stats.items() if v is not None)
    self.to_load.append(TeamStats(**team_stats))
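
# The parsers above lean on try_int/try_float for every attribute pull.
# A minimal sketch of what such helpers typically look like, assuming only
# the contract implied by the call sites (return the parsed value, or None /
# a caller-supplied default when the input is missing or unparseable):

def try_int(value, default=None):
    """Parse `value` as int; return `default` on None or bad input."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def try_float(value, default=None):
    """Parse `value` as float; return `default` on None or bad input."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default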
def parse_batters(self):
    # Parses batter statistics and adds all batters to the to_load list
    for batter in self.boxscore.find_all('batter'):
        b = {}
        b['game_id'] = self.game_id
        homeaway = batter.parent.get('team_flag')
        b['team_id'] = try_int(self.boxscore.get(homeaway + '_id'))
        b['batter_id'] = try_int(batter.get('id'))
        b['name'] = batter.get('name')
        b['full_name'] = batter.get('name_display_first_last')
        b['avg'] = try_float(batter.get('avg'))
        b['batting_order'] = try_int(batter.get('bo'))
        b['at_bats'] = try_int(batter.get('ab'))
        b['strikeouts'] = try_int(batter.get('so'))
        b['flyouts'] = try_int(batter.get('ao'))
        b['hits'] = try_int(batter.get('h'))
        b['doubles'] = try_int(batter.get('d'))
        b['triples'] = try_int(batter.get('t'))
        b['home_runs'] = try_int(batter.get('hr'))
        b['walks'] = try_int(batter.get('bb'))
        b['hit_by_pitch'] = try_int(batter.get('hbp'))
        b['sac_bunts'] = try_int(batter.get('sac'))
        b['sac_flys'] = try_int(batter.get('fs'))
        b['rbi'] = try_int(batter.get('rbi'))
        b['assists'] = try_int(batter.get('a'))
        b['runs'] = try_int(batter.get('r'))
        b['left_on_base'] = try_int(batter.get('lob'))
        b['caught_stealing'] = try_int(batter.get('cs'))
        b['stolen_bases'] = try_int(batter.get('sb'))
        b['season_walks'] = try_int(batter.get('s_bb'))
        b['season_hits'] = try_int(batter.get('s_h'))
        b['season_home_runs'] = try_int(batter.get('s_hr'))
        b['season_runs'] = try_int(batter.get('s_r'))
        b['season_rbi'] = try_int(batter.get('s_rbi'))
        b['season_strikeouts'] = try_int(batter.get('s_so'))
        b['position'] = batter.get('pos')
        b['putouts'] = try_int(batter.get('po'))
        b['errors'] = try_int(batter.get('e'))
        b['fielding'] = try_float(batter.get('fldg'))
        # Drop None values and add to the to_load list
        b = dict((k, v) for k, v in b.items() if v is not None)
        self.to_load.append(Batter(**b))
def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = []
    indptr = _make_int_array()
    values = _make_float_array()
    indptr.append(0)
    for doc in raw_documents:
        feature_counter = {}
        current_num = 0
        for feature in analyze(doc):
            # Tokens that parse as a number in (0, 200] set the weight
            # for the next non-numeric token; guard against try_float
            # returning None for non-numeric tokens
            maybe_float = try_float(feature)
            if maybe_float is not None and 0 < maybe_float <= 200:
                current_num = maybe_float
                continue
            try:
                if current_num == 0:
                    continue
                feature_idx = vocabulary[feature]
                if feature_idx not in feature_counter:
                    feature_counter[feature_idx] = current_num / 200
                current_num = 0
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue
        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.frombuffer(values, dtype=np.float32)

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=np.float32)
    X.sort_indices()
    return vocabulary, X
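
# To see what the inner loop above does to a document: numeric tokens act
# as weights for the following token, not as features themselves. A
# self-contained rerun of just that pairing logic on a toy token stream
# (try_float as sketched earlier; the /200 scaling and the (0, 200] window
# mirror the method above):

def pair_weights(tokens):
    weights = {}
    current_num = 0
    for tok in tokens:
        maybe_float = try_float(tok)
        if maybe_float is not None and 0 < maybe_float <= 200:
            current_num = maybe_float
            continue
        if current_num == 0:
            continue
        # first weight for a token wins, then the number is consumed
        weights.setdefault(tok, current_num / 200)
        current_num = 0
    return weights

# pair_weights(['2', 'oz', 'brand', 'new']) -> {'oz': 0.01}
# 'brand' and 'new' are skipped because no number directly precedes them.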
def parse_pitchers(self):
    # Parses pitcher statistics and adds all pitchers to the to_load list
    for pitcher in self.boxscore.find_all('pitcher'):
        p = {}
        p['pitcher_id'] = try_int(pitcher.get('id'))
        p['game_id'] = self.game_id
        homeaway = pitcher.parent.get('team_flag')
        p['team_id'] = try_int(self.boxscore.get(homeaway + '_id'))
        p['name'] = pitcher.get('name')
        p['full_name'] = pitcher.get('name_display_first_last')
        p['position'] = pitcher.get('pos')
        p['outs'] = try_int(pitcher.get('out'))
        p['batters_faced'] = try_int(pitcher.get('bf'))
        p['home_runs'] = try_int(pitcher.get('hr'))
        p['walks'] = try_int(pitcher.get('bb'))
        p['strikeouts'] = try_int(pitcher.get('so'))
        p['earned_runs'] = try_int(pitcher.get('er'))
        p['runs'] = try_int(pitcher.get('r'))
        p['hits'] = try_int(pitcher.get('h'))
        p['wins'] = try_int(pitcher.get('w'))
        p['losses'] = try_int(pitcher.get('l'))
        p['saves'] = try_int(pitcher.get('sv'))
        p['era'] = try_float(pitcher.get('era'))
        p['pitches_thrown'] = try_int(pitcher.get('np'))
        p['strikes'] = try_int(pitcher.get('s'))
        p['blown_saves'] = try_int(pitcher.get('bs'))
        p['holds'] = try_int(pitcher.get('hld'))
        p['season_innings_pitched'] = try_float(pitcher.get('s_ip'))
        p['season_hits'] = try_int(pitcher.get('s_h'))
        p['season_runs'] = try_int(pitcher.get('s_r'))
        p['season_earned_runs'] = try_int(pitcher.get('s_er'))
        p['season_walks'] = try_int(pitcher.get('s_bb'))
        p['season_strikeouts'] = try_int(pitcher.get('s_so'))
        p['game_score'] = try_int(pitcher.get('game_score'))
        p['blown_save'] = pitcher.get('blown_save')
        p['save'] = pitcher.get('save')
        p['loss'] = pitcher.get('loss')
        p['win'] = pitcher.get('win')
        # Drop None values and add to the to_load list
        p = dict((k, v) for k, v in p.items() if v is not None)
        self.to_load.append(Pitcher(**p))
def _parse_ppm(page):
    msg_list = [r'returned an error: (.*)$', r'(Too many residues)']
    for msg in msg_list:
        error = re.findall(msg, page)
        if error:
            with open("ppm_error.txt", "w") as fp:
                fp.write(error[0])
            raise Exception(error[0])
    pdb_url = PPM_URL + re.findall(
        r'href="\./(pdb_upload/.*out\.pdb)"', page)[0]
    delta_g = re.findall(r'([-+]?[0-9]*\.?[0-9]+) kcal/mol', page)[0]
    info_dict = {"delta_g": try_float(delta_g)}
    return pdb_url, info_dict
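
# A quick sanity check of the scraping above, on a fabricated results-page
# snippet (the real PPM server markup may differ; PPM_URL is assumed to be
# the server base URL defined elsewhere in this module):
#
#   page = ('<a href="./pdb_upload/1abc_out.pdb">download</a> '
#           'deltaG = -12.35 kcal/mol')
#   pdb_url, info = _parse_ppm(page)
#   # pdb_url -> PPM_URL + 'pdb_upload/1abc_out.pdb'
#   # info    -> {'delta_g': -12.35}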
def load_page(self, url):
    """
    Loads URL to json
    :param url:
    :return:
    """
    auth = None
    if 'github_token' in self.state and 'github_user' in self.state:
        auth = HTTPBasicAuth(self.state['github_user'],
                             self.state['github_token'])

    for attempt in range(self.attempts):
        if self.terminate:
            raise Exception('Terminating')
        try:
            res = requests.get(url, timeout=10, auth=auth)
            headers = res.headers
            if res.status_code == 404:
                logger.warning('URL not found: %s' % url)
                return None, None, None

            # Guard against a missing rate-limit header instead of
            # crashing on None + 10
            reset = utils.try_float(headers.get('X-RateLimit-Reset'))
            self.rate_limit_reset = reset + 10 if reset is not None else None
            self.rate_limit_remaining = utils.try_int(
                headers.get('X-RateLimit-Remaining'))
            if self.rate_limit_remaining is not None \
                    and self.rate_limit_remaining <= 1 \
                    and self.rate_limit_reset is not None:
                sleep_sec = self.rate_limit_reset - time.time()
                logger.info('Rate limit exceeded, sleeping till: %d, '
                            'it is %d seconds, %d minutes'
                            % (self.rate_limit_reset, sleep_sec,
                               sleep_sec / 60.0))
                self.sleep_interruptible(self.rate_limit_reset)
                raise Exception('Rate limit exceeded')

            if res.status_code // 100 != 2:
                res.raise_for_status()

            data = res.content
            if data is None:
                raise Exception('Empty response')

            js = json.loads(data, object_pairs_hook=OrderedDict)
            return js, headers, res

        except Exception as e:
            # Log and retry; give up only once all attempts are exhausted
            # (previously the handler returned immediately, so the
            # attempts loop never actually retried)
            logger.warning('Exception in loading page: %s, page: %s'
                           % (e, url))

    logger.warning('Skipping url: %s' % url)
    return None, None, None
def load_page_local(self):
    """
    Loads page stored in thread local
    :return:
    """
    auth = None
    resource = self.local_data.resource
    if resource.usr is not None:
        auth = HTTPBasicAuth(resource.usr, resource.token)

    job = self.local_data.job
    res = requests.get(job.url, timeout=10, auth=auth)
    headers = res.headers

    resource.reset_time = utils.try_float(headers.get('X-RateLimit-Reset'))
    resource.remaining = utils.try_int(
        headers.get('X-RateLimit-Remaining'))
    resource.last_used = time.time()
    resource.used_cnt += 1

    if res.status_code == 403 and resource.remaining is not None \
            and resource.remaining < 10:
        resource.fail_cnt += 1
        raise RateLimitHit

    if res.status_code == 404:
        resource.fail_cnt += 1
        logger.warning('URL not found: %s' % job.url)
        return None, None, None

    if res.status_code // 100 != 2:
        resource.fail_cnt += 1
        res.raise_for_status()

    data = res.content
    if data is None:
        resource.fail_cnt += 1
        raise Exception('Empty response')

    js = json.loads(data, object_pairs_hook=OrderedDict)
    return js, headers, res
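
# load_page_local raises RateLimitHit so a scheduler can rotate to another
# credential. A minimal sketch of that exception and one possible caller,
# assuming a pool of resource objects like the one used above (the names
# fetch_with_rotation and `resources` are hypothetical):

class RateLimitHit(Exception):
    """Raised when the current resource's GitHub API quota is exhausted."""


def fetch_with_rotation(worker, resources):
    # Try each credential in turn; the first successful fetch wins.
    # Assumes worker.local_data.job was already set by the caller.
    for resource in resources:
        worker.local_data.resource = resource
        try:
            return worker.load_page_local()
        except RateLimitHit:
            continue  # quota exhausted on this token, rotate to the next
    raise Exception('All resources rate-limited')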
def _parse_opm_info(page):
    # check if there were no matches
    no_matches = re.findall(
        r'<h2>Search Results for ".*"</h2>No matches', page)
    if no_matches:
        return None
    # check if this page only points to a representative structure
    rep = re.findall(
        r'Representative structure\(s\) of this protein: <br /> '
        r'<a href="protein\.php\?pdbid=([0-9a-zA-Z]{4})">', page)
    if rep:
        return {"representative": rep[0].upper()}
    opm_type = re.findall(r'<li><i>Type:</i> <a.*>(.*)</a>', page)
    opm_class = re.findall(r'<li><i>Class:</i> <a.*>(.*)</a>', page)
    opm_superfamily = re.findall(
        r'<li><i>Superfamily:</i> <a[^<]*>([^<]*)</a>', page)
    opm_family = re.findall(
        r'<li><i>Family:</i> <a[^<]*>([^<]*)</a>', page)
    opm_species = re.findall(
        r'<li><i>Species:</i> <i><a.*>(.*)</a></i>', page)
    opm_localization = re.findall(
        r'<li><i>Localization:</i> <a.*>(.*)</a>', page)
    related_ids = re.findall(r'"\?extrapdb=([0-9a-zA-Z]{4})"', page)
    related_ids = [x.upper() for x in related_ids]
    related_ids.sort()
    delta_g = re.findall(r'([-+]?[0-9]*\.?[0-9]+) kcal/mol', page)
    return {
        "type": opm_type[0].split(" ", 1)[1],
        "class": opm_class[0].split(" ", 1)[1],
        "superfamily": opm_superfamily[0].split(" ", 1)[1],
        "family": opm_family[0].split(" ", 1)[1],
        "species": opm_species[0].strip(),
        "localization": opm_localization[0],
        "related_ids": related_ids,
        "delta_g": try_float(get_index(delta_g, 0))
    }
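
# The return value above uses get_index to avoid an IndexError when the
# delta-G regex finds nothing. A minimal sketch, assuming the contract
# implied by the call site (element at i, or a default when out of range):

def get_index(seq, i, default=None):
    """Return seq[i], or `default` if the index is out of range."""
    try:
        return seq[i]
    except IndexError:
        return default

# get_index([], 0)       -> None, so try_float(None) stays None
# get_index(['-3.2'], 0) -> '-3.2'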
def transform(self, X):
    keys = {
        "1", "2", "3", "4", "5", "7", "8", "9", "a", "as", "at", "b",
        "bars", "beautiful", "boots", "bottles", "bowls", "box", "boxes",
        "brand", "bras", "bucks", "cans", "card", "cards", "case", "cm",
        "comes", "compartments", "controllers", "cream", "credit", "crop",
        "dd", "dollar", "dollars", "dolls", "dress", "dvds", "each",
        "edition", "euc", "fashion", "feet", "fits", "fl", "ft", "g",
        "games", "gb", "gms", "gold", "gram", "grams", "hr", "hrs", "in",
        "inch", "inches", "k", "karat", "layers", "up", "meter", "mil",
        "mini", "mint", "ml", "mm", "month", "mugs", "no", "not", "nwt",
        "off", "onesies", "opi", "ounce", "ounces", "outfits", "oz",
        "packages", "packets", "packs", "pair", "panels", "pants",
        "patches", "pc", "pics", "piece", "pieces", "pokémon", "pokemon",
        "pounds", "price", "protection", "random", "retro", "ring",
        "rings", "rolls", "samples", "sandals", "series", "sets",
        "sheets", "shirts", "shoe", "shoes", "shows", "slots", "small",
        "so", "some", "stamped", "sterling", "stickers", "still",
        "stretch", "strips", "summer", "t", "tags", "tiny", "tone",
        "tubes", "victoria", "vinyl", "w", "waist", "waistband",
        "waterproof", "watt", "white", "wireless", "x10", "x13", "x15",
        "x3", "x4", "x5", "x6", "x7", "x8", "x9", "yrs", "½",
        "lipsticks", "bar", "apple", "access", "wax", "monster", "spell",
        "spinners", "lunch", "ac", "jamberry", "medal", "gerard"
    }
    # A number, an optional space, then a word. Flags belong in
    # re.compile: findall's second positional argument is a start
    # offset, so passing re.IGNORECASE there silently skipped the
    # first characters of every string.
    regex = re.compile(r"(\d+)[ ]?(\w+)", re.IGNORECASE)
    specifics = []
    for x in X:
        spec = {}
        for val, key in regex.findall(str(x)):
            if key in keys:
                val = try_float(val)
                if val > 3000:
                    continue
                spec[key] = val
                spec['{}_{}'.format(key, val)] = 1
        specifics.append(spec)
    return specifics
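
# Example of what transform produces, run over two listing titles (the
# outputs below just follow the code above: "14k" tokenizes as the pair
# ('14', 'k') and "3 shirts" as ('3', 'shirts')):
#
#   transform(["14k gold ring", "Lot of 3 shirts EUC"])
#   -> [{'k': 14.0, 'k_14.0': 1},
#       {'shirts': 3.0, 'shirts_3.0': 1}]
#
# Words without a leading number ("gold", "ring") never match the regex,
# so they contribute nothing even though they appear in `keys`.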
def parse_pitches(self):
    # Parse every pitch in all innings, adding them to the to_load list.
    pitch_counter = count()
    for pitch in self.innings.find_all('pitch'):
        # Some years are missing pitch_ids. Since we're using it as a key,
        # assign here and increment the counter
        p = {}
        p['game_id'] = self.game_id
        p['pitch_id'] = int(pitch.get('id', next(pitch_counter)))
        p['at_bat_number'] = try_int(pitch.parent.get('num'))
        p['description'] = pitch.get('des')
        p['type'] = pitch.get('type')
        try:
            t = dateutil.parser.parse(pitch.get('tfs_zulu', ''))
            p['timestamp'] = t.astimezone(timezone('America/New_York'))
        except ValueError:
            logging.warning(
                'Could not parse timestamp: Game {}; pitch {}'.format(
                    self.game_id, p['pitch_id']))
        p['x'] = try_float(pitch.get('x'))
        p['y'] = try_float(pitch.get('y'))
        p['event_num'] = try_int(pitch.get('event_num'))
        p['sv_id'] = pitch.get('sv_id')
        p['play_guid'] = pitch.get('play_guid')
        p['start_speed'] = try_float(pitch.get('start_speed'))
        p['end_speed'] = try_float(pitch.get('end_speed'))
        p['sz_top'] = try_float(pitch.get('sz_top'))
        p['sz_bottom'] = try_float(pitch.get('sz_bot'))
        p['pfx_x'] = try_float(pitch.get('pfx_x'))
        p['pfx_z'] = try_float(pitch.get('pfx_z'))
        p['x0'] = try_float(pitch.get('x0'))
        p['y0'] = try_float(pitch.get('y0'))
        p['z0'] = try_float(pitch.get('z0'))
        p['vx0'] = try_float(pitch.get('vx0'))
        p['vy0'] = try_float(pitch.get('vy0'))
        p['vz0'] = try_float(pitch.get('vz0'))
        p['ax'] = try_float(pitch.get('ax'))
        p['ay'] = try_float(pitch.get('ay'))
        p['az'] = try_float(pitch.get('az'))
        p['break_y'] = try_float(pitch.get('break_y'))
        p['break_angle'] = try_float(pitch.get('break_angle'))
        p['break_length'] = try_float(pitch.get('break_length'))
        p['pitch_type'] = pitch.get('pitch_type')
        p['type_confidence'] = try_float(pitch.get('type_confidence'))
        p['zone'] = try_int(pitch.get('zone'))
        p['nasty'] = try_int(pitch.get('nasty'))
        p['spin_dir'] = try_float(pitch.get('spin_dir'))
        p['spin_rate'] = try_float(pitch.get('spin_rate'))
        # Drop None items and add Pitch to to_load
        p = dict((k, v) for k, v in p.items() if v is not None)
        logging.info(p)
        self.to_load.append(Pitch(**p))
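
# The tfs_zulu handling above: Gameday timestamps are UTC ISO-8601 strings,
# and the conversion yields local game time. A standalone check of just
# that step, using dateutil and pytz as this module does:

import dateutil.parser
from pytz import timezone

t = dateutil.parser.parse('2016-07-04T23:08:15Z')   # tz-aware, UTC
local = t.astimezone(timezone('America/New_York'))
# local -> 2016-07-04 19:08:15-04:00 (EDT)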
def compileInstanceResults(self, sInst):
    for mLog, lN in self.lResLogs:
        ## Select method and its name list
        lNames = self.getMethodName(lN)
        aDetThis = self.aDetThisInst[lNames]
        ## Result table line section
        if sInst in mLog:
            self.nReported += 1
            aResultThisInst = OrderedDict({"n_Reported": 1})
            aResultThisInst["n_CheckFailed"] = 0
            mRes = mLog[sInst]  ## The method's entry for this instance
            aDetThis["chk"] = "ok"
            if "SOLUTION_CHECKS_FAILED" in mRes and \
                    0 < mRes["SOLUTION_CHECKS_FAILED"]:
                aResultThisInst["n_CheckFailed"] = 1
                aDetThis["chk"] = "BAD"
                utils.addMapValues(self.mCmpVecVals[lNames],
                                   aResultThisInst)
                print("WARNING: SOLUTION CHECK(S) FAILED for the instance ",
                      sInst, ", method '", lNames, "'.",
                      sep='', file=self.ioBadChecks)
                continue  ## TODO. Param?
            aResultThisInst["n_ErrorsBackend"] = 0
            aResultThisInst["n_ErrorsLogical"] = 0
            aDetThis["errH"] = 0
            aDetThis["errL"] = 0
            mSlv = mRes["__SOLVE__"]
            dObj_MZN = utils.try_float(mSlv.get("ObjVal_MZN"))
            aDetThis["objMZN"] = dObj_MZN
            dObj_SLV = utils.try_float(mSlv.get("ObjVal_Solver"))
            aDetThis["objSLV"] = dObj_SLV
            dBnd_SLV = utils.try_float(mSlv.get("DualBnd_Solver"))
            aDetThis["bnd"] = dBnd_SLV
            dTime_All = utils.try_float(mSlv.get("TimeReal_All"))
            aDetThis["tAll"] = dTime_All
            dTime_Flt = utils.try_float(mSlv.get("Time_Flt"))
            aResultThisInst["t_Flatten"] = \
                dTime_Flt if dTime_Flt is not None else dTime_All  ##??
            aDetThis["tFlt"] = dTime_Flt
            dTime_Last = utils.try_float(mSlv.get("TimeReal_LastStatus"))
            aDetThis["tBest"] = dTime_Last
            ## Compare obj vals
            dObj, bObj_MZN = (dObj_MZN, True) if \
                dObj_MZN is not None and abs(dObj_MZN) < 1e45 \
                else (mSlv.get("ObjVal_MZN"), False)
            ## Assuming the solver value is better if different.
            ## WHY? Well, it's happened both ways
            dObj, bObj_SLV = (dObj_SLV, True) if \
                dObj_SLV is not None and abs(dObj_SLV) < 1e45 \
                else (dObj, False)
            if bObj_MZN and bObj_SLV:
                ## Relative tolerance: flag if the two values differ by
                ## more than 1e-6 of the larger magnitude
                if abs(dObj_MZN - dObj_SLV) > 1e-6 * max(
                        abs(dObj_MZN), abs(dObj_SLV)):
                    aResultThisInst["n_ErrorsLogical"] += 1
                    aDetThis["errL"] += 1
                    print(" WARNING: DIFFERENT MZN / SOLVER OBJ VALUES"
                          " for the instance ", sInst,
                          ", method '", lNames, "' : ",
                          dObj_MZN, " / ", dObj_SLV,
                          sep='', file=self.ioContrObjValMZN)
            ## Retrieve solution status
            if "Sol_Status" in mSlv:
                n_SolStatus = mSlv["Sol_Status"][0]
            else:
                n_SolStatus = 0
            ## Retrieve dual bound
            dBnd = None
            if dBnd_SLV is not None and abs(dBnd_SLV) < 1e45:
                dBnd = dBnd_SLV
                ## Even infeasible instances can have a dual bound?
                self.lDualBnd.append((dBnd_SLV, lNames))
            ## Trying to deduce opt sense if not given:
            if 1 == len(self.sSenses):
                nSense = next(iter(self.sSenses.keys()))
            else:
                nSense = -2  ## ??
            aDetThis["sns"] = self.mapProblemSense[nSense]
            ## TODO: assumed to be an optimization problem by default;
            ## should the bounds be checked first?
            self.bOptProblem = True if 0 != nSense else False
            ## Handle optimality / SAT completed
            if 2 == n_SolStatus:
                if not self.bOptProblem:
                    self.lSatAll.append(lNames)
                    aResultThisInst["n_SATALL"] = 1
                    aDetThis["stt"] = self.mapStatShort[4]
                else:
                    self.lOpt.append(lNames)  ## Append the optimal method list
                    aResultThisInst["n_OPT"] = 1
                    aDetThis["stt"] = self.mapStatShort[2]
                    if dObj is None or abs(dObj) >= 1e45:
                        aResultThisInst["n_ErrorsLogical"] += 1
                        aDetThis["errL"] += 1
                        print(" WARNING: OPTIMAL STATUS BUT BAD OBJ VALUE,"
                              " instance ", sInst, ", method '", lNames,
                              "': '", ("" if dObj is None else str(dObj)),
                              "', result record: ",  # mRes,
                              ",, dObj_MZN: ", dObj_MZN,
                              sep='', file=self.ioBadObjValueStatusOpt)
                    else:
                        self.mOptVal[dObj] = lNames  ## Could have used OrderedDict
                        ## To have both a map and the order
                        self.lOptVal.append((dObj, lNames))
                        self.lPrimBnd.append((dObj, lNames))
            ## Handle feasibility / SAT
            elif 1 == n_SolStatus:
                if not self.bOptProblem:
                    self.lSat.append(lNames)
                    aResultThisInst["n_SAT"] = 1
                    aDetThis["stt"] = self.mapStatShort[3]
                else:
                    self.lFeas.append(lNames)  ## Append the feasible method list
                    aResultThisInst["n_FEAS"] = 1
                    aDetThis["stt"] = self.mapStatShort[1]
                    if dObj is None or abs(dObj) >= 1e45:
                        aResultThisInst["n_ErrorsLogical"] += 1
                        aDetThis["errL"] += 1
                        print(" WARNING: feasible status but bad obj value,"
                              " instance ", sInst, ", method '", lNames,
                              "' :'", ("" if dObj is None else str(dObj)),
                              "', result record: ",  # mRes,
                              sep='', file=self.ioBadObjValueStatusFeas)
                    else:
                        self.lPrimBnd.append((dObj, lNames))
            ## Handle infeasibility
            elif -3 <= n_SolStatus <= -1:
                self.lInfeas.append(lNames)
                aResultThisInst["n_INFEAS"] = 1
                aDetThis["stt"] = self.mapStatShort[n_SolStatus]
                self.mInfeas.setdefault(sInst, []).append(lNames)
            ## Handle ERROR
            elif -4 == n_SolStatus:
                aResultThisInst["n_ErrorsBackend"] = 1
                aDetThis["errH"] += 1
                aDetThis["stt"] = self.mapStatShort[n_SolStatus]  ## Should not happen TODO
                self.mError.setdefault(sInst, []).append(lNames)
                print("ERROR REPORTED for the instance ", sInst,
                      ", method '", lNames, "', result record: ",  ## mRes,
                      sep='', file=self.ioErrors)
            else:
                aResultThisInst["n_UNKNOWN"] = 1
                aDetThis["stt"] = self.mapStatShort[0]
            ## Handle NOFZN
            if dTime_Flt is None:
                aResultThisInst["n_NOFZN"] = 1
                self.mNoFZN.setdefault(sInst, []).append(lNames)
            ## Handle FAIL???
            # LAST:
            utils.addMapValues(self.mCmpVecVals[lNames], aResultThisInst)
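
## compileInstanceResults accumulates per-method counters through
## utils.addMapValues. A minimal sketch, assuming only the contract
## implied by the call sites (add src's values into dst key-wise,
## creating missing keys):

def addMapValues(dst, src):
    """Key-wise dst[k] += src[k]; missing keys start at 0."""
    for k, v in src.items():
        dst[k] = dst.get(k, 0) + v

## addMapValues(totals, {"n_Reported": 1, "n_OPT": 1}) bumps the method's
## running totals by this instance's result vector.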
def _post_exec(self):
    if self.parallel and self.check_only:
        dct = collections.defaultdict(list)
        status_dct = {}
        tag_dct = {}
        db_records = []
        for t in self.tool_list:
            check_info = t.check(full=True)
            cur_info = CURATED_INFO.get(t.pdb_id, {})
            tag = re.split(r"/|\.", check_info)[0]
            if tag == "mppd":
                tag = "provi"
            if tag != "opm":
                # check if opm found two mplanes
                try:
                    if len(t.opm.get_planes()) != 2:
                        tag = "mplane"
                except IOError:
                    # print tag, check_info, t.id
                    pass
            else:
                if os.path.isfile(t.opm.outpath("ppm_error.txt")):
                    tag = "ppm"
                    # print open( t.opm.outpath( "ppm_error.txt" ) ).read()
            if tag != "pdb_info" and t.pdb_info.check():
                info = t.pdb_info.get_info()
                if "CA ATOMS ONLY" in info.get("model_type", {}):
                    tag = "calpha_only"
                if cur_info.get("backbone_only"):
                    tag = "backbone_only"
                if "THEORETICAL MODEL" == info.get("experiment", ""):
                    tag = "theoretical_model"
                res = info.get("resolution")
                if res and res >= 4.0 and res != "NOT":
                    tag = "resolution"
                if info.get("obsolete"):
                    tag = "obsolete"
                try:
                    opm_info = t.opm_info.get_info()
                    if opm_info and opm_info.get("type") != "Transmembrane":
                        tag = "no_transmembrane"
                except:
                    pass
            if cur_info.get("no_pdb_entry"):
                tag = "no_pdb_entry"
            if cur_info.get("no_transmembrane"):
                tag = "no_transmembrane"
            tag_dct[t.pdb_id] = tag
            dct[tag].append(t)

        # representative id search
        test_rep_list = flatten([
            zip([x] * len(dct[x]), dct[x])
            for x in ["opm", "ppm", "msms0", "msms_vdw_fin", "dowser"]
        ])
        print test_rep_list
        for tag, t in test_rep_list:
            try:
                opm_info = t.opm_info.get_info()
                mpstruc_info = t.mpstruc_info.get_info()
            except:
                continue
            rid_list = []
            if opm_info:
                rep_id = opm_info.get("representative")
                if rep_id:
                    rid_list.append(rep_id.upper())
                rid_list += opm_info.get("related_ids", [])
            if mpstruc_info:
                master_id = mpstruc_info.get("master")
                if master_id:
                    rid_list.append(master_id.upper())
                rid_list += mpstruc_info.get("related", [])
            rep = None
            for rid in rid_list:
                for x in dct["Ok"]:
                    if x.pdb_id == rid:
                        rep = x
                        break
                else:
                    continue
                break
            if rep:
                dct["representative"].append(t)
            else:
                dct["no_representative"].append(t)

        ignore_tags = [
            "no_pdb_entry",
            "mplane",
            "representative",
            "no_representative",
            "no_transmembrane",
            "theoretical_model",
            "obsolete",
        ]
        # status types
        #   included: all good
        #   linked: only a representative available
        #   pending: to be included
        #   obsolete: superseded
        #   defect: low resolution; missing atoms
        #   model: theoretical model
        for pdb_id, tag in tag_dct.iteritems():
            if tag == "Ok":
                status = "included"
            elif tag == "obsolete":
                status = "obsolete"
            elif tag == "theoretical_model":
                status = "model"
            elif tag in ["calpha_only", "backbone_only"]:
                status = "backbone_only"
            elif tag == "resolution":
                status = "low_resolution"
            elif tag in ["opm", "ppm", "msms0", "msms_vdw_fin"]:
                # dct["representative"] holds tool objects, so compare ids
                if pdb_id in [x.pdb_id for x in dct["representative"]]:
                    status = "linked"
                else:
                    status = "pending"
            elif tag in ignore_tags:
                continue
            else:
                status = "unknown"
                print tag
            status_dct[pdb_id] = status

        for tag, t_list in dct.iteritems():
            if tag != "Ok":
                print tag, " ".join(map(lambda x: x.id, t_list))
        for tag, t_list in dct.iteritems():
            print tag, len(t_list)

        if self.database:
            for tag, t_list in dct.iteritems():
                if tag in ignore_tags:
                    continue
                for t in t_list:
                    t_info = t.get_info()
                    if tag == "Ok":
                        for s in t.get_stats():
                            if s['source'] == 'fin' and s['segment'] == 'TM':
                                t_stats = s
                                break
                        else:
                            raise Exception('no stats found %s' % t.pdb_id)
                    else:
                        t_stats = {}
                    cur_info = CURATED_INFO.get(t.pdb_id, {})
                    db_records.append(MppdDbRecord(
                        t_info['pdb_id'],
                        t_info['pdb_title'],
                        ",".join(t_info['pdb_keywords']),
                        t_info['pdb_experiment'],
                        t_info['pdb_resolution'],
                        t_info['opm_superfamily'],
                        t_info['opm_family'],
                        t_info['opm_representative'],
                        t_info['opm_species'],
                        ",".join(t_info['opm_related']),
                        t_info['mpstruc_group'],
                        t_info['mpstruc_subgroup'],
                        t_info['mpstruc_name'],
                        t_info['mpstruc_species'],
                        t_info['mpstruc_master'],
                        ",".join(t_info['mpstruc_related']),
                        cur_info.get('representative', ""),
                        ",".join(cur_info.get('related', [])),
                        status_dct[t.pdb_id],
                        t_stats.get('packdens_protein_buried'),
                        t_stats.get('water_count'),
                        t_stats.get('residue_count'),
                        t_stats.get('msms'),
                    ))
            db = SqliteBackend("mppd.db", MppdDbRecord)
            db.write(db_records)

        if self.extract:
            fdir = self.extract
            if not os.path.exists(fdir):
                os.makedirs(fdir)
            shutil.copyfile(self.outpath("mppd.db"),
                            os.path.join(fdir, "mppd.db"))
            for t in dct.get('Ok', []):
                flist = [
                    t.original_dry_pdb,
                    t.final_pdb,
                    t.opm.mplane_file,
                    t.hbexplore_fin.hbx_file + ".bonds",
                    t.voronoia_fin.vol_file + ".atmprop",
                    t.outpath("mppd.provi")
                ]
                flist += t.msms_vdw_fin.component_files()
                for fsrc in flist:
                    fdst = os.path.join(fdir, t.id, t.relpath(fsrc))
                    if not os.path.exists(os.path.dirname(fdst)):
                        os.makedirs(os.path.dirname(fdst))
                    shutil.copyfile(fsrc, fdst)

        if self.figures:
            alpha = 0.3
            size = 7.0
            nres = collections.defaultdict(list)
            nwater = collections.defaultdict(list)
            resolution = collections.defaultdict(list)
            ncav = collections.defaultdict(list)
            sesvol = collections.defaultdict(list)
            packdens = collections.defaultdict(list)
            packdens_buried = collections.defaultdict(list)
            for t in dct.get('Ok', []):
                stats = t.get_stats()
                info = t.get_info()
                for s in stats:
                    if s["segment"] != "TM":
                        continue
                    key = (s["source"], s["segment"])
                    nres[key].append(s["residue_count"])
                    nwater[key].append(s["water_count"])
                    resolution[key].append(
                        try_float(info["pdb_resolution"], 0.0))
                    ncav[key].append(s["msms"])
                    sesvol[key].append(s["msms_ses"])
                    packdens[key].append(s["packdens_protein"])
                    packdens_buried[key].append(
                        s["packdens_protein_buried"])
            print nres.keys()
            for key in nres.keys():
                print key
                x = np.array(nwater[key])
                y = np.array(nres[key])
                x_y = x / y
                r = np.array(resolution[key])
                cav = np.array(ncav[key])
                cav_y = cav / y
                vol = np.array(sesvol[key]) * -1
                vol_y = vol / y
                pd = np.array(packdens[key])
                pd_buried = np.array(packdens_buried[key])

                from mpl_toolkits.axes_grid.anchored_artists import (
                    AnchoredText)

                def hist(axis, x, label, loc=1, nzero=False):
                    if nzero:
                        x = x[x != 0]
                    if len(x) == 0:
                        x = np.array([0])
                    axis.hist(x, normed=True, bins=25)
                    axis.set_xlabel(label)
                    summary = ("Var: %.4f\nStd: %.4f\nMean: %.4f\n"
                               "Median: %.4f\nMin: %.4f\nMax: %.4f\n") % (
                        np.var(x), x.std(), x.mean(),
                        np.median(x), x.min(), x.max())
                    at = AnchoredText(summary, loc=loc or 1,
                                      prop={"size": 10}, frameon=True,
                                      pad=0.5, borderpad=1.0)
                    axis.add_artist(at)

                def scatter(axis, x, y, xlabel, ylabel, loc=1, nzero=True):
                    if nzero:
                        xnzero = x != 0
                        ynzero = y != 0
                        x = x[xnzero & ynzero]
                        y = y[xnzero & ynzero]
                    try:
                        r = pearsonr(x, y)
                    except Exception:
                        r = (np.nan, np.nan)
                    axis.scatter(x, y, alpha=alpha, s=size)
                    axis.set_xlabel(xlabel)
                    axis.set_ylabel(ylabel)
                    axis.set_ylim((0, axis.get_ylim()[1]))
                    summary = "r: %.4f\np: %.4f\n" % r
                    at = AnchoredText(summary, loc=loc or 1,
                                      prop={"size": 10}, frameon=True,
                                      pad=0.5, borderpad=1.0)
                    axis.add_artist(at)

                fig, (ax) = plt.subplots(3, 4, figsize=[20, 12])
                scatter(ax[0, 0], x, y, "#h2o", "#res")
                hist(ax[0, 1], x_y, "#h2o / #res")
                scatter(ax[1, 0], r, x_y, "resolution [A]", "#h2o / #res")
                hist(ax[1, 1], cav_y, "#cav / #res")
                hist(ax[2, 0], vol_y, "ses_vol [A^3] / #res")
                hist(ax[0, 2], pd, "packing density")
                scatter(ax[1, 2], r, pd,
                        "resolution [A]", "packing density")
                hist(ax[0, 3], pd_buried, "packing density buried")
                scatter(ax[1, 3], r, pd_buried,
                        "resolution [A]", "packing density buried")
                fig.savefig("_".join(key) + ".png")

            def bar(ax, ydata, labels):
                y = [np.array(yd).mean() for yd in ydata]
                x = np.arange(len(y))
                e = [np.array(yd).std() for yd in ydata]
                ax.bar(x, y, align='center', yerr=e, ecolor='black',
                       facecolor='#777777')
                ax.set_xticks(x)
                ax.set_xticklabels(labels)
                xlim = (x.min() - 1, x.max() + 1)
                ax.set_xlim(xlim)

            # ...
            tm_keys = [("org", "TM"), ("fin", "TM"), ("dow", "TM")]
            if all(map(lambda k: k in nres, tm_keys)):
                ydata = []
                labels = []
                for key in tm_keys:
                    ydata.append(
                        (np.array(nwater[key]) / np.array(nres[key])) * 100)
                    labels.append(key[0])
                fig, (ax) = plt.subplots(1, 1, figsize=[6, 4.5])
                bar(ax, ydata, labels)
                fig.savefig("h2o_per_100_res.png")

            # ...
            nwater_cutoff = collections.defaultdict(list)
            cutoff_list = np.arange(1.4, 3.9, step=0.1)
            for t in dct.get('Ok', []):
                stats2 = t.get_stats2()
                for s2 in stats2:
                    key = s2["segment"]
                    if not len(nwater_cutoff[key]):
                        for cutoff in cutoff_list:
                            nwater_cutoff[key].append([])
                    water_count = s2["water_count"]
                    # residue_count = s2["residue_count"]
                    count_list = s2["exp_water_cutoff_count"]
                    for i, cutoff in enumerate(cutoff_list):
                        frac = try_div(count_list[i], water_count)
                        nwater_cutoff[key][i].append(frac)
            for key in nwater_cutoff.keys():
                fig, (ax) = plt.subplots(1, 1, figsize=[8, 4])
                bar(ax, nwater_cutoff[key], map(str, cutoff_list))
                fig.savefig(str(key) + ".png")
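
# The cutoff histogram above divides by per-structure water counts that can
# legitimately be zero; try_div presumably guards that. A minimal sketch,
# assuming only the contract implied by the call site (the quotient, or a
# default when the denominator is zero or missing):

def try_div(a, b, default=0.0):
    """Return a / b as float; `default` on a zero or missing denominator."""
    try:
        return float(a) / float(b)
    except (TypeError, ValueError, ZeroDivisionError):
        return default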