def _parse_mode(mode):
    options = ['r', 'w',
               'tb', (False, True),
               'sdmp', (SOCK_STREAM, SOCK_DGRAM, SOCK_RDM, SOCK_SEQPACKET),
               '@']
    modeset = set(mode)
    if len(modeset) != len(mode):
        raise ValueError("invalid mode: %r" % mode)
    retvals = ['']
    while options:
        optstr = options.pop(0)
        if len(optstr) > 1:
            opt = concat(modeset.intersection(optstr)) or optstr[0]
            if len(opt) > 1:
                raise ValueError("mode can only have one of %r" % optstr)
            optvals = options.pop(0)
            val = optvals[optstr.index(opt)]
        else:
            val = optstr in modeset
            opt = optstr if val else ''
        modeset.discard(opt)
        retvals[0] += opt
        retvals.append(val)
    if modeset:
        raise ValueError("invalid mode options: %r" % concat(modeset))
    return retvals
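# Usage sketch for _parse_mode (hypothetical examples; assumes concat(xs) behaves
# like ''.join(xs) and the SOCK_* constants come from the socket module). The
# returned list starts with the resolved mode string (defaults filled in),
# followed by one value per option group: read, write, binary, socket type,
# abstract-namespace flag.
#
#   _parse_mode('rb')   # -> ['rbs', True, False, True, SOCK_STREAM, False]
#   _parse_mode('wtd')  # -> ['wtd', False, True, False, SOCK_DGRAM, False]
#   _parse_mode('rr')   # raises ValueError("invalid mode: 'rr'") -- repeated flag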
def get_data_cluster(train_data, test_data, submit_file, sorted_by):
    '''
    Merge train_data and test_data (labeling test_data from submit_file if it
    exists), sort by sorted_by, and write the result to Data_Cluster.csv.
    :param train_data:
    :param test_data:
    :param submit_file:
    :param sorted_by:
    :return:
    '''
    train_data["text"] = train_data.apply(
        lambda x: concat(x["title"], x["text"]), axis=1)
    test_data["text"] = test_data.apply(
        lambda x: concat(x["title"], x["text"]), axis=1)
    train_data["category"] = "Train"
    test_data["category"] = "Test"
    if os.path.exists(os.path.join(data_path, "submit", submit_file)):
        submit = pd.read_csv(os.path.join(data_path, "submit", submit_file),
                             encoding='utf-8')
        test_data = pd.merge(test_data, submit, on='id', how='left')
    else:
        test_data["negative"] = 0
        test_data["key_entity"] = ""
    data = pd.concat((train_data[[
        "id", "text", "category", "negative", "entity", "key_entity"
    ]], test_data[[
        "id", "text", "category", "negative", "entity", "key_entity"
    ]]), axis=0).reset_index(drop=True)
    data.sort_values(by=sorted_by, inplace=True)
    data.to_csv(os.path.join(data_path, "Data_Cluster.csv"),
                encoding='utf-8', index=False)
def visit_Output(self, node, frame):
    body = []
    for child in node.nodes:
        try:
            const = child.as_const(frame.eval_ctx)
        except nodes.Impossible:
            body.append(child)
            continue
        try:
            if frame.eval_ctx.autoescape:
                if hasattr(const, '__html__'):
                    const = const.__html__()
                else:
                    const = escape(const)
            const = str(const)
        except Exception:
            body.append(child)
            continue
        if body and isinstance(body[-1], list):
            body[-1].append(const)
        else:
            body.append([const])

    # write a format string for the body
    format = []
    arguments = []
    for item in body:
        if isinstance(item, list):
            format.append(concat(item).replace('%', '%%'))
        else:
            # item is an AST node that could not be constant-folded;
            # emit a %s placeholder and render it as an argument below
            format.append('%s')
            arguments.append(item)
    self.writeline('yield ')
    self.write(repr(concat(format)))
    if arguments:
        self.write(' % (')
        self.indent()
        for argument in arguments:
            self.newline(argument)
            close = 0
            if frame.eval_ctx.autoescape:
                self.write('escape(')
                close += 1
            self.visit(argument, frame)
            self.write(')' * close)
        self.outdent()
        if len(arguments) == 1:
            # trailing comma for tuple with len 1
            self.write(',')
        self.writeline(')')
def plot_mono_vs_di_likelihood(ll_dict=None):
    if ll_dict is None:
        ll_dict = likelihood_dict()
    # per-base normalized log-likelihoods (computed but not currently plotted)
    normed_dict = {tf: tuple(map(lambda x: x / float(len(getattr(Escherichia_coli, tf)) *
                                                     len(getattr(Escherichia_coli, tf)[0])),
                                 (mono, di)))
                   for (tf, (mono, di)) in ll_dict.items()}
    plt.scatter(*transpose(ll_dict.values()))
    for (tf, (mono, di)) in ll_dict.items():
        sites = getattr(Escherichia_coli, tf)
        text = "%s\n#:%s\nw:%s\nIC:%1.2f" % (tf, len(sites), len(sites[0]), motif_ic(sites))
        plt.annotate(text, (mono, di))
    min_val = min(concat(ll_dict.values()))
    max_val = max(concat(ll_dict.values()))
    plt.xlabel("Mono LL")
    plt.ylabel("Di LL")
    plt.plot([min_val, max_val], [min_val, max_val], linestyle="--")
def get_train_and_validate_data(train_file, validate_file, negative_ratio=None):
    df_train = pd.read_csv(train_file)
    if negative_ratio:
        positive = df_train[df_train.label == 1]
        negative = df_train[df_train.label == 0].sample(positive.shape[0] * negative_ratio)
        df_train = utils.concat([positive, negative]).sample(frac=1).reset_index(drop=True)
    df_train = utils.preprocess_none_order_data(df_train)
    X_train = df_train.drop(['label'], axis=1)
    y_train = df_train['label']
    del df_train
    gc.collect()

    df_validate = pd.read_csv(validate_file)
    Y_validate = df_validate[[UID, 'label']]
    df_validate = utils.preprocess_none_order_data(df_validate)
    X_validate = df_validate.drop(['label'], axis=1)
    del df_validate
    gc.collect()

    print 'train columns', len(X_train.columns)
    return X_train, y_train, X_validate, Y_validate
def parse_primary(self):
    token = self.token_stream.current
    lineno = token.lineno
    if token.value in ('True', 'False'):
        next(self.token_stream)
        # was `token.value in ('True', 'False')`, which is always True on this
        # branch; the constant should be True only for the literal 'True'
        return nodes.Const(token.value == 'True', lineno=lineno)
    elif token.type is tokens.INTEGER:
        next(self.token_stream)
        return nodes.Const(int(token.value), lineno=lineno)
    elif token.type is tokens.FLOAT:
        next(self.token_stream)
        return nodes.Const(float(token.value), lineno=lineno)
    elif token.type is tokens.STRING:
        buffer = [next(self.token_stream).value]
        while self.token_stream.current.type is tokens.STRING:
            buffer.append(next(self.token_stream).value)
        return nodes.Const(concat(buffer), lineno=lineno)
    elif token.type is tokens.NAME:
        next(self.token_stream)
        return nodes.Name(token.value, 'load', lineno=lineno)
    elif token.type is tokens.LPAREN:
        next(self.token_stream)
        node = self.parse_tuple(explicit_parens=True)
        self.token_stream.expect(tokens.RPAREN)
    elif token.type is tokens.LBRACKET:
        node = self.parse_list()
    elif token.type is tokens.LBRACE:
        node = self.parse_dict()
    else:
        self.fail('unexpected character %r' % token.value, lineno)
    return node
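# Usage note for parse_primary: adjacent STRING tokens are merged into a single
# constant (Python-style implicit string concatenation), so an expression like
# {{ "foo" "bar" }} parses to nodes.Const("foobar") via concat(buffer) --
# assuming concat(xs) behaves like ''.join(xs).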
def render(self, *args, **kwargs):
    vars = dict(*args, **kwargs)
    # the original wrapped the body in `try: ... except Exception: raise`,
    # which is a no-op; render errors propagate either way
    ctx = self.new_context(vars)
    return concat(self.root_render_func(ctx))
def get_entity_data(data):
    data = data[data["negative"] == 1].copy()
    data["text"] = data.apply(lambda x: concat(x['title'], x['text']), axis=1)
    id = []
    text = []
    entities = []
    label = []
    for i in range(len(data)):
        entity = data["entity"].iloc[i].split(';')
        try:
            key_entity = data["key_entity"].iloc[i].split(';')
        except Exception as error:
            key_entity = []
        for e in entity:
            if e == "":  # was `e is ""`: identity comparison on str literals is unreliable
                continue
            id.append(data["id"].iloc[i])
            text.append(data["text"].iloc[i])
            entities.append(e)
            if e in key_entity:
                label.append(1)
            else:
                label.append(0)
    entity_data = pd.DataFrame({
        "id": id,
        "text": text,
        "entity": entities,
        "label": label
    })
    return entity_data
def __init__(self, domain_name, *items, relations=None):
    relations = relations or []
    self.name = domain_name
    self.items = list(set(list(items) + concat(relations)))
    self.relations = normalize_relations(relations + [
        (BOTTOM, item) for item in self.items
    ])
def get_deckHanzi(config, session, DeckSeen):
    # Get Hanzi from the database. This function has been carefully tuned to try and get good
    # performance, so be careful before you modify it! In particular:
    #  1) Doing *everything* in one huge query didn't work so well - perhaps sqlite's execution
    #     is not so good?
    #  2) It's essential to trim the amount of text you look at by only looking at fields with
    #     names like the ones we know about and suspect will contain Hanzi

    # Find all card models which come from Mandarin models
    cardmodels = session.all("select cardModels.id, cardModels.modelId, cardModels.qformat from cardModels, models where cardModels.modelId = models.id AND models.tags LIKE %s"
                             % utils.toSqlLiteral("%" + config.modelTag + "%"))
    # Find the field names that are included in the *question field* of such cards
    cardmodelsfieldsnames = [(cmid, modelid, set([res["mappingkey"] for res in utils.parseFormatString(qformat) if isinstance(res, dict)]))
                             for cmid, modelid, qformat in cardmodels]
    # Filter out unpromising names, and turn the remainder into the IDs of field models
    eligiblefields = set(utils.concat([config.candidateFieldNamesByKey[key]
                                       for key in ['expression', 'mw', 'trad', 'simp']]))
    cardmodelsfields = [(cmid, [session.scalar("select fieldModels.id from fieldModels where fieldModels.name = :name and fieldModels.modelId = :mid", name=fmname, mid=modelid)
                                for fmname in fmnames if fmname in eligiblefields])
                        for cmid, modelid, fmnames in cardmodelsfieldsnames]
    # Look up the contents of fields whose IDs we found in the previous step, optionally only
    # including those whose corresponding card has been seen at least once
    hanziss = session.column0("SELECT fields.value FROM cards, fields WHERE cards.factId = fields.factId %s AND (%s)" %
                              ((DeckSeen == 0) and "AND cards.reps > 0" or "",  # Only look for seen cards if we are in that mode
                               " OR ".join(["(cards.cardModelId = %s AND fields.fieldModelID IN %s)"
                                            % (utils.toSqlLiteral(cmid), utils.toSqlLiteral(fmids))
                                            for cmid, fmids in cardmodelsfields])))
    # Flatten everything into a set with *no intermediate structures*
    allhanzis = set()
    for hanzis in hanziss:
        allhanzis.update([c for c in hanzis if utils.isHanzi(c)])
    return allhanzis
def predict_cases(self, n_steps, ibge_id, debug=False):
    def printVertexes(step, fileOut, vertexes):
        for vertex in vertexes:
            fileOut.write(
                concat([step, vertex['id'], vertex['name'], vertex['value']], ','))

    if debug:
        fileOut = open('logs/' + datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + '.csv', 'w')
        fileOut.write(concat(['step', 'ibge', 'name', 'newCases'], ','))
    cumulative_cases_city = []
    sum = 0
    for i in range(n_steps):
        sum += self.graph.vs[self.dict_ibge_index[ibge_id]]['value']
        cumulative_cases_city.append(sum)
        if debug:
            printVertexes(i, fileOut, self.graph.vs)
        self.autoUpdateCases(i + 1, ibge_id)
    if debug:
        fileOut.close()
    return cumulative_cases_city
def get_fileHanzi(file):
    try:
        f = codecs.open(file, "r", "utf8")
        return set(utils.concat([[c for c in line if utils.isHanzi(c)]
                                 for line in f.readlines()]))
    except IOError, e:
        log.exception("Error reading hanzi statistics character file " + file)
        return set()
def generate(self):
    result_pattern = ""
    if self.__option == 'LETTER':
        up, down, left, right = (True, True, True, True) if self.__framed \
            else (False, False, False, False)
        if self.__direction == 'VERTICAL':
            for i, ch in enumerate(self.__pattern):
                up = True if i == 0 and self.__framed else False
                result_pattern += PatternGenerator(ch, self.__option, self.__style, self.__size)\
                    .generate(up, down, left, right)
        elif self.__direction == 'HORIZONTAL':
            for i, ch in enumerate(self.__pattern):
                left = True if i == 0 and self.__framed else False
                result_pattern = utils.concat(
                    result_pattern,
                    PatternGenerator(ch, self.__option, self.__style, self.__size).generate(
                        up, down, left, right))
        else:
            raise NotImplementedError(
                'Direction only has vertical and horizontal options')
    else:
        raise NotImplementedError(
            'Only letter option is available, right now')
    return result_pattern
def make_correlation_structure_by_length():
    q = fdr(concat(euk_tests))
    plt.close()  # start from a fresh figure
    lens = map(len, euk_motifs)
    jss = [indices_where(lens, lambda x: 10**i <= x < 10**(i+1))
           for i in range(1, 4+1)]
    for i, js in tqdm(enumerate(jss)):
        analyze_mi_tests2(rslice(euk_tests, js), rslice(euk_motifs, js),
                          label=str("10**%s" % (i+1)), q=q)
def get_relations(self):
    return concat([
        [
            self._bind_domain(arg, relation)
            for relation in arg.get_relations()
        ]
        for arg in self.args
    ])
def analyze_mi_tests2(tests, motifs, q=None, label=None):
    if q is None:  # was unconditional, which discarded any caller-supplied q
        q = fdr(concat(tests))
    correlated_percentage = count(lambda x: x <= q, concat(tests)) / float(len(concat(tests)))
    ds = [[j - i for (i, coli), (j, colj) in choose2(list(enumerate(transpose(motif))))]
          for motif in motifs]

    def binom_ci(xs):
        """return width of error bar"""
        bs_means = sorted([mean(bs(xs)) for x in range(1000)])
        mu = mean(xs)
        return (mu - bs_means[25], bs_means[975] - mu)

    tests_by_dist = [[t <= q for t, d in zip(concat(tests), concat(ds)) if d == i]
                     for i in range(1, 20)]
    mean_vals = map(lambda xs: mean(xs) if xs else 0, tests_by_dist)
    cis = map(lambda xs: binom_ci(xs) if xs else (0, 0), tests_by_dist)
    plt.errorbar(range(1, 20), mean_vals, yerr=transpose(cis), label=label, capthick=1)
    plt.xlabel("Distance (bp)", fontsize="large")
    plt.ylabel("Proportion of Significant Correlations", fontsize="large")
    plt.legend()
def cell_likelihood(reads, ps):
    points = sorted(concat(reads))
    G = len(ps)
    if not 0 in points:
        points.append(0)
    if not G in points:
        points.append(G)
    # NOTE: the original also began computing complement intervals here
    # (`read_complements = [(stop)]`, with `stop` undefined); that fragment was
    # left unfinished and is omitted, since the likelihood below uses only `reads`.
    return product([product(1 - p for p in ps[start:stop])
                    for (start, stop) in reads])
def discrete_parallelogram_plot(filename=None):
    motifs = concat([maxent_motifs_with_ic(200, 10, ic, 10)
                     for ic in tqdm(np.linspace(0.5, 19.5, 100))])
    ics = map(motif_ic, motifs)
    mis = map(total_motif_mi, motifs)
    plt.scatter(ics, mis)
    plt.xlabel("IC (bits)")
    plt.ylabel("Pairwise MI (bits)")
    plt.title("IC vs Pairwise MI for MaxEnt Motifs")
    maybesave(filename)
def make_correlation_structure_by_cluster_figure():
    from motif_clustering import cluster_motif
    q = fdr(concat(euk_tests))
    euk_clusterses = [map(cluster_motif, tqdm(euk_motifs)) for i in range(3)]
    plt.close()  # get rid of output from cluster_motif
    mean_lens = map(lambda xs: round(mean(xs)),
                    transpose([map(len, cs) for cs in euk_clusterses]))
    jss = [indices_where(mean_lens, lambda x: x == i) for i in range(1, 5+1)]
    for i, js in tqdm(enumerate(jss)):
        analyze_mi_tests2(rslice(euk_tests, js), rslice(euk_motifs, js),
                          label=str(i+1), q=q)
def pop_estimator(obs):
    """Given a vector of observed species counts obs, estimate the effective
    number of species (2**H, where H is Shannon entropy) by bootstrap (bs)."""
    N = float(sum(obs))
    sample = concat([[i for _ in range(v)] for i, v in enumerate(obs)])

    def resample_pop():
        re_obs = Counter(bs(sample)).values()
        return 2**h([v/N for v in re_obs])

    return [resample_pop() for i in range(100)]
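# Usage sketch for pop_estimator (hypothetical numbers; assumes bs(xs) draws a
# bootstrap resample of xs with replacement and h(ps) is Shannon entropy in bits).
# obs[i] is the observed count of species i, e.g. three species seen 10, 5 and 1
# times:
#
#   estimates = pop_estimator([10, 5, 1])                 # 100 bootstrap values of 2**H
#   lo, hi = sorted(estimates)[2], sorted(estimates)[97]  # rough 95% interval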
def match(self, x):
    input_text = [
        load_text(i, self.max_sentence_len, self.input2idx, self.choice)
        for i in x
    ]
    input_text = np.asarray(input_text)
    res = self.model.predict(input_text)
    res = concat(res)
    res = self.decode(res, True)
    return res
def parse_network(fname, connectivity=1):
    """parse network and return graph object"""
    with open(fname) as f:
        raw_lines = [line.strip().split(',') for line in f.readlines()]
    lines = [(source, target, int(sgn)) for (source, sgn, target) in raw_lines]
    all_names = list(set(concat([(source, target) for source, target, sgn in lines])))
    idx_from_name = {name: i for (i, name) in enumerate(all_names)}
    name_from_idx = {i: name for (name, i) in idx_from_name.items()}
    processed_lines = [(idx_from_name[src], idx_from_name[trg], sgn)
                       for (src, trg, sgn) in lines]
    return NetStruct(processed_lines, name_from_idx, connectivity=connectivity)
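# Input sketch for parse_network (hypothetical file contents). Each line of the
# file is `source,sign,target`, e.g. an activating and a repressing edge:
#
#   crp,1,araC
#   araC,-1,araE
#
# parse_network on such a file yields a NetStruct over integer node ids, with
# name_from_idx mapping the ids back to 'crp', 'araC', 'araE'.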
def _make_sockaddr(family, addr):
    family = _ADDRFAMILY_MAP.get(family, '<unknown>')
    if isinstance(addr, tuple):
        acc = []
        for val in addr:
            val = str(val) if val is not None else ''
            if ':' in val:
                val = '[' + val + ']'
            acc.append(val)
        addr = concat(acc, ':')
    return ':{}:{}'.format(family, addr)
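# Usage sketch for _make_sockaddr (hypothetical; assumes _ADDRFAMILY_MAP maps
# socket.AF_* constants to short names like 'inet'/'inet6', and concat(xs, sep)
# behaves like sep.join(xs)). IPv6 literals are bracketed so the colons inside
# the address don't collide with the field separator:
#
#   _make_sockaddr(socket.AF_INET, ('127.0.0.1', 8080))  # -> ':inet:127.0.0.1:8080'
#   _make_sockaddr(socket.AF_INET6, ('::1', 443))        # -> ':inet6:[::1]:443'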
def analyze_column_frequencies():
    """Do columnwise frequencies reveal stable patterns that could be explained
    by amino acid preferences?"""
    def dna_freqs(xs):
        return [xs.count(b)/float(len(xs)) for b in "ACGT"]
    all_freqs = concat([map(dna_freqs, transpose(getattr(tfdf_obj, tf)))
                        for tf in tfdf_obj.tfs])
    for k, (i, j) in enumerate(choose2(range(4))):
        plt.subplot(4, 4, k + 1)  # subplot indices are 1-based; was `k`
        cols = transpose(all_freqs)
        plt.scatter(cols[i], cols[j])
def __repr__(self):
    rv = [self.__class__.__name__, '(']
    for idx, k in enumerate(self.fields):
        v = getattr(self, k)
        if isinstance(v, str):
            v = "'{}'".format(v.replace('\n', '\\n'))
        rv.append('{0}={1}'.format(k, v))
        if idx != len(self.fields) - 1:
            rv.append(', ')
    rv.append(')')
    return concat(rv)
def __init__(self, adjacencies, names=None, connectivity=1):
    """adjacencies describes a signed directed graph, i.e. edge i -> j encoded
    as (i,j,1), i -| j encoded as (i,j,-1).
    names is a dictionary of the form {i: i_name}.
    """
    self.V = max(concat([[i, j] for (i, j, sgn) in adjacencies])) + 1
    self.adjs = adjacencies
    self.names = names
    self.graph = nx.DiGraph([(src, trg) for (src, trg, sgn) in self.adjs])
    # self.mat = scp.sparse.dok_matrix((self.V, self.V))
    self.mat = np.zeros((self.V, self.V))
    for (src, trg, sgn) in self.adjs:
        self.mat[src, trg] = sgn
def main():
    xoffset = 0
    yscale = 'log'
    data_path = 'datasets/data'
    output_path = 'results'
    spls = ['BSN']
    labels = ['Reana', 'ReanaE']

    if len(sys.argv) > 2:
        data_path = sys.argv[1]
        output_path = sys.argv[2]
        spls = sys.argv[3:]

    print(f'data_path: {data_path}')
    print(f'output_path: {output_path}')
    print(f'spls: {spls}')

    try:
        mkdir(output_path)
    except OSError as error:
        pass

    dirs = ['graphs', 'boxplots', 'pairwise-graphs', 'tables',
            'tables/effect-size', 'tables/summary']
    for path in dirs:
        try:
            mkdir(f'{output_path}/{path}')
        except OSError as error:
            pass

    # convert data to csv
    rt_data = concat([[f'running_time/totalTime{spl}{label}' for spl in spls]
                      for label in labels])
    mem_data = concat([[f'memory_usage/totalMemory{spl}{label}' for spl in spls]
                       for label in labels])
    for filename in rt_data:
        out_to_csv(f'{data_path}/{filename}.out', f'csv/{filename}.csv')
    for filename in mem_data:
        out_to_csv(f'{data_path}/{filename}.out', f'csv/{filename}.csv')

    for spl in spls:
        plot_spl(spl, labels, xoffset=xoffset, yscale=yscale, output_path=output_path)
        get_pairwise_graphs(spl, labels, xoffset=xoffset, yscale=yscale, output_path=output_path)
def handle(self, *args, **kwargs):
    ano = 2018
    logger.debug('Downloading TSE {ano}...'.format(ano=ano))
    base = 'http://agencia.tse.jus.br/estatistica/sead/odsele/consulta_cand/consulta_cand_{ano}.zip'  # noqa
    url = base.format(ano=ano)
    download = download_file(url)
    df = concat(download)
    logger.debug('total candidates: {shape}'.format(shape=df.shape))
    fetch_parallel(df)
    logger.debug(
        f'candidates: {Candidate.objects.count()} expected: {df.shape[0]}')
def get_train_and_validate_data_from_cache(train_file, validate_file,
                                           only_validate=False, is_concat=False):
    print 'get_data train_file:{}, validate_file:{}, only_validate: {}, concat: {}'.format(
        train_file, validate_file, only_validate, is_concat)
    dtypes = dict(
        label=np.float32,
    )
    embedings = list(range(32))
    for col in embedings:
        dtypes[col] = np.float32
    useless_cols = [
        'department_product_count', 'department_order_count',
        'department_order_dow_mean', 'department_days_since_prior_order_mean',
        'department_add_to_cart_order_mean', 'department_bought_times',
        'department_reorder_ratio', 'aisle_add_to_cart_order_mean',
        'aisle_product_count', 'aisle_days_since_prior_order_mean',
        'aisle_order_hour_of_day_mean', 'aisle_order_dow_mean',
        'aisle_bought_times', 'aisle_reorder_ratio',
    ]
    if not only_validate or is_concat:
        df_train = pd.read_csv(train_file, compact_ints=True, dtype=dtypes)
        y_train = df_train['label']
        df_train.drop(['label'] + useless_cols, axis=1, inplace=True)
    else:
        y_train = None
        df_train = None

    # df_validate, y_validate = None, None
    df_validate = pd.read_csv(validate_file, compact_ints=True, dtype=dtypes)
    y_validate = df_validate['label']
    df_validate.drop([UID, 'label'] + useless_cols, axis=1, inplace=True)

    if is_concat:
        df_train = utils.concat([df_train, df_validate])
        df_validate = None
        y_train = y_train.append(y_validate, ignore_index=True)
        y_validate = None
    gc.collect()
    return df_train, y_train, df_validate, y_validate
def mean_field_hs(Vs, K):
    """
    P_j(x_j) = 1/Z_0 * exp(-beta*h_j(x_j)), where
        h_j(x_j) = \sum_{<j,jp>} \sum_{x_jp} V(x_j, x_jp) * P_jp(x_jp)

    We assume a Potts model of m variables x0...xj...xm-1 where each variable
    can take on K states 0...i...K-1.  Mean field functions h are represented
    as a matrix hss where each row gives the values hj(i).  [Note that i,j are
    reversed from the usual row-column convention.]  Input is a matrix Vs of
    pairwise contributions to the hamiltonian where Vs[j][jp] is a function
    V(xj, xjp).
    """
    M = len(Vs)
    jpairs = pairs(range(M))
    hs = [[1 for i in range(K)] for j in range(M)]

    def Pj(xj, j):
        # print xj, j
        return exp(-beta * hs[j][xj]) / sum(exp(-beta * hs[j][xjp]) for xjp in range(K))

    old_hs = matcopy(hs)
    while True:
        for j in range(M):
            for i in range(K):
                hs[j][i] = (sum(sum(Vs[j][jp](i, ip) * Pj(ip, jp) for ip in range(K))
                                for jp in range(j + 1, M)) +
                            sum(sum(Vs[jp][j](ip, i) * Pj(ip, jp) for ip in range(K))
                                for jp in range(0, j)))  # was range(0, j - 1), which skipped jp = j - 1
        print l2(concat(hs), concat(old_hs))
        if old_hs == hs:
            break
        else:
            old_hs = matcopy(hs)
    print hs
    return hs
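# Usage sketch for mean_field_hs (hypothetical; assumes a module-level beta and
# the helpers pairs/matcopy/l2 are defined). For a two-variable, two-state Potts
# chain with a single ferromagnetic coupling V(x, y) = -1 if x == y else 0, only
# the upper-triangular entry Vs[0][1] is ever consulted:
#
#   Vs = [[None, lambda x, y: -1.0 * (x == y)],
#         [None, None]]
#   hs = mean_field_hs(Vs, K=2)  # iterates the self-consistency update to a fixed point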
def fetch_row(self, table_name: str, row_number: int, column: int):
    if not self.has_table(table_name):
        return
    result = self._connection.execute(concat("select * from ", table_name))
    rows = result.fetchall()
    if row_number == -1 or row_number >= len(rows):  # was `is -1`; `is` on ints is unreliable
        row_number = self.row_count(table_name) - 2
    else:
        row_number -= 1
    if column >= self.column_count(table_name):
        raise IndexError("Column index %d out of range %d" % (column, len(result.keys())))
    if column == -1:  # was `is -1`
        return rows[row_number][:]
    return rows[row_number][column]
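# Usage sketch for fetch_row (hypothetical table; assumes a 'users' table exists
# on this connection). row_number is 1-based; -1 (or any out-of-range value)
# falls back to row index row_count - 2, and column == -1 returns the whole row:
#
#   db.fetch_row('users', 1, -1)   # first row, all columns
#   db.fetch_row('users', 2, 0)    # second row, first column
#   db.fetch_row('users', -1, 0)   # fallback row, first column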
def simulate(self, batch, turn=3):
    # for batch in data.train_iter:
    input_message = batch.hist1
    history1 = input_message
    reward = 0
    for t in range(turn):
        agent = self.agent[(t % 2)]
        logits_matrix, decoder_out = agent.generate(
            input_message)  # decoder_out is a [data, length] pair
        reward += 1  # FIXME
        history2 = decoder_out
        input_message = concat(history1, history2)
        history1 = history2
    return reward
def play(self, max_episodes, max_episode_len):
    total_r = 0
    for i in range(max_episodes):
        # reset if terminated
        s = self.env.reset()
        g = self.env.sample_goal()
        self.env.render_goal()
        time.sleep(2)
        self.env.close_goal()

        # concat
        s_concat = concat(s, g)

        # write to monitor
        print('episode ' + str(i) + ' reward ' + str(total_r))
        total_r = 0

        for j in range(max_episode_len):
            # predict action
            a = self.actor.predict_target([s_concat])[0]

            # take action
            s_next, r, d, _ = self.env.step(a)
            self.env.render()
            s_next_concat = concat(s_next, g)
            total_r += r
            s_concat = s_next_concat
            if d:
                break
    self.env.close()
def baum_welch(obs, L):
    """Given a sequence and binding-site (bs) length L, approximate MLE
    parameters for emission probabilities and transition rate a01
    (background -> site).
    TODO: non-uniform background frequencies"""
    states = range(L + 1)
    a01 = random.random()
    start_p = make_start_p(a01)
    trans_p = make_trans_p(a01)
    emit_p = [simplex_sample(4) for state in states]
    hidden_states = [random.choice(states) for ob in obs]
    iterations = 0
    while True:
        # compute hidden states, given probs
        prob, hidden_states_new = viterbi(obs, states, start_p, trans_p, emit_p)
        # compute probs, given hidden states: first compute a01
        a01_new = estimate_a01(hidden_states_new)
        start_p_new = make_start_p(a01_new)
        trans_p_new = make_trans_p(a01_new)
        emit_p_new = estimate_emit_p(obs, hidden_states_new, states)
        if (start_p_new == start_p and trans_p_new == trans_p and
                emit_p_new == emit_p and hidden_states_new == hidden_states):
            break
        else:
            print iterations, a01, l2(start_p, start_p_new),
            print l2(concat(trans_p), concat(trans_p_new)),
            print l2(hidden_states, hidden_states_new)
            a01 = a01_new
            start_p = start_p_new
            trans_p = trans_p_new
            emit_p = emit_p_new
            hidden_states = hidden_states_new
            iterations += 1
    return start_p, trans_p, emit_p, hidden_states
def get_results(iy_goals_1, iy_goals_2, ms_goals_1, ms_goals_2, h1=0, h2=0):
    res = {}
    res['mac'] = (ms_goals_1 > ms_goals_2, ms_goals_1 == ms_goals_2, ms_goals_1 < ms_goals_2)
    res['ilk'] = (iy_goals_1 > iy_goals_2, iy_goals_1 == iy_goals_2, iy_goals_1 < iy_goals_2)
    h_goals_1 = ms_goals_1 + h1
    h_goals_2 = ms_goals_2 + h2
    res['han'] = (h_goals_1 > h_goals_2, h_goals_1 == h_goals_2, h_goals_1 < h_goals_2)
    res['kar'] = (ms_goals_1 > 0 and ms_goals_2 > 0, ms_goals_1 == 0 or ms_goals_2 == 0)
    res['cif'] = (ms_goals_1 >= ms_goals_2, ms_goals_1 != ms_goals_2, ms_goals_1 <= ms_goals_2)
    res['iy'] = (iy_goals_1 + iy_goals_2 > 1.5, iy_goals_1 + iy_goals_2 < 1.5)
    total = ms_goals_1 + ms_goals_2
    res['au1'] = (total > 1.5, total < 1.5)
    res['au2'] = (total > 2.5, total < 2.5)
    res['au3'] = (total > 3.5, total < 3.5)
    res['top'] = (total < 2, 2 <= total < 4, 4 <= total < 7, total >= 7)
    iy_diff = iy_goals_1 - iy_goals_2
    ms_diff = ms_goals_1 - ms_goals_2
    res['IYMS11'] = iy_diff > 0 and ms_diff > 0
    res['IYMS10'] = iy_diff > 0 and ms_diff == 0
    res['IYMS12'] = iy_diff > 0 and ms_diff < 0
    res['IYMS01'] = iy_diff == 0 and ms_diff > 0
    res['IYMS00'] = iy_diff == 0 and ms_diff == 0
    res['IYMS02'] = iy_diff == 0 and ms_diff < 0
    res['IYMS21'] = iy_diff < 0 and ms_diff > 0
    res['IYMS20'] = iy_diff < 0 and ms_diff == 0
    res['IYMS22'] = iy_diff < 0 and ms_diff < 0
    for i in range(6):
        for j in range(6):
            # bucket 5 means "more than 5 goals"; composing the two conditions
            # (instead of overwriting, as the original did) also handles SK55
            cond_1 = ms_goals_1 > 5 if i == 5 else ms_goals_1 == i
            cond_2 = ms_goals_2 > 5 if j == 5 else ms_goals_2 == j
            res['SK%d%d' % (i, j)] = cond_1 and cond_2
    res = concat([res[bet] for bet in BET_ORDER] +
                 [[res[bet] for bet in IYMS_ORDER + SK_ORDER]])
    return res
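# Usage sketch for get_results (hypothetical score; assumes BET_ORDER, IYMS_ORDER
# and SK_ORDER are the module-level bet-key lists and concat flattens one level).
# For a match that stood 1-0 at half time and ended 2-1:
#
#   flags = get_results(1, 0, 2, 1)
#   # flags is a flat list of booleans, one per bet outcome, in BET_ORDER
#   # followed by the IYMS_* and SK_* markets; e.g. 'mac' contributes
#   # (True, False, False) here since the home side won.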
def arca_motif_comparison():
    arca_reads = get_arca_reads()
    true_rdm = density_from_reads(arca_reads, G)
    pssm = make_pssm(Escherichia_coli.ArcA)
    plt.plot(true_rdm[0])
    plt.plot(true_rdm[1])
    fwd_scores, rev_scores = score_genome_np(pssm, genome)
    scores = np.log(np.exp(fwd_scores) + np.exp(rev_scores))
    sites = concat([(site, wc(site)) for site in Escherichia_coli.ArcA])
    site_locations = [m.start(0) for site in sites
                      for m in re.finditer(site, genome)]
    site_locations_np = np.zeros(G)
    for site_loc in site_locations:
        site_locations_np[site_loc] = 1
    plt.plot(site_locations_np)
    plt.plot(scores)
def verify_checksum():
    computed = options.action
    for arg in args:
        computed = utils.concat(computed, arg)
    computed = hashlib.md5(computed).hexdigest()
    if options.checksum == computed:
        if options.debug > 1:
            print("Valid helper action checksum. Received: " + options.checksum +
                  " Computed: " + computed)
        return True
    elif options.checksum == "SKIP":
        return True
    else:
        sys.stderr.write("Invalid action checksum! " +
                         "Received: " + str(options.checksum) + " - " +
                         "Expected: " + computed + "\n")
        return False
def make_ringer(code):
    """minimize eps(site) - mu + (sigma^2)/2"""
    def aa_mu(aa):
        return mean([code[aa, b1, b2] for b1, b2 in nuc_pairs])

    def aa_sigma(aa):
        return sqrt(variance([code[aa, b1, b2] for b1, b2 in nuc_pairs]))

    (aa, b1, b2), min_score = min(code.items(),
                                  key=lambda ((aa, b1, b2), score):
                                      score - aa_mu(aa) + (aa_sigma(aa)**2) / 2.0)
    bd = [aa] * (L - 1)
    sites = ["".join(concat([(b1, b2) for j in range(L / 2)]))
             for i in range(n)]
    return bd, sites
def make_gle_evo_sim_spoofs(bio_motifs, trials_per_motif=3):
    start_time = time.time()
    spoofs = []
    failures = 0
    for it, motif in enumerate(tqdm(bio_motifs, desc='bio_motifs')):
        bio_ic = motif_ic(motif)
        these_spoofs = [spoof_motif_gle(motif, num_motifs=10, Ne_tol=10**-2)
                        for i in range(trials_per_motif)]
        spoofs.append(these_spoofs)
        spoof_ics = map(motif_ic, concat(these_spoofs))
        lb, ub = mean_ci(spoof_ics)
        out_of_bounds = (not (lb <= bio_ic <= ub))
        failures += out_of_bounds
        fail_perc = failures / float(it + 1)
        print it, "bio_ic:", bio_ic, "spoof_ci: (%s,%s)" % (lb, ub), \
            "*" * out_of_bounds, "failures:", "%1.2f" % fail_perc
    stop_time = time.time()
    print "total time:", stop_time - start_time
    return spoofs
def __init__(self, md, weekid):
    # md is match_data
    self.weekID = weekid
    self.matchID = int(md[10])
    self.detailID = int(md[0])
    self.datetime = datetime.strptime(md[7] + " " + md[6], '%d.%m.%Y %H:%M')
    self.league = md[26]
    self.team_1 = md[1]
    self.team_2 = md[3]
    self.mbs = parse_int(md[13])
    self.iy_goals_1 = parse_int(md[11])
    self.iy_goals_2 = parse_int(md[12])
    if self.iy_goals_1 is None or self.iy_goals_2 is None:
        self.iy_goals_1 = None
        self.iy_goals_2 = None
        self.ms_goals_1 = None
        self.ms_goals_2 = None
    else:
        self.ms_goals_1 = parse_int(md[8])
        self.ms_goals_2 = parse_int(md[9])
    self.was_played = self.ms_goals_1 is not None
    self.h1 = 0 if md[14] == '' else int(md[14])
    self.h2 = 0 if md[15] == '' else int(md[15])
    self.ratios = []
    res = {}
    res['mac'] = [parse_float(x) for x in md[16:19]]
    res['ilk'] = [parse_float(x) for x in md[33:36]]
    res['han'] = [parse_float(x) for x in md[36:39]]
    res['kar'] = [parse_float(x) for x in md[39:41]]
    res['cif'] = [parse_float(x) for x in md[19:22]]
    res['iy'] = [parse_float(x) for x in md[42:44]]
    res['au1'] = [parse_float(x) for x in md[44:46]]
    res['au2'] = [parse_float(x) for x in md[22:24]]
    res['au3'] = [parse_float(x) for x in md[46:48]]
    res['top'] = [parse_float(x) for x in md[29:33]]
    if self.was_played:
        self.results = get_results(self.iy_goals_1, self.iy_goals_2,
                                   self.ms_goals_1, self.ms_goals_2,
                                   self.h1, self.h2)
    self.ratios = concat([res[bet] for bet in BET_ORDER])
    if self.league != 'DUEL':
        self.fetch_details()
def sanity_check():
    G = 10000
    config = [G/2]
    mfl = 250
    lamb = 1.0/mfl
    num_frags = 10000
    frags = concat([chip(G, config, mfl) for i in xrange(num_frags)])
    min_seq_length = 75
    sequenced_frags = filter(lambda (start, stop): stop - start > min_seq_length, frags)
    fd_frags, bk_frags = separate(lambda x: random.random() < 0.5, sequenced_frags)
    fd_reads = [('+', start, start + min_seq_length) for (start, stop) in fd_frags]
    bk_reads = [('-', stop - min_seq_length, stop) for (start, stop) in bk_frags]
    reads = fd_reads + bk_reads
    inferred_frags = exp_reconstruction(reads, lamb, G)
    plot_reads(reads, G=G)
    plt.plot(frag_density(frags, G=G), label="all frags")
    plt.plot(frag_density(sequenced_frags, G=G), label="seq frags")
    plt.plot(inferred_frags, label="inferred frags")
    plt.legend()
def recovery():
    G = 10000
    config = [G/2]
    mfl = 250
    lamb = 1/float(mfl)
    num_frags = 1000
    frags = concat([chip(G, config, mfl) for i in xrange(num_frags)])
    min_seq_length = 75
    sequenced_frags = filter(lambda (start, stop): stop - start > min_seq_length, frags)
    fd_frags, bk_frags = separate(lambda x: random.random() < 0.5, sequenced_frags)
    fd_reads = [('+', start, start + 75) for (start, stop) in fd_frags]
    bk_reads = [('-', stop - 75, stop) for (start, stop) in bk_frags]
    reads = fd_reads + bk_reads
    hyp0 = [int(random.random() < 0.5) for i in range(G)]

    def f(hyp):
        return log_likelihood(reads, hyp, lamb, G)

    def prop(hyp):
        i = random.randrange(G)
        hyp_copy = hyp[:]
        hyp_copy[i] = 1 - hyp_copy[i]
        return hyp_copy

    chain = mh(f, prop, hyp0, use_log=True, verbose=True)
def prokaryotic_gini_comparison(filename=None):
    """spoof prokaryotic motifs using maxent, uniform and GLE evosims, showing
    gini is higher in GLE than in maxent, uniform"""
    maxent_spoofs = [spoof_motifs_maxent(motif, 10, verbose=True)
                     for motif in tqdm(bio_motifs, desc='bio_motifs')]
    uniform_spoofs = [spoof_motifs_uniform(motif, 10, verbose=True)
                      for motif in tqdm(bio_motifs, desc='bio_motifs')]
    oo_spoofs = [spoof_motifs_oo(motif, 10)
                 for motif in tqdm(bio_motifs, desc='bio_motifs')]  # computed but not plotted below
    gle_spoofs = [concat([spoof_motif_gle(motif, 10, verbose=True) for i in range(1)])
                  for motif in tqdm(bio_motifs, desc='bio_motifs')]
    maxent_ginis = [mean(map(motif_gini, spoofs)) for spoofs in maxent_spoofs]
    uniform_ginis = [mean(map(motif_gini, spoofs)) for spoofs in uniform_spoofs]
    gle_ginis = [mean(map(motif_gini, spoofs)) for spoofs in gle_spoofs]
    plt.subplot(1, 2, 1)
    scatter(maxent_ginis, gle_ginis)
    plt.xlabel("MaxEnt")
    plt.ylabel("GLE")
    plt.subplot(1, 2, 2)
    plt.xlabel("TU")
    scatter(uniform_ginis, gle_ginis)
    plt.suptitle("Gini Coefficients for GLE Simulations vs. MaxEnt, TU Distributions")
    maybesave(filename)
def make_chip_dataset(num_cells):
    return concat([chip(genome, rfd_xs(ps), MEAN_FRAGMENT_LENGTH)
                   for i in verbose_gen(xrange(num_cells))])
def all_spoof_stats(fname, order_by_stat="motif_ic"):
    ordered_tfs = order_tfs_by(order_by_stat)
    return concat([[results_dict[tf][spoof_name][fname] for spoof_name in spoof_names]
                   for tf in ordered_tfs])
# Only decks with these tags in them are processed
"modelTags": "Mandarin",

# Field names are listed in descending order of priority
"candidateFieldNamesByKey": utils.let(
    ["MW", "Measure Word", "Classifier", "Classifiers", u"量词"],
    ["Audio", "Sound", "Spoken", u"声音"],
    lambda mwfields, audiofields: {
        'expression': ["Expression", "Hanzi", "Chinese", "Character", "Characters", u"汉字", u"中文"],
        'reading':    ["Reading", "Pinyin", "PY", u"拼音"],
        'meaning':    ["Meaning", "Definition", "English", "German", "French", u"意思", u"翻译", u"英语", u"法语", u"德语"],
        'audio':      audiofields,
        'color':      ["Color", "Colour", "Colored Hanzi", u"彩色"],
        'mw':         mwfields,
        'mwaudio':    utils.concat(utils.concat([[[x + " " + y, x + y] for x in mwfields] for y in audiofields])),
        #'weblinks':  ["Links", "Link", "LinksBar", "Links Bar", "Link Bar", "LinkBar", "Web", "Dictionary", "URL", "URLs"],
        #'pos':       ["POS", "Part", "Type", "Cat", "Class", "Kind", "Grammar"],
        'trad':       ["Traditional", "Trad", "Traditional Chinese", "HK", u'繁体字', u'繁体', u"繁體字", u"繁體"],
        'simp':       ["Simplified", "Simp", "Simplified Chinese", u"简体字", u"简体"]
    })
}

updatecontrolflags = {
    'expression': None,
    'reading':    "readinggeneration",
    'meaning':    "meaninggeneration",
    'mw':         "detectmeasurewords",
    'audio':      "audiogeneration",
    'mwaudio':    "mwaudiogeneration",
    'color':      "colorizedcharactergeneration",
def interpret_gle_evo_sim_spoofs(bio_motifs_, spoofs_, filename=None):
    # assume that structure of spoofs is such that all spoofs for bio_motifs[0]
    # are contained in spoofs[0]
    trials_per_motif = len(spoofs_[0])
    bio_motifs = [bio_motif for bio_motif in bio_motifs_
                  for i in range(trials_per_motif)]
    sim_motifs = concat(spoofs_)
    print len(bio_motifs), len(sim_motifs)
    assert len(bio_motifs) == len(sim_motifs)
    # bio_ics = [motif_ic(motif) for motif in bio_motifs
    #            for _ in range(trials_per_motif)]
    bio_ics = map(motif_ic, bio_motifs)
    sim_ics = map(motif_ic, sim_motifs)
    # sim_ics = [mean(map(motif_ic, motifs))
    #            for spoof in spoofs for motifs in spoof]
    # bio_ginis = [motif_gini(motif) for motif in bio_motifs
    #              for _ in range(trials_per_motif)]
    # sim_ginis = [mean(map(motif_gini, motifs))
    #              for spoof in spoofs for motifs in spoof]
    bio_ginis = map(motif_gini, bio_motifs)
    sim_ginis = map(motif_gini, sim_motifs)
    # bio_log_mis = [log(total_motif_mi(motif)) for motif in bio_motifs
    #                for _ in range(trials_per_motif)]
    # sim_log_mis = map(log, [mean(map(total_motif_mi, motifs))
    #                         for spoof in tqdm(spoofs) for motifs in spoof])
    lens = [len(motif[0]) for motif in bio_motifs]
    print "finding mutual information"
    bio_mis = [total_motif_mi(motif) / choose(l, 2)
               for (l, motif) in tqdm(zip(lens, bio_motifs))]
    sim_mis = [total_motif_mi(motif) / choose(l, 2)
               for (l, motif) in tqdm(zip(lens, sim_motifs))]
    print "finding motif structures"
    bio_patterns_ = [find_pattern(motif)[0] for motif in tqdm(bio_motifs_)]
    bio_patterns = [pattern for pattern in bio_patterns_
                    for _ in xrange(trials_per_motif)]
    pattern_colors = {'direct-repeat': 'g', 'inverted-repeat': 'b', 'single-box': 'r'}
    colors = [pattern_colors[p] for p in bio_patterns]

    plt.subplot(1, 3, 1)
    plt.title("Motif IC (bits)")
    scatter(bio_ics, sim_ics, color=colors, line_color='black')
    ic_f = poly1d(polyfit(bio_ics, sim_ics, 1))
    # plt.plot(*pl(ic_f, [min(bio_ics), max(bio_ics)]), linestyle='--', color='b')
    plt.xlim(*find_limits(bio_ics, sim_ics))
    plt.ylim(*find_limits(bio_ics, sim_ics))
    plt.ylabel("Simulated")

    plt.subplot(1, 3, 2)
    plt.xlabel("Observed")
    plt.title("Gini Coefficient")
    scatter(bio_ginis, sim_ginis, color=colors, line_color='black')
    gini_f = poly1d(polyfit(bio_ginis, sim_ginis, 1))
    # plt.plot(*pl(gini_f, [min(bio_ginis), max(bio_ginis)]),
    #          linestyle='--', color='b')
    plt.xlim(*find_limits(bio_ginis, sim_ginis))
    plt.ylim(*find_limits(bio_ginis, sim_ginis))

    plt.subplot(1, 3, 3)
    plt.title("Pairwise MI per pair (bits)")
    draft = False
    end = 10 if draft else 108
    scatter(bio_mis, sim_mis, color=colors, line_color='black')
    mi_f = poly1d(polyfit(bio_mis, sim_mis, 1))
    # plt.plot(*pl(mi_f, [min(bio_mis), max(bio_mis)]),
    #          linestyle='--', color='b')
    plt.xlim(*find_limits(bio_mis, sim_mis))
    plt.ylim(*find_limits(bio_mis, sim_mis))
    plt.legend()
    # ax.set_xlabel("Biological")
    # ax.set_ylabel("Simulated")
    plt.tight_layout()
    maybesave(filename)
def chip_ps_ref(ps, mean_frag_length, cells=10000):
    """Do a chip seq experiment given the distribution ps"""
    G = len(ps)
    return concat(chip_ps(rfd_xs(ps), mean_frag_length)
                  for cell in verbose_gen(xrange(cells)))
def chip_ps_spec(ps, mean_frag_length, cells=10000):
    return concat(chip_ps_spec_single_cell(ps, mean_frag_length)
                  for i in verbose_gen(xrange(cells)))
def show_chip_shadow(G, endpoints, mean_frag_length, cells=10000, trials=10):
    lamb = 1.0 / mean_frag_length
    [plt.plot(map_reads(concat([chip(G, endpoints, mean_frag_length)
                                for i in range(cells)]), G), color='b')
     for i in verbose_gen(range(trials))]
def chip_ps_np(ps, mean_frag_length, cells=10000, verbose=False):
    """Do a chip seq experiment given the distribution ps"""
    w = 10
    G = len(ps)  # + w - 1  # XXX HACK
    cell_iterator = verbose_gen(xrange(cells), modulus=1000) if verbose else xrange(cells)
    return concat(chip(G, rfd_xs_np(ps), mean_frag_length)
                  for cell in cell_iterator)
print "%i %i %i\t %.2f %.2f %.4f %.4f %.2f - %.2fm" % (
    n, m, l,
    np.mean(scores[(m, l)]), np.std(scores[(m, l)]),  # for best params & variance
    np.mean(col_trscores), np.mean(col_cvscores),     # use x diagnostic training set overfit
    tn.dot(np.mean(ptscores[(m, l)], axis=0)[1:]),    # score
    (time() - t) / 60),                               # k-fold time
print " ".join(["%.5f" % pts for pts in np.mean(ptscores[(m, l)], axis=0)]),
print " ".join(["%.5f" % pts for pts in np.std(ptscores[(m, l)], axis=0)])

if submit:
    # MAKE SUBMISSION
    # keep only the latest shopping_pt for each customer, so everything is in one row
    test = test[test.shopping_pt ==
                test.reset_index().customer_ID.map(
                    test.reset_index().groupby('customer_ID').shopping_pt.max())]
    Xt = test[con + cat + conf + extra]

    # TEST SET PREDICTION
    print "now predicting on test set...",
    allpreds = rfs.predict(Xt)
    test['pG'] = majority_vote(test.G, allpreds[:, selected]); print "done"

    # Fix state law products, then concatenate to string
    stateFix(encoders, test, ['C', 'D', 'pG'], 1)
    test['plan'] = concat(test, ['A', 'B', 'C', 'D', 'E', 'F', 'pG'])
    test['plan'].to_csv('submission\\majority_rfs%i_%i.%i_shuffle_GAfix_%iof%iof%i.csv' %
                        (n, m, l, NS/2 + 1, NS, N), header=1)

    # feature importances
    impf = rfs.impf; impf.sort()
def occs_from_direct_sampling(samples, ks):
    """Given a _list_ of samples and ks, compute occupancies"""
    num_samples = float(len(samples))
    counts = Counter(concat(samples))
    G = len(ks)
    return [counts[i] / num_samples for i in range(G)]
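# Usage sketch for occs_from_direct_sampling (hypothetical numbers; assumes each
# sample is a list of occupied positions and concat flattens one level). With ks
# of length 3 and two samples, position 1 is occupied in both:
#
#   occs_from_direct_sampling([[0, 1], [1, 2]], [1.0, 1.0, 1.0])  # -> [0.5, 1.0, 0.5]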
def test_baum_welch():
    site = [0, 1, 2, 3, 0, 1, 2, 3]
    L = 8
    background = lambda n: [random.choice(range(4)) for i in range(n)]
    obs = concat([site + background(10) for i in range(100)])
    return baum_welch(obs, L)
def make_frag_lengths(lamb, trials):
    return frag_lengths(concat([make_frags(lamb) for trial in range(trials)]))