import collections

import matplotlib.pyplot as plt

import tokenizer


def make_counters_for_category(category, search_set, curs):
    bing_counter = collections.Counter()
    pop_counter = collections.Counter()
    for search in search_set:
        bing_results = get_bing_results(curs, search)
        pop_results = get_pop_rank_results(curs, search)
        # Count the domain (second token of the result URL) for each engine.
        for b_res, pop_res in zip(bing_results, pop_results):
            b_dom = tokenizer.get_tokens(b_res)[1]
            pop_dom = tokenizer.get_tokens(pop_res)[1]
            bing_counter[b_dom] += 1
            pop_counter[pop_dom] += 1

    # Plot the ten most common PopRank domains for this category.
    pop_hist_data = []
    pop_hist_labels = []
    for domain, count in pop_counter.most_common(10):
        pop_hist_data.append(count)
        pop_hist_labels.append(domain)
    plt.bar(range(len(pop_hist_data)), pop_hist_data, align='center',
            color='red', label='PopRank')
    plt.xticks(range(len(pop_hist_data)), pop_hist_labels, size='small', rotation=90)
    plt.legend(loc=3)
    plt.suptitle('Most Popular Domains for PopRank in %s' % category)
    plt.show()
    return bing_counter, pop_counter
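# Hedged usage sketch, not part of the project: the database path, category,
# and search set below are illustrative assumptions; get_bing_results and
# get_pop_rank_results are assumed to live in this module, as used above.
import sqlite3

def _demo_counters():
    conn = sqlite3.connect("results.db")  # hypothetical results database
    bing_counts, pop_counts = make_counters_for_category(
        "news", {"election results", "local weather"}, conn.cursor())
    print(pop_counts.most_common(5))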
def get_html_elements(self):
    # Map every term on the page to an (initially empty) set of HTML contexts.
    html_dict = {term: set() for term in self.tokens}
    # Use Beautiful Soup to collect the tags we care about, paired with the
    # label recorded for words found inside each.
    tagged = [
        (self.soup.find_all("title"), "t"),
        (self.soup.find_all("h1"), "h1"),
        (self.soup.find_all("h2"), "h2"),
        (self.soup.find_all("h3"), "h3"),
        (self.soup.find_all(["bold", "strong"]), "b"),
    ]
    try:
        for tags, label in tagged:
            for tag in tags:
                text = tag.findAll(text=True)
                if not text:
                    continue
                for word in tokenizer.get_tokens(text[0]):
                    # Only record words that survived tokenization of the page
                    # body; anything else would raise a KeyError here.
                    if word in html_dict:
                        html_dict[word].add(label)
    except Exception:
        print("Something went wrong, but we will continue indexing")
    return html_dict
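# Hedged sketch of how the tag sets above are commonly consumed: boosting a
# term's score by the HTML contexts it appears in. The weight values and the
# importance() helper are illustrative assumptions, not this project's code.
TAG_WEIGHTS = {"t": 3.0, "h1": 2.0, "h2": 1.5, "h3": 1.2, "b": 1.1}

def importance(term, html_dict):
    # Base weight 1.0, plus a bonus for every tagged context the term appears in.
    return 1.0 + sum(TAG_WEIGHTS.get(tag, 0.0) for tag in html_dict.get(term, set()))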
from keras.preprocessing.sequence import pad_sequences

from tokenizer import get_tokens


def run_single_sentence(sentence, model, tknzr, maxlen):
    # Encode the sentence with the tokenizer fitted at training time so the
    # word indices match what the model saw, then pad to the training length.
    token_list = [get_tokens(sentence)]
    encoded_text = tknzr.texts_to_sequences(token_list)
    X = pad_sequences(encoded_text, maxlen=maxlen, padding='post')
    Y = model.predict(X)
    # The model outputs a single probability; detect_thresh is a module-level
    # constant that turns it into a boolean intent decision.
    return Y[0][0] >= detect_thresh
def get_hdl(self):
    env = Environment(loader=PackageLoader('src', 'templates'))
    circuit_hdl_template = env.get_template('circuit_hdl_template')
    enable_template = env.get_template('enable_template')
    update_variable_template = env.get_template('update_variable_template')
    check_template = env.get_template('check_template')

    circuit_logic = ''
    # Enable the circuit at every program counter in self.enables.
    if len(self.enables) > 0:
        condition = ' || '.join([
            'instr_data = x"%s"' % program_counter
            for program_counter in self.enables
        ])
        circuit_logic += enable_template.render(condition=condition, value='1')
    # Disable it at every program counter in self.disables.
    if len(self.disables) > 0:
        condition = ' || '.join([
            'instr_data = x"%s"' % program_counter
            for program_counter in self.disables
        ])
        circuit_logic += enable_template.render(condition=condition, value='0')
    # Emit an update block for every tracked variable.
    for key in self.updates:
        circuit_logic += update_variable_template.render(
            pc=key, var_id=self.var_index[self.updates[key]])
    # Translate the constraint expression token by token into HDL syntax.
    condition = ' '.join(map(self.symbol_to_hdl, tokenizer.get_tokens(self.constraint)))
    circuit_logic += check_template.render(condition=condition)
    return circuit_hdl_template.render(name=self.name, circuit_logic=circuit_logic)
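# Minimal sketch of the rendering pattern used above, with an inline template
# instead of the real files under src/templates (whose contents are not shown
# here); the template text is a hypothetical VHDL-flavoured illustration only.
from jinja2 import Template

_demo_enable = Template("if ({{ condition }}) then\n  en <= '{{ value }}';\nend if;\n")
print(_demo_enable.render(condition='instr_data = x"0040"', value='1'))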
def get_tokenized_data(self, max_sentence_len):
    sents, is_intent = self.get_data()
    token_list = [get_tokens(sent) for sent in sents]
    # Fit a Keras tokenizer on the corpus so every word gets an integer index.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(token_list)
    X, Y = self.get_netio(is_intent, token_list, max_sentence_len, tokenizer)
    return X, Y, tokenizer
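# Hedged end-to-end sketch tying get_tokenized_data to run_single_sentence
# above. The loader instance, maxlen, and the tiny model architecture are
# illustrative assumptions; only the call pattern mirrors this project's code.
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D

def _demo_train_and_detect(loader, maxlen=50):
    X, Y, tknzr = loader.get_tokenized_data(maxlen)
    model = Sequential([
        Embedding(input_dim=len(tknzr.word_index) + 1, output_dim=32),
        GlobalAveragePooling1D(),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    model.fit(X, Y, epochs=5, verbose=0)
    return run_single_sentence("can you book me a flight", model, tknzr, maxlen)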
def query(self, q, file_names):
    results = []
    for token in tokenizer.get_tokens(q):
        if token in ('&', '|', '!'):
            # Boolean operators pass through to the infix evaluator unchanged.
            results.append(token)
        else:
            # Everything else is a term: look up its postings.
            results.append(self.query_token(token))
    return infixQueryEvaluator.evaluate_querylist(results, file_names)
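# Simplified stand-in for infixQueryEvaluator.evaluate_querylist, assuming
# each postings entry is a set of file names. It shows the intent of the
# operator/postings list built above (left-to-right evaluation, no operator
# precedence), not the project's actual evaluator.
def _demo_evaluate(items, file_names):
    all_files = set(file_names)
    result = None
    op = None
    negate = False
    for item in items:
        if item == '!':
            negate = True  # unary NOT applies to the next postings set
        elif item in ('&', '|'):
            op = item
        else:
            operand = all_files - item if negate else item
            negate = False
            if result is None:
                result = operand
            elif op == '&':
                result = result & operand
            else:
                result = result | operand
    return result if result is not None else set()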
def setup(self):
    self.soup = BeautifulSoup(self.markup, "lxml")
    # Drop script and style elements so only visible text is indexed.
    for script in self.soup(["script", "style"]):
        script.decompose()
    self.text = self.soup.get_text()
    # get rid of one letter tokens later??
    self.tokens = tokenizer.get_tokens(self.text)
    self.freq = tokenizer.get_freq_dict(self.tokens)
    self.html_dict = self.get_html_elements()
    # Calculate the simhash fingerprint of the page text.
    self.simhash = Simhash(self.text).value
    # Remove duplicate tokens.
    self.tokens = set(self.tokens)
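# Hedged sketch: the stored simhash value is typically used for near-duplicate
# detection. Simhash.distance (Hamming distance) comes from the same simhash
# package used above; the threshold of 3 is a common choice, not necessarily
# this project's.
from simhash import Simhash

def _demo_near_duplicate(text_a, text_b, max_distance=3):
    return Simhash(text_a).distance(Simhash(text_b)) <= max_distance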
def transition_to_hdl(transition):
    # Nested helper: closes over self for symbol_to_hdl and over the module's
    # tokenizer; each transition is a (condition, source-state) pair.
    condition, source = transition
    condition = ' '.join(map(self.symbol_to_hdl, tokenizer.get_tokens(condition)))
    return '(rq(%d) AND (%s))' % (source, condition)
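# Hypothetical sketch of what symbol_to_hdl might do; the real mapping is
# defined elsewhere in this class. Shown only so transition_to_hdl reads in
# isolation: C-style operators in a condition become VHDL operators, and
# anything unrecognized passes through untouched.
def symbol_to_hdl(self, symbol):
    return {'&&': 'AND', '||': 'OR', '!': 'NOT', '==': '='}.get(symbol, symbol)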