Example No. 1
import collections

import matplotlib.pyplot as plt

# get_bing_results, get_pop_rank_results and tokenizer are assumed to come
# from the surrounding module.
def make_counters_for_category(category, search_set, curs):
    bing_counter = collections.Counter()
    pop_counter = collections.Counter()
    for search in search_set:
        bing_results = get_bing_results(curs, search)
        pop_results = get_pop_rank_results(curs, search)
        for b_res, pop_res in zip(bing_results, pop_results):
            # the token at index 1 of each result is treated as the domain
            b_dom = tokenizer.get_tokens(b_res)[1]
            pop_dom = tokenizer.get_tokens(pop_res)[1]
            bing_counter[b_dom] += 1
            pop_counter[pop_dom] += 1

    # collect the ten most common PopRank domains for the histogram
    pop_hist_data = []
    pop_hist_labels = []
    for domain, count in pop_counter.most_common(10):
        pop_hist_data.append(count)
        pop_hist_labels.append(domain)

    plt.bar(range(len(pop_hist_data)), pop_hist_data, align='center', color='red')
    plt.xticks(range(len(pop_hist_data)), pop_hist_labels, size='small', rotation=90)
    plt.suptitle('Most Popular Domains for PopRank in %s' % category)
    plt.show()
    return (bing_counter, pop_counter)
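
For reference, the Counter / most_common pattern the function is built on, in isolation (the domains are made up for illustration):

import collections

counter = collections.Counter()
for domain in ["example.com", "example.com", "other.org"]:
    counter[domain] += 1

print(counter.most_common(2))  # [('example.com', 2), ('other.org', 1)]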
Example No. 2
    def get_html_elements(self):
        # start every token off with an empty set of element labels
        html_dict = {term: set() for term in self.tokens}

        # map each tag of interest to the label recorded for its words;
        # "bold" is not an HTML tag, so <b> and <strong> are matched instead
        tag_labels = [("title", "t"), ("h1", "h1"), ("h2", "h2"),
                      ("h3", "h3"), (["b", "strong"], "b")]

        try:
            for tag, label in tag_labels:
                for element in self.soup.find_all(tag):
                    text = element.find_all(string=True)
                    if text:
                        words = tokenizer.get_tokens(text[0])
                        for word in words:
                            # skip words that are not in self.tokens
                            if word in html_dict:
                                html_dict[word].add(label)
        except Exception:
            print("Something went wrong, but we will continue indexing")

        return html_dict
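
A self-contained illustration of the Beautiful Soup find_all pattern the method relies on (string=True is the current spelling of the older text=True):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<h1>Hello <b>world</b></h1>", "lxml")
for h1 in soup.find_all("h1"):
    print(h1.find_all(string=True))  # ['Hello ', 'world']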
Example No. 3
def run_single_sentence(sentence, model, tknzr, maxlen):
    # tokenize and encode the sentence, pad it to the model's input length,
    # then threshold the score (detect_thresh is a module-level constant)
    token_list = [get_tokens(sentence)]
    encoded_text = tknzr.texts_to_sequences(token_list)
    X = pad_sequences(encoded_text, maxlen=maxlen, padding='post')
    Y = model.predict(X)
    return Y[0][0] >= detect_thresh
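
pad_sequences pads or truncates integer sequences to a fixed length. A minimal sketch, assuming the TensorFlow/Keras implementation (the snippet may import it from standalone Keras instead):

from tensorflow.keras.preprocessing.sequence import pad_sequences

print(pad_sequences([[1, 2], [3, 4, 5, 6]], maxlen=4, padding='post'))
# [[1 2 0 0]
#  [3 4 5 6]]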
Example No. 4
  def get_hdl(self):
    # Environment and PackageLoader come from the jinja2 package
    env = Environment(loader=PackageLoader('src', 'templates'))
    circuit_hdl_template = env.get_template('circuit_hdl_template')
    enable_template = env.get_template('enable_template')
    update_variable_template = env.get_template('update_variable_template')
    check_template = env.get_template('check_template')

    circuit_logic = ''

    # enable the circuit at any of the listed program counters
    if self.enables:
      condition = ' || '.join([
        'instr_data = x"%s"' % program_counter for program_counter in self.enables
      ])
      circuit_logic += enable_template.render(condition=condition, value='1')

    # disable it at any of the listed program counters
    if self.disables:
      condition = ' || '.join([
        'instr_data = x"%s"' % program_counter for program_counter in self.disables
      ])
      circuit_logic += enable_template.render(condition=condition, value='0')

    # emit one variable update per program counter
    for key in self.updates:
      circuit_logic += update_variable_template.render(
        pc=key, var_id=self.var_index[self.updates[key]]
      )

    # translate the constraint expression token by token into HDL
    condition = ' '.join(map(self.symbol_to_hdl, tokenizer.get_tokens(self.constraint)))
    circuit_logic += check_template.render(condition=condition)
    return circuit_hdl_template.render(name=self.name, circuit_logic=circuit_logic)
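
The templates themselves live in the src/templates package and are not shown; a minimal sketch of the Jinja2 render pattern with a made-up template body:

from jinja2 import DictLoader, Environment

env = Environment(loader=DictLoader({
    'enable_template': 'if {{ condition }} then flag <= {{ value }}; end if;'
}))
print(env.get_template('enable_template').render(
    condition='instr_data = x"1A"', value='1'
))
# if instr_data = x"1A" then flag <= 1; end if;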
Example No. 5
    def get_tokenized_data(self, max_sentence_len):
        sents, is_intent = self.get_data()
        token_list = [get_tokens(sent) for sent in sents]
        tokenizer = Tokenizer()  # Keras-style Tokenizer
        tokenizer.fit_on_texts(token_list)
        X, Y = self.get_netio(is_intent, token_list, max_sentence_len,
                              tokenizer)
        return X, Y, tokenizer
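
The Keras Tokenizer assigns word indices by descending frequency when fitted; a small sketch, assuming the tensorflow.keras import path:

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts([['hello', 'world'], ['hello', 'again']])
print(tokenizer.texts_to_sequences([['hello', 'again']]))  # [[1, 3]]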
Example No. 6
    def query(self, q, file_names):
        results = []
        tokens = tokenizer.get_tokens(q)
        for token in tokens:
            # keep boolean operators as-is, look everything else up
            if token in ('&', '|', '!'):
                results.append(token)
            else:
                results.append(self.query_token(token))

        return infixQueryEvaluator.evaluate_querylist(results, file_names)
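
infixQueryEvaluator itself is not shown; a minimal sketch of evaluating such a mixed list of operators and document-ID sets left to right ('!' handling, which needs the full file set, is omitted):

def evaluate_querylist(items):
    result = items[0]
    for op, operand in zip(items[1::2], items[2::2]):
        result = result & operand if op == '&' else result | operand
    return result

print(evaluate_querylist([{1, 2}, '&', {2, 3}]))  # {2}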
Example No. 7
    def setup(self):
        self.soup = BeautifulSoup(self.markup, "lxml")
        for script in self.soup(["script", "style"]):
            script.decompose()

        self.text = self.soup.get_text()

        # TODO: consider dropping one-letter tokens later
        self.tokens = tokenizer.get_tokens(self.text)

        self.freq = tokenizer.get_freq_dict(self.tokens)

        self.html_dict = self.get_html_elements()

        # calculate simhash
        self.simhash = Simhash(self.text).value

        # remove duplicate tokens
        self.tokens = set(self.tokens)
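
Simhash produces a 64-bit fingerprint in which near-duplicate texts differ in only a few bits. Assuming the `simhash` package used above:

from simhash import Simhash

a = Simhash('the quick brown fox')
b = Simhash('the quick brown foxes')
print(a.value)        # 64-bit integer fingerprint
print(a.distance(b))  # small Hamming distance for similar texts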
Example No. 8
    def transition_to_hdl(transition):
        # nested helper: `self` is captured from the enclosing method's scope
        condition, source = transition
        condition = ' '.join(map(self.symbol_to_hdl, tokenizer.get_tokens(condition)))
        return '(rq(%d) AND (%s))' % (source, condition)