def query(input):

    # Preprocess query
    input = textprocessing.preprocess_text(input, stopwords)
    input = [word for word in input if word in dictionary]
    query = Counter(input)

    # Compute weights for words in query
    for word, value in query.items():
        query[word] = inverted_index[word]['idf'] * (1 + math.log(value))

    helpers.normalize(query)

    scores = [[i, 0] for i in range(len(docs))]
    for word, value in query.items():
        for doc in inverted_index[word]['postings_list']:
            index, weight = doc
            scores[index][1] += value * weight

    scores.sort(key=lambda doc: doc[1], reverse=True)

    print('----- Results -----')
    for index, score in enumerate(scores):
        if score[1] == 0:
            break
        print('{}. {} - {}'.format(index + 1, docs[score[0]], score[1]))
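`helpers.normalize` is not defined in these snippets; since the scoring above reduces to a dot product of query and document weights, it is presumably an in-place L2 (cosine) normalization. A minimal sketch under that assumption:

import math

def normalize(bag_of_weights):
    # Assumed behavior: scale the dict/Counter values so the weight vector has
    # unit Euclidean length, turning the dot product into a cosine similarity.
    norm = math.sqrt(sum(w * w for w in bag_of_weights.values()))
    if norm > 0:
        for term in bag_of_weights:
            bag_of_weights[term] /= norm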
Example No. 2
def predict_for_file(input_file,
                     output_file,
                     model,
                     batch_size=32,
                     to_normalize=False):
    test_data = read_lines(input_file)
    predictions = []
    cnt_corrections = 0
    batch = []
    for sent in test_data:
        batch.append(sent.split())
        if len(batch) == batch_size:
            preds, cnt = model.handle_batch(batch)
            predictions.extend(preds)
            cnt_corrections += cnt
            batch = []
    if batch:
        preds, cnt = model.handle_batch(batch)
        predictions.extend(preds)
        cnt_corrections += cnt

    result_lines = [" ".join(x) for x in predictions]
    if to_normalize:
        result_lines = [normalize(line) for line in result_lines]

    with open(output_file, 'w') as f:
        f.write("\n".join(result_lines) + '\n')
    return cnt_corrections
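For context, a toy call site (not from the original source): `EchoModel` is a hypothetical stand-in that only illustrates the interface `predict_for_file` expects from `model`, namely a `handle_batch` method that returns token lists plus a correction count.

class EchoModel:
    # Hypothetical model: returns each batch unchanged with zero corrections.
    def handle_batch(self, batch):
        return batch, 0

cnt = predict_for_file('input.txt', 'output.txt', EchoModel(),
                       batch_size=2, to_normalize=False)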
Example No. 3
    def init(self):
        if self.url:
            self.request_url()

        self.content = normalize(self.content)
        self.prepare()
        self.distribute()
        self.modify_extract()
Example No. 4
def run_model_estimation(int_point,
                         y_elec,
                         bad_trials,
                         surprise_reg=None,
                         model_type="OLS"):
    """
    Inputs: int_point - sampling point in interstimulus interval
            y_elec - array with eeg recordings (num_trials x num_interstim_rec)
            surprise_reg - num_trials x 1 surprise from Bayesian learning model
            model_type - regression model
    Output: Time-series of log model evidence/Negative free energy
            from VI on Bayesian model
    """
    # Normalize the data and regressor to lie within [0, 1]
    y_std = normalize(y_elec[:, int_point])
    surprise_reg_std = normalize(surprise_reg)

    # Select specific model OLS/Hierarchical
    if model_type == "OLS":
        model = OLS_model(y_std, bad_trials, surprise_reg_std)
    elif model_type == "Hierarchical":
        model = Hierarchical_model(y_std, bad_trials, surprise_reg_std)
    elif model_type == "Bayesian-MLP":
        model = Bayesian_NN(y_std, bad_trials, surprise_reg_std)
    elif model_type == "Null":
        model = Null_model(y_std, bad_trials)
    else:
        raise "Provide a valid model type"

    # Run the Variational Inference scheme with ADVI
    # ADVI - Automatic Differentiation VI
    with model:
        inference = pm.ADVI()
        approx = pm.fit(
            method=inference,
            callbacks=[
                pm.callbacks.CheckParametersConvergence(diff='absolute')
            ],
            n=30000,
            progressbar=0)
    # return full optimization trace of free energy
    return -approx.hist
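# A hedged usage sketch (not part of the original code): the array shapes,
# the boolean bad_trials mask, and the loop over sampling points are
# assumptions based only on the docstring of run_model_estimation.
import numpy as np

num_trials, num_interstim_rec = 50, 100
y_elec_demo = np.random.randn(num_trials, num_interstim_rec)   # synthetic EEG
surprise_demo = np.random.rand(num_trials)                     # per-trial surprise
bad_trials_demo = np.zeros(num_trials, dtype=bool)             # assumed exclusion mask

# One negative-free-energy trace per sampling point in the interstimulus interval
traces = [run_model_estimation(t, y_elec_demo, bad_trials_demo,
                               surprise_reg=surprise_demo, model_type="OLS")
          for t in range(num_interstim_rec)]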
def preproc_surprisal(SP, AP, TP, norm):
    time = SP["time"]
    hidden = SP["hidden"]
    sequence = SP["sequence"]
    sequence[sequence == 2] = 0.5
    hidden[hidden == 2] = 0.5

    catch_id = np.argwhere(sequence == 0.5)

    if norm:
        PS = [
            normalize(SP["predictive_surprise"], catch_id),
            normalize(AP["predictive_surprise"], catch_id),
            normalize(TP["predictive_surprise"], catch_id)
        ]
        BS = [
            normalize(SP["bayesian_surprise"], catch_id),
            normalize(AP["bayesian_surprise"], catch_id),
            normalize(TP["bayesian_surprise"], catch_id)
        ]
        CS = [
            normalize(SP["confidence_corrected_surprise"], catch_id),
            normalize(AP["confidence_corrected_surprise"], catch_id),
            normalize(TP["confidence_corrected_surprise"], catch_id)
        ]
    else:
        PS = [
            SP["predictive_surprise"], AP["predictive_surprise"],
            TP["predictive_surprise"]
        ]
        BS = [
            SP["bayesian_surprise"], AP["bayesian_surprise"],
            TP["bayesian_surprise"]
        ]
        CS = [
            SP["confidence_corrected_surprise"],
            AP["confidence_corrected_surprise"],
            TP["confidence_corrected_surprise"]
        ]
    return time, hidden, sequence, PS, BS, CS
Example No. 6
    def process(cls, task):
        process_type = task['process_type']
        if process_type == ProcessType.XPATH:
            mapping = cls.process_xpath(task['content'], task['rules'])
        elif process_type == ProcessType.CSS_SELECT:
            mapping = cls.process_css_select(task['content'], task['rules'])
        elif process_type == ProcessType.JSON:
            mapping = cls.process_json(task['content'], task['rules'])
        elif process_type == ProcessType.AUTO_MATCH:
            mapping = cls.process_auto_match(task['content'], task['rules'])
            mapping['content'] = normalize(task['content'])
        else:
            raise ValueError('Unknown process_type: {}'.format(process_type))

        # Notify the scheduler to crawl the follow-up links
        valid_links = extract_valid_links(task['content'], task['valid_link_regex'], task['domain'])
        result = cls.prepare_result(task['proj_id'], task['url'], task['catalog'], task['domain'], task['task_id'],
                                    mapping,
                                    valid_links)
        return result
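For illustration, a plausible `task` payload for the CSS-selector branch; every value below, the rule shape, and the owning `Processor` class are made-up placeholders inferred only from the keys the method reads.

task = {
    'process_type': ProcessType.CSS_SELECT,
    'content': '<html><body><h1>Demo</h1></body></html>',
    'rules': {'title': 'h1'},                 # rule shape is an assumption
    'valid_link_regex': r'^https://example\.com/.+',
    'domain': 'example.com',
    'proj_id': 1,
    'url': 'https://example.com/',
    'catalog': 'demo',
    'task_id': 'task-0001',
}
result = Processor.process(task)              # hypothetical owner class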
Example No. 7
stopwords = helpers.get_stopwords(stopwords_file)

docs = helpers.get_docs(dataset_path)

corpus = []
for doc in docs:
    with open(doc, mode='r') as f:
        text = f.read()
        words = textprocessing.preprocess_text(text, stopwords)
        bag_of_words = Counter(words)
        corpus.append(bag_of_words)

idf = helpers.compute_idf(corpus)
for doc in corpus:
    helpers.compute_weights(idf, doc)
    helpers.normalize(doc)

inverted_index = helpers.build_inverted_index(idf, corpus)

docs_file = os.path.join(data_path, 'docs.pickle')
inverted_index_file = os.path.join(data_path, 'inverted_index.pickle')
dictionary_file = os.path.join(data_path, 'dictionary.txt')

# Serialize data
with open(docs_file, 'wb') as f:
    pickle.dump(docs, f)

with open(inverted_index_file, 'wb') as f:
    pickle.dump(inverted_index, f)

with open(dictionary_file, 'w') as f:
    # Persist the vocabulary, one term per line
    f.write('\n'.join(sorted(inverted_index.keys())))
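`helpers.compute_idf` and `helpers.compute_weights` are likewise not shown; a minimal sketch of what they plausibly compute, inferred from the query-side formula `idf * (1 + math.log(tf))` used in the other examples:

import math
from collections import Counter

def compute_idf(corpus):
    # Assumed: idf[term] = log(N / document frequency)
    n_docs = len(corpus)
    df = Counter()
    for bag_of_words in corpus:
        df.update(bag_of_words.keys())
    return {term: math.log(n_docs / freq) for term, freq in df.items()}

def compute_weights(idf, doc):
    # Assumed: replace each raw term count with idf * (1 + log tf), in place
    for term, tf in doc.items():
        doc[term] = idf[term] * (1 + math.log(tf))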
Example No. 8
stopwords = helpers.get_stopwords(stopwords_file)

dictionary = set(inverted_index.keys())

# Get query from command line
query = sys.argv[1]
# Preprocess query
query = textprocessing.preprocess_text(query, stopwords)
query = [word for word in query if word in dictionary]
query = Counter(query)

# Compute weights for words in query
for word, value in query.items():
    query[word] = inverted_index[word]['idf'] * (1 + math.log(value))

helpers.normalize(query)

scores = [[i, 0] for i in range(len(docs))]
for word, value in query.items():
    for doc in inverted_index[word]['postings_list']:
        index, weight = doc
        scores[index][1] += value * weight

scores.sort(key=lambda doc: doc[1], reverse=True)

print('----- Results -----')
for index, score in enumerate(scores):
    if score[1] == 0:
        break
    print('{}. {} - {}'.format(index + 1, docs[score[0]], score[1]))
Example No. 9
def index():
    if request.method == 'POST':
        file = request.files['query_data']
        filename = secure_filename(file.filename)
        if filename.lower().endswith('.jpg'):
            img = Image.open(file.stream)  # PIL image
            uploaded_img_path = "static/uploaded/" + datetime.now().isoformat() + "_" + file.filename
            img.save(uploaded_img_path)

            query = fe.extract(img)
            dists = np.linalg.norm(features - query, axis=1)  # Do search
            ids = np.argsort(dists)[:30]  # Top 30 results
            scores = [(dists[id], img_paths[id]) for id in ids]

            return render_template('index.html',
                                   query_path=uploaded_img_path,
                                   scores=scores)
        else:
            query = file.read().decode("utf-8")
            docs_file = os.path.join(os.getcwd(), 'data', 'docs.pickle')
            inverted_index_file = os.path.join(
                os.getcwd(), 'data', 'inverted_index.pickle')

            stopwords_file = os.path.join(os.getcwd(), 'resources', 'stopwords_en.txt')

            # Deserialize data
            with open(docs_file, 'rb') as f:
                docs = pickle.load(f)
            with open(inverted_index_file, 'rb') as f:
                inverted_index = pickle.load(f)

            stopwords = helpers.get_stopwords(stopwords_file)

            dictionary = set(inverted_index.keys())

            # Preprocess the query text read from the uploaded file

            query = textprocessing.preprocess_text(query, stopwords)
            query = [word for word in query if word in dictionary]
            query = Counter(query)

            # Compute weights for words in query
            for word, value in query.items():
                query[word] = inverted_index[word]['idf'] * (1 + math.log(value))

            helpers.normalize(query)

            scores = [[i, 0] for i in range(len(docs))]
            for word, value in query.items():
                for doc in inverted_index[word]['postings_list']:
                    index, weight = doc
                    scores[index][1] += value * weight

            scores.sort(key=lambda doc: doc[1], reverse=True)

            all_docs = []
            all_scores = []
            for index, score in enumerate(scores):
                if score[1] == 0:
                    break
                all_docs.append(docs[score[0]])
                all_scores.append(score[1])
            return render_template('docindex.html',
                                   query_path=secure_filename(file.filename),
                                   docs=zip(all_docs, all_scores))

    else:
        return render_template('index.html')