def query(input):
    # Preprocess query
    input = textprocessing.preprocess_text(input, stopwords)
    input = [word for word in input if word in dictionary]
    query = Counter(input)

    # Compute weights for words in query
    for word, value in query.items():
        query[word] = inverted_index[word]['idf'] * (1 + math.log(value))
    helpers.normalize(query)

    scores = [[i, 0] for i in range(len(docs))]
    for word, value in query.items():
        for doc in inverted_index[word]['postings_list']:
            index, weight = doc
            scores[index][1] += value * weight
    scores.sort(key=lambda doc: doc[1], reverse=True)

    print('----- Results ------ ')
    for index, score in enumerate(scores):
        if score[1] == 0:
            break
        print('{}. {} - {}'.format(index + 1, docs[score[0]], score[1]))
def predict_for_file(input_file, output_file, model, batch_size=32, to_normalize=False):
    test_data = read_lines(input_file)
    predictions = []
    cnt_corrections = 0
    batch = []
    for sent in test_data:
        batch.append(sent.split())
        if len(batch) == batch_size:
            preds, cnt = model.handle_batch(batch)
            predictions.extend(preds)
            cnt_corrections += cnt
            batch = []
    if batch:
        preds, cnt = model.handle_batch(batch)
        predictions.extend(preds)
        cnt_corrections += cnt

    result_lines = [" ".join(x) for x in predictions]
    if to_normalize:
        result_lines = [normalize(line) for line in result_lines]

    with open(output_file, 'w') as f:
        f.write("\n".join(result_lines) + '\n')
    return cnt_corrections
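# A minimal usage sketch for predict_for_file (not from the source). It assumes a
# model object exposing handle_batch(batch) -> (list_of_token_lists, num_corrections),
# with read_lines() and normalize() importable; load_model, "model.th", "input.txt"
# and "predictions.txt" are hypothetical names used only for illustration.
if __name__ == "__main__":
    model = load_model("model.th")  # hypothetical loader returning such a model
    n_corrections = predict_for_file("input.txt", "predictions.txt", model,
                                     batch_size=32, to_normalize=False)
    print("Total corrections applied: {}".format(n_corrections))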
def init(self):
    if self.url:
        self.request_url()
    self.content = normalize(self.content)
    self.prepare()
    self.distribute()
    self.modify_extract()
def run_model_estimation(int_point, y_elec, bad_trials, surprise_reg=None, model_type="OLS"):
    """
    Inputs: int_point - sampling point in interstimulus interval
            y_elec - array with eeg recordings (num_trials x num_interstim_rec)
            surprise_reg - num_trials x 1 surprise from Bayesian learning model
            model_type - regression model
    Output: Time-series of log model evidence/Negative free energy from VI on Bayesian model
    """
    # Normalize the data and regressor to lie within 0, 1
    y_std = normalize(y_elec[:, int_point])
    surprise_reg_std = normalize(surprise_reg)

    # Select specific model OLS/Hierarchical
    if model_type == "OLS":
        model = OLS_model(y_std, bad_trials, surprise_reg_std)
    elif model_type == "Hierarchical":
        model = Hierarchical_model(y_std, bad_trials, surprise_reg_std)
    elif model_type == "Bayesian-MLP":
        model = Bayesian_NN(y_std, bad_trials, surprise_reg_std)
    elif model_type == "Null":
        model = Null_model(y_std, bad_trials)
    else:
        raise ValueError("Provide a valid model type")

    # Run the Variational Inference scheme with ADVI
    # ADVI - Automatic Differentiation VI
    with model:
        inference = pm.ADVI()
        approx = pm.fit(method=inference,
                        callbacks=[pm.callbacks.CheckParametersConvergence(diff='absolute')],
                        n=30000, progressbar=0)

    # return full optimization trace of free energy
    return -approx.hist
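# A hedged usage sketch (not from the source): looping run_model_estimation over the
# interstimulus sampling points to collect the per-point log model evidence / negative
# free-energy traces. y_elec, bad_trials and surprise_reg stand for arrays produced by
# the actual EEG preprocessing pipeline and are assumed to exist here.
free_energy_traces = []
for t in range(y_elec.shape[1]):
    free_energy_traces.append(
        run_model_estimation(t, y_elec, bad_trials,
                             surprise_reg=surprise_reg, model_type="OLS"))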
def preproc_surprisal(SP, AP, TP, norm):
    time = SP["time"]
    hidden = SP["hidden"]
    sequence = SP["sequence"]

    sequence[sequence == 2] = 0.5
    hidden[hidden == 2] = 0.5
    catch_id = np.argwhere(sequence == 0.5)

    if norm:
        PS = [normalize(SP["predictive_surprise"], catch_id),
              normalize(AP["predictive_surprise"], catch_id),
              normalize(TP["predictive_surprise"], catch_id)]
        BS = [normalize(SP["bayesian_surprise"], catch_id),
              normalize(AP["bayesian_surprise"], catch_id),
              normalize(TP["bayesian_surprise"], catch_id)]
        CS = [normalize(SP["confidence_corrected_surprise"], catch_id),
              normalize(AP["confidence_corrected_surprise"], catch_id),
              normalize(TP["confidence_corrected_surprise"], catch_id)]
    else:
        PS = [SP["predictive_surprise"], AP["predictive_surprise"], TP["predictive_surprise"]]
        BS = [SP["bayesian_surprise"], AP["bayesian_surprise"], TP["bayesian_surprise"]]
        CS = [SP["confidence_corrected_surprise"], AP["confidence_corrected_surprise"],
              TP["confidence_corrected_surprise"]]

    return time, hidden, sequence, PS, BS, CS
def process(cls, task):
    process_type = task['process_type']
    if process_type == ProcessType.XPATH:
        mapping = cls.process_xpath(task['content'], task['rules'])
    elif process_type == ProcessType.CSS_SELECT:
        mapping = cls.process_css_select(task['content'], task['rules'])
    elif process_type == ProcessType.JSON:
        mapping = cls.process_json(task['content'], task['rules'])
    elif process_type == ProcessType.AUTO_MATCH:
        mapping = cls.process_auto_match(task['content'], task['rules'])
        mapping['content'] = normalize(task['content'])
    else:
        raise Exception

    # Notify the scheduler to crawl the follow-up links
    valid_links = extract_valid_links(task['content'], task['valid_link_regex'], task['domain'])
    result = cls.prepare_result(task['proj_id'], task['url'], task['catalog'], task['domain'],
                                task['task_id'], mapping, valid_links)
    return result
stopwords = helpers.get_stopwords(stopwords_file)
docs = helpers.get_docs(dataset_path)

corpus = []
for doc in docs:
    with open(doc, mode='r') as f:
        text = f.read()
        words = textprocessing.preprocess_text(text, stopwords)
        bag_of_words = Counter(words)
        corpus.append(bag_of_words)

idf = helpers.compute_idf(corpus)
for doc in corpus:
    helpers.compute_weights(idf, doc)
    helpers.normalize(doc)

inverted_index = helpers.build_inverted_index(idf, corpus)

docs_file = os.path.join(data_path, 'docs.pickle')
inverted_index_file = os.path.join(data_path, 'inverted_index.pickle')
dictionary_file = os.path.join(data_path, 'dictionary.txt')

# Serialize data
with open(docs_file, 'wb') as f:
    pickle.dump(docs, f)
with open(inverted_index_file, 'wb') as f:
    pickle.dump(inverted_index, f)
with open(dictionary_file, 'w') as f:
    # Assumed completion: the original snippet is truncated here; one plausible
    # body writes the vocabulary of the inverted index, one term per line.
    f.write('\n'.join(inverted_index.keys()))
stopwords = helpers.get_stopwords(stopwords_file)
dictionary = set(inverted_index.keys())

# Get query from command line
query = sys.argv[1]

# Preprocess query
query = textprocessing.preprocess_text(query, stopwords)
query = [word for word in query if word in dictionary]
query = Counter(query)

# Compute weights for words in query
for word, value in query.items():
    query[word] = inverted_index[word]['idf'] * (1 + math.log(value))
helpers.normalize(query)

scores = [[i, 0] for i in range(len(docs))]
for word, value in query.items():
    for doc in inverted_index[word]['postings_list']:
        index, weight = doc
        scores[index][1] += value * weight
scores.sort(key=lambda doc: doc[1], reverse=True)

print('----- Results ------ ')
for index, score in enumerate(scores):
    if score[1] == 0:
        break
    print('{}. {} - {}'.format(index + 1, docs[score[0]], score[1]))
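# A self-contained toy sketch of the same tf-idf query scoring (not from the source):
# toy_index and toy_query are hypothetical, and the numbers are illustrative only.
# Query term weight = idf * (1 + log(tf)), the query vector is length-normalized, and
# each document score is the dot product of the query vector with its postings weights.
import math
from collections import Counter

toy_index = {
    'ship':  {'idf': 1.5, 'postings_list': [(0, 0.8), (1, 0.3)]},
    'storm': {'idf': 0.7, 'postings_list': [(1, 0.9)]},
}
toy_query = Counter(['ship', 'ship', 'storm'])

weights = {w: toy_index[w]['idf'] * (1 + math.log(tf)) for w, tf in toy_query.items()}
norm = math.sqrt(sum(v * v for v in weights.values()))
weights = {w: v / norm for w, v in weights.items()}

toy_scores = [0.0, 0.0]
for w, v in weights.items():
    for doc_id, doc_weight in toy_index[w]['postings_list']:
        toy_scores[doc_id] += v * doc_weight
print(toy_scores)  # higher score = better match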
def index():
    if request.method == 'POST':
        file = request.files['query_data']
        filename = secure_filename(file.filename)
        f = filename.split('.')
        if f[1] == 'jpg':
            img = Image.open(file.stream)  # PIL image
            uploaded_img_path = "static/uploaded/" + datetime.now().isoformat() + "_" + file.filename
            img.save(uploaded_img_path)

            query = fe.extract(img)
            dists = np.linalg.norm(features - query, axis=1)  # Do search
            ids = np.argsort(dists)[:30]  # Top 30 results
            scores = [(dists[id], img_paths[id]) for id in ids]

            return render_template('index.html',
                                   query_path=uploaded_img_path,
                                   scores=scores)
        else:
            query = file.read().decode("utf-8")
            docs_file = os.path.join(os.getcwd(), 'data', 'docs.pickle')
            inverted_index_file = os.path.join(os.getcwd(), 'data', 'inverted_index.pickle')
            stopwords_file = os.path.join(os.getcwd(), 'resources', 'stopwords_en.txt')

            # Deserialize data
            with open(docs_file, 'rb') as f:
                docs = pickle.load(f)
            with open(inverted_index_file, 'rb') as f:
                inverted_index = pickle.load(f)

            stopwords = helpers.get_stopwords(stopwords_file)
            dictionary = set(inverted_index.keys())

            # Preprocess query
            query = textprocessing.preprocess_text(query, stopwords)
            query = [word for word in query if word in dictionary]
            query = Counter(query)

            # Compute weights for words in query
            for word, value in query.items():
                query[word] = inverted_index[word]['idf'] * (1 + math.log(value))
            helpers.normalize(query)

            scores = [[i, 0] for i in range(len(docs))]
            for word, value in query.items():
                for doc in inverted_index[word]['postings_list']:
                    index, weight = doc
                    scores[index][1] += value * weight
            scores.sort(key=lambda doc: doc[1], reverse=True)

            all_docs = []
            all_scores = []
            for index, score in enumerate(scores):
                if score[1] == 0:
                    break
                all_docs.append(docs[score[0]])
                all_scores.append(score[1])

            return render_template('docindex.html',
                                   query_path=secure_filename(file.filename),
                                   docs=zip(all_docs, all_scores))
    else:
        return render_template('index.html')