def remove_stop_words():
    src_dir = 'category_tokenizer'
    dst_dir = 'category_non_stop_words'
    stop_multiple_words = ['bác sỹ']
    stop_words = [
        'có', 'em', 'bị', 'và', 'bác_sĩ', 'tôi', 'thì', 'bệnh_viện',
        'medlatec', 'cám_ơn', 'cảm_ơn', 'nguyên_nhân', 'hỏi', 'e', 'ạ',
        'vâng', 'bs', 'bv', 'nhưng', 'chào', 'là'
    ]

    # Collect only the file names in the top level of src_dir.
    filenames = []
    for (dirpath, dirnames, filenames1) in os.walk(src_dir):
        filenames = filenames1
        break

    for filename in filenames:
        # Truncate the destination file before appending line by line.
        with open(f'{dst_dir}/{filename}', 'w') as fp1:
            pass
        with open(f'{src_dir}/{filename}', 'r') as fp:
            for line in fp:
                try:
                    title, q = line.split('\t')
                except Exception as e:
                    # Skip malformed lines that do not split into title<TAB>question.
                    print(e)
                    print(line.split('\t'))
                    print(filename)
                    continue
                dst_title = my_preproc.remove_stop_words(
                    title, stop_multiple_words, stop_words)
                dst_q = my_preproc.remove_stop_words(
                    q, stop_multiple_words, stop_words)
                with open(f'{dst_dir}/{filename}', 'a') as fp1:
                    fp1.write(f'{dst_title}\t{dst_q}\n')
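# The function above delegates the actual filtering to my_preproc.remove_stop_words,
# which is not shown here. A minimal sketch of what such a helper could look like,
# assuming the input is space-separated tokenised text and that multi-word stop
# phrases are stripped before single stop tokens (the name below is hypothetical):
def remove_stop_words_sketch(text, stop_multiple_words, stop_words):
    """Hypothetical stand-in for my_preproc.remove_stop_words."""
    # Drop multi-word stop phrases first so their parts are not left behind.
    for phrase in stop_multiple_words:
        text = text.replace(phrase, ' ')
    # Then drop single stop tokens.
    tokens = [t for t in text.split() if t.lower() not in stop_words]
    return ' '.join(tokens)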
def ranked_retrieval(queries, collection_table, doc_nums, inverted_index, stop_words):
    ranked_scores = {}
    for query_index, query in enumerate(queries):
        query_tokens = normalise(remove_stop_words(tokenise(query), stop_words))

        # Convert query into an OR boolean search and use eval to evaluate it
        boolean_vectors = []
        for token in query_tokens:
            boolean_vector = collection_table[token]
            boolean_vectors.append('np.array([{}])'.format(
                array_to_string(boolean_vector)))
        query_eval_string = ' | '.join(boolean_vectors)
        query_documents = boolean_search(query_eval_string, doc_nums)

        # Score every document matched by the boolean search
        query_scores = []
        for doc in query_documents:
            score = TFIDF(doc, query_tokens, len(doc_nums), inverted_index)
            query_scores.append((doc, score))

        # Sort scores for each query in descending order
        query_scores = sorted(query_scores, key=lambda x: x[1], reverse=True)
        ranked_scores[query_index + 1] = query_scores
    return ranked_scores
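# ranked_retrieval assumes a TFIDF(doc, query_tokens, num_docs, inverted_index)
# helper that is not shown. A minimal sketch, assuming inverted_index maps
# token -> {doc_id: [positions]} and the standard (1 + log10(tf)) * log10(N / df)
# weighting summed over query terms that occur in the document:
import math

def TFIDF_sketch(doc, query_tokens, num_docs, inverted_index):
    """Hypothetical stand-in for the TFIDF helper used above."""
    score = 0.0
    for token in query_tokens:
        postings = inverted_index.get(token, {})
        if doc not in postings:
            continue
        tf = len(postings[doc])   # term frequency in this document
        df = len(postings)        # number of documents containing the term
        score += (1 + math.log10(tf)) * math.log10(num_docs / df)
    return score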
def process(predictions):
    predictions = np.array([remove_URL(text) for text in predictions])
    predictions = np.array([remove_punct(text) for text in predictions])
    predictions = np.array([remove_case_white_space(text) for text in predictions])
    predictions = np.array([remove_stop_words(text) for text in predictions])
    predictions = remove_empty(predictions)

    # tokenize test data
    predictions = tokenizer.texts_to_sequences(predictions)

    # pad test data
    predictions = pad_sequences(
        predictions,
        maxlen=max_seq_length,
        padding='post',
        truncating='post'
    )

    return predictions
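# The cleaning helpers used by process() (remove_URL, remove_punct,
# remove_case_white_space, remove_stop_words, remove_empty) are defined elsewhere.
# A minimal sketch of the first two, assuming simple regex/translation-based
# cleaning; names and exact behaviour are illustrative, not the original code:
import re
import string

def remove_URL_sketch(text):
    """Strip http(s) and www links from a piece of text."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_punct_sketch(text):
    """Drop ASCII punctuation characters."""
    return text.translate(str.maketrans('', '', string.punctuation))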
def build_corpus_dictionary():
    input_path = 'dataset/comments_array.json'
    json_array = preprocess.load_json_file(input_path)
    field_array = ['content']
    str_list = preprocess.extract_from_json(json_array, field_array)
    texts = preprocess.tokenize(str_list)
    removed_texts = preprocess.remove_stop_words(texts)
    dictionary = corpora.Dictionary(texts)
    corpus = preprocess.convert_texts_to_corpus(removed_texts, dictionary)
    import ipdb; ipdb.set_trace()
def dicts_to_vectors(dicts, explicit_keys=None, remove_stop_words=False):
    """Convert a list of dictionaries to feature-vectors"""
    if not explicit_keys:
        node_set = set()
        for d in dicts:
            for node in d.keys():
                node_set.add(node)
        all_tokens = list(node_set)
    else:
        all_tokens = explicit_keys
    if remove_stop_words:
        all_tokens = preprocess.remove_stop_words(all_tokens)

    features = np.zeros((len(all_tokens), len(dicts)))
    for i, d in enumerate(dicts):
        if i % 100 == 0:
            print(' vector', str(i) + '/' + str(len(dicts)))
        features[:, i] = [d.get(token, 0.0) for token in all_tokens]
    return features
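# Illustrative usage of dicts_to_vectors, assuming each input dict maps a token
# to a numeric weight (for example per-document term frequencies); the data
# below is made up for the example:
import numpy as np  # required by dicts_to_vectors above

example_dicts = [
    {'cat': 2.0, 'dog': 1.0},
    {'dog': 3.0, 'fish': 1.0},
]
features = dicts_to_vectors(example_dicts)
print(features.shape)  # (3, 2): one row per token, one column per input dict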
def main():
    input_path = 'dataset/taipei_city.json'
    json_array = preprocess.load_json_file(input_path)
    field_array = ['content']
    str_list, answer = preprocess.extract_from_json_with_answer(json_array['data'], field_array)
    texts = preprocess.tokenize(str_list)
    removed_texts = preprocess.remove_stop_words(texts)
    #dictionary = pickle.load(open('dictionary.obj', 'rb'))
    dictionary = corpora.Dictionary(removed_texts)
    data_corpus = preprocess.convert_texts_to_corpus(removed_texts, dictionary)
    #corpus = pickle.load(open('corpus.obj', 'rb'))
    result_table = pd.DataFrame()

    # preprocess with Tfidf model
    params = {"corpus": data_corpus}
    X, y = convert_to_X_y(TfidfModel, params, data_corpus, answer)
    result_table = train_with_dummy(result_table, X, y, 'tfidf')
    result_table = train_with_random_forest(result_table, X, y, 'tfidf')
    result_table = train_with_logistic_regression(result_table, X, y, 'tfidf')

    '''
    # preprocess with lda model
    for num_topics in [10, 50, 100, 150, 200]:
        params = {"corpus": data_corpus, "num_topics": num_topics}
        X, y = convert_to_X_y(LdaModel, params, data_corpus, answer)
        result_table = train_with_dummy(result_table, X, y, 'lda_'+str(params['num_topics']))
        result_table = train_with_random_forest(result_table, X, y, 'lda_'+str(params['num_topics']))
        result_table = train_with_logistic_regression(result_table, X, y, 'lda_'+str(params['num_topics']))

    # preprocess with lsi model
    for num_topics in [10, 50, 100, 150, 200]:
        params = {"corpus": data_corpus, "num_topics": num_topics}
        X, y = convert_to_X_y(LsiModel, params, data_corpus, answer)
        result_table = train_with_dummy(result_table, X, y, 'lsi_'+str(params['num_topics']))
        result_table = train_with_random_forest(result_table, X, y, 'lsi_'+str(params['num_topics']))
        result_table = train_with_logistic_regression(result_table, X, y, 'lsi_'+str(params['num_topics']))
    '''

    output_file = sys.argv[1]
    result_table.to_csv(output_file, sep='\t')
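# main() relies on helpers from a local preprocess module. A minimal sketch of
# convert_texts_to_corpus, under the assumption that it simply maps each
# tokenised text to a gensim bag-of-words vector (the name below is a
# hypothetical stand-in, not the original implementation):
from gensim import corpora

def convert_texts_to_corpus_sketch(texts, dictionary):
    """texts is a list of token lists; returns a list of doc2bow vectors."""
    return [dictionary.doc2bow(text) for text in texts]

# Example:
# dictionary = corpora.Dictionary([['good', 'service'], ['bad', 'service']])
# corpus = convert_texts_to_corpus_sketch([['good', 'service']], dictionary)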
def preprocess(doc):
    return stemming(remove_stop_words(tokenise(doc), stop_words))
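# preprocess() chains tokenise, remove_stop_words and stemming, all defined
# elsewhere. A minimal sketch of the stemming step, assuming NLTK's PorterStemmer
# is an acceptable stand-in (the original stemmer is not shown):
from nltk.stem import PorterStemmer

def stemming_sketch(tokens):
    """Stem each token in a list of tokens."""
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]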