Example #1
def remove_stop_words():
    src_dir = 'category_tokenizer'
    dst_dir = 'category_non_stop_words'
    stop_multiple_words = ['bác sỹ']

    stop_words = [
        'có', 'em', 'bị', 'và', 'bác_sĩ', 'tôi', 'thì', 'bệnh_viện',
        'medlatec', 'cám_ơn', 'cảm_ơn', 'nguyên_nhân', 'hỏi', 'e', 'ạ', 'vâng',
        'bs', 'bv', 'nhưng', 'chào', 'là'
    ]

    # Collect the file names in the top level of the source directory.
    filenames = []
    for (dirpath, dirnames, filenames1) in os.walk(src_dir):
        filenames = filenames1
        break

    for filename in filenames:
        # Open the destination file once per source file instead of
        # truncating it and then re-opening it in append mode for every line.
        with open(f'{src_dir}/{filename}', 'r') as fp, \
                open(f'{dst_dir}/{filename}', 'w') as fp1:
            for line in fp:
                try:
                    title, q = line.split('\t')
                except ValueError as e:
                    # Skip malformed lines instead of reusing stale values.
                    print(e)
                    print(filename, repr(line))
                    continue

                dst_title = my_preproc.remove_stop_words(
                    title, stop_multiple_words, stop_words)
                dst_q = my_preproc.remove_stop_words(
                    q, stop_multiple_words, stop_words)
                fp1.write(f'{dst_title}\t{dst_q}\n')
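This example assumes module-level imports (`os`) and a project-specific `my_preproc` module. Purely as an illustration, a helper with the same signature as `my_preproc.remove_stop_words` could look like the sketch below (the real implementation is not shown above):

# Hypothetical stand-in for my_preproc.remove_stop_words; the project's own
# helper may behave differently.
def remove_stop_words(text, stop_multiple_words, stop_words):
    # Drop multi-word stop phrases first (they still contain spaces).
    for phrase in stop_multiple_words:
        text = text.replace(phrase, ' ')
    # Then drop single-token stop words from the whitespace-tokenised text.
    tokens = [tok for tok in text.split() if tok.lower() not in stop_words]
    return ' '.join(tokens)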
Example #2
def ranked_retrieval(queries, collection_table, doc_nums, inverted_index,
                     stop_words):
    ranked_scores = {}

    for query_index, query in enumerate(queries):
        query_tokens = normalise(remove_stop_words(tokenise(query),
                                                   stop_words))

        # Convert query into an OR boolean search and use eval to evaluate it
        boolean_vectors = []
        for token in query_tokens:
            boolean_vector = collection_table[token]
            boolean_vectors.append('np.array([{}])'.format(
                array_to_string(boolean_vector)))

        query_eval_string = ' | '.join(boolean_vectors)
        query_documents = boolean_search(query_eval_string, doc_nums)

        query_scores = []
        # Score each matching document against the query with TF-IDF
        for doc in query_documents:
            score = TFIDF(doc, query_tokens, len(doc_nums), inverted_index)
            query_scores.append((doc, score))

        # Sort the scores for each query in descending order
        query_scores = sorted(query_scores, key=lambda x: x[1], reverse=True)
        ranked_scores[query_index + 1] = query_scores

    return ranked_scores
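`tokenise`, `normalise`, `boolean_search` and `TFIDF` come from the surrounding module and are not shown here. A minimal sketch of a TF-IDF scorer that matches the call above, assuming `inverted_index[term]` maps document ids to lists of term positions (both the index layout and the exact weighting are assumptions):

import math

# Hypothetical scorer: sum (1 + log tf) * log(N / df) over the query terms
# that occur in the document.
def TFIDF(doc, query_tokens, num_docs, inverted_index):
    score = 0.0
    for term in query_tokens:
        postings = inverted_index.get(term, {})
        if doc not in postings:
            continue
        tf = len(postings[doc])    # occurrences of the term in this document
        df = len(postings)         # number of documents containing the term
        score += (1 + math.log10(tf)) * math.log10(num_docs / df)
    return score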
Example #3
def process(predictions):
    predictions = np.array([remove_URL(text) for text in predictions])
    predictions = np.array([remove_punct(text) for text in predictions])
    predictions = np.array([remove_case_white_space(text)
                            for text in predictions])
    predictions = np.array([remove_stop_words(text) for text in predictions])
    predictions = remove_empty(predictions)

    # tokenize test data
    predictions = tokenizer.texts_to_sequences(predictions)
    # pad test data
    predictions = pad_sequences(
        predictions, maxlen=max_seq_length, padding='post', truncating='post'
    )
    return predictions
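The cleaning helpers used here (`remove_URL`, `remove_punct`, `remove_case_white_space`, `remove_stop_words`, `remove_empty`) and the fitted `tokenizer` live in the surrounding module. As a rough illustration only, the stop-word step could be written with NLTK's English stop word list (an assumption, not the project's actual code):

from nltk.corpus import stopwords  # requires nltk.download('stopwords')

STOP_WORDS = set(stopwords.words('english'))

# Hypothetical version of remove_stop_words(text) as called in process().
def remove_stop_words(text):
    return ' '.join(word for word in text.split() if word not in STOP_WORDS)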
Example #4
def build_corpus_dictionary():

    input_path = 'dataset/comments_array.json'
    json_array = preprocess.load_json_file(input_path)
    
    field_array = ['content']
    str_list = preprocess.extract_from_json(json_array, field_array)

    texts = preprocess.tokenize(str_list)
    removed_texts = preprocess.remove_stop_words(texts)
    
    # Build the dictionary from the stop-word-filtered texts, as in main().
    dictionary = corpora.Dictionary(removed_texts)
    corpus = preprocess.convert_texts_to_corpus(removed_texts, dictionary)

    return dictionary, corpus
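`preprocess.convert_texts_to_corpus` is project code; with gensim this step is usually a plain bag-of-words conversion, so a plausible (assumed) version is:

# Hypothetical sketch of preprocess.convert_texts_to_corpus.
def convert_texts_to_corpus(texts, dictionary):
    # One bag-of-words vector (a list of (token_id, count) pairs) per document.
    return [dictionary.doc2bow(text) for text in texts]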
Example #5
def dicts_to_vectors(dicts, explicit_keys=None, remove_stop_words=False):
    """Convert a list of dictionaries to feature-vectors"""
    if not explicit_keys:
        # Collect every key that appears in any of the dictionaries.
        node_set = set()
        for d in dicts:
            for node in d.keys():
                node_set.add(node)
        all_tokens = list(node_set)
    else:
        all_tokens = explicit_keys
    if remove_stop_words:
        all_tokens = preprocess.remove_stop_words(all_tokens)
    # One row per token, one column per dictionary.
    features = np.zeros((len(all_tokens), len(dicts)))
    for i, d in enumerate(dicts):
        if i % 100 == 0:
            print('    vector', str(i) + '/' + str(len(dicts)))
        features[:, i] = [d.get(token, 0.0) for token in all_tokens]
    return features
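A quick usage sketch with toy term-weight dictionaries (the values below are made up for illustration):

import numpy as np

docs = [{'cat': 2.0, 'dog': 1.0}, {'dog': 3.0, 'fish': 1.0}]
features = dicts_to_vectors(docs)
print(features.shape)  # (distinct tokens, documents) -> (3, 2); each column is one document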
Example #6
def dicts_to_vectors(dicts, explicit_keys=None, remove_stop_words=False):
    """Convert a list of dictionaries to feature-vectors"""
    if not explicit_keys:
        node_set = set()
        for d in dicts:
            for node in d.keys():
                node_set.add(node)
        all_tokens = list(node_set)
    else:
        all_tokens = explicit_keys
    if remove_stop_words:
        all_tokens = preprocess.remove_stop_words(all_tokens)
    features = np.zeros((len(all_tokens), len(dicts)))
    for i, d in enumerate(dicts):
        if i % 100 == 0:
            print('    vector', str(i) + '/' + str(len(dicts)))
        features[:, i] = [d.get(token, 0.0) for token in all_tokens]
    return features
Example #7
def main():

	input_path = 'dataset/taipei_city.json'
	json_array = preprocess.load_json_file(input_path)

	field_array = ['content']
	str_list, answer = preprocess.extract_from_json_with_answer(json_array['data'], field_array)

	texts = preprocess.tokenize(str_list)
	removed_texts = preprocess.remove_stop_words(texts)
    
	#dictionary = pickle.load(open('dictionary.obj', 'rb'))
	dictionary = corpora.Dictionary(removed_texts)
	data_corpus = preprocess.convert_texts_to_corpus(removed_texts, dictionary)

	#corpus = pickle.load(open('corpus.obj', 'rb'))
	
	result_table = pd.DataFrame()

	# preprocess with Tfidf model
	params = {"corpus": data_corpus}
	X, y = convert_to_X_y(TfidfModel, params, data_corpus, answer)
	result_table = train_with_dummy(result_table, X, y, 'tfidf')
	result_table = train_with_random_forest(result_table, X, y, 'tfidf')
	result_table = train_with_logistic_regression(result_table, X, y, 'tfidf')

	'''
	# preprocess with lda model
	for num_topics in [10, 50, 100, 150, 200]:
		params = {"corpus": data_corpus, "num_topics": num_topics}
		X, y = convert_to_X_y(LdaModel, params, data_corpus, answer)
		result_table = train_with_dummy(result_table, X, y, 'lda_'+str(params['num_topics']))
		result_table = train_with_random_forest(result_table, X, y, 'lda_'+str(params['num_topics']))
		result_table = train_with_logistic_regression(result_table, X, y, 'lda_'+str(params['num_topics']))
	
	# preprocess with lsi model
	for num_topics in [10, 50, 100, 150, 200]:
		params = {"corpus": data_corpus, "num_topics": num_topics}
		X, y = convert_to_X_y(LsiModel, params, data_corpus, answer)
		result_table = train_with_dummy(result_table, X, y, 'lsi_'+str(params['num_topics']))
		result_table = train_with_random_forest(result_table, X, y, 'lsi_'+str(params['num_topics']))
		result_table = train_with_logistic_regression(result_table, X, y, 'lsi_'+str(params['num_topics']))

	'''
	output_file = sys.argv[1]
	result_table.to_csv(output_file, sep='\t')
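`convert_to_X_y` and the `train_with_*` helpers are project code that is not shown here. A rough sketch of `convert_to_X_y`, assuming it densifies the transformed gensim corpus into a feature matrix (the body below is an assumption based only on the call site):

import numpy as np
from gensim.matutils import corpus2dense

# Hypothetical sketch of convert_to_X_y; the project's real helper may differ.
def convert_to_X_y(model_class, params, corpus, answer):
    model = model_class(**params)        # e.g. TfidfModel(corpus=corpus)
    transformed = list(model[corpus])    # apply the model to every document
    # Infer the feature dimension from the largest id seen in the output.
    num_terms = 1 + max((idx for doc in transformed for idx, _ in doc), default=0)
    X = corpus2dense(transformed, num_terms=num_terms).T  # documents as rows
    y = np.array(answer)
    return X, y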
Example #8
def preprocess(doc):
    return stemming(remove_stop_words(tokenise(doc), stop_words))
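`tokenise`, `remove_stop_words`, `stemming` and the module-level `stop_words` are defined elsewhere in that project. A minimal, assumed set of helpers with the same signatures, using NLTK's Porter stemmer as a stand-in:

import re
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stop_words = {'the', 'a', 'an', 'and', 'of', 'to', 'in'}  # illustrative subset only

# Hypothetical versions of the helpers composed in preprocess().
def tokenise(doc):
    return re.findall(r'[a-z0-9]+', doc.lower())

def remove_stop_words(tokens, stop_words):
    return [tok for tok in tokens if tok not in stop_words]

def stemming(tokens):
    return [stemmer.stem(tok) for tok in tokens]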