def init_docs(document_list, doc_foldr='data/'):
    """Wrap each filename in *document_list* in a Document and assign ids.

    Ids are assigned sequentially starting at 1, in list order. Two lookup
    tables are persisted as JSON side effects:
    - output/doc_hash.json : filename -> doc_id
    - output/id_hash.json  : doc_id -> filename

    Returns the list of Document objects.
    """
    documents = []
    doc_hash = {}
    id_hash = {}
    # enumerate(start=1) replaces the original hand-rolled doc_id counter.
    for doc_id, doc in enumerate(document_list, start=1):
        documents.append(Document(doc_id, doc, doc_foldr))
        doc_hash[doc] = doc_id
        id_hash[doc_id] = doc
    json_io.write_json('output/doc_hash.json', doc_hash)
    json_io.write_json('output/id_hash.json', id_hash)
    return documents
import sys

# Fix: the original used sys.argv and Document without importing them.
from my_class.Document import Document
from my_class.Tokenizer import Tokenizer
from doc_preprocessing import get_docs_list
from modules import json_io
from modules import csv_io

if __name__ == '__main__':
    # Input directory of raw documents; may be overridden on the command line.
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/en_doc/'

    document_list = get_docs_list(doc_input)
    tokenizer = Tokenizer()
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        # Tokenize each line, then normalize: stop words are dropped,
        # pure digits are kept verbatim, everything else is stemmed.
        normalize_tokens = []
        for line in doc_obj.get_lines():
            tokens = tokenizer.to_tokens(line.decode('utf-8'))
            for token in tokens:
                if tokenizer.is_stop_word(token):
                    # Original assigned token = "" here but never appended it,
                    # i.e. stop words are simply discarded.
                    continue
                elif token.isdigit():
                    normalize_tokens.append(token.encode('utf-8'))
                else:
                    token = tokenizer.stemming(token)
                    normalize_tokens.append(token.encode('utf-8'))
        # One CSV row per document of its normalized tokens.
        csv_io.write_csv('output/en_tokens/' + doc, [normalize_tokens])
        del doc_obj
        doc_id += 1
import sys

from my_class.Document import Document
from modules import json_io
from doc_preprocessing import get_docs_list

if __name__ == '__main__':
    # Fix: the original set `config = sys.argv[1]` in the argv branch,
    # clobbering the config dict with the directory-path string, so
    # config[u'dbtype'] below raised TypeError. The DB config must always
    # be loaded from config.json; only data_dir comes from the CLI.
    if len(sys.argv) >= 2:
        data_dir = sys.argv[1]
    else:
        data_dir = 'output/processed_data/'
    config = json_io.read_json('config.json')[u'database']

    # filename -> doc_id mapping produced by the init step.
    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(data_dir)

    # NOTE(review): DataDB is never imported in this file — confirm which
    # module provides it and add the import at the top.
    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'],
                  config[u'encoding'], "")
    table_name = "documents"
    key_list = ['doc_id', 'content']
    for doc in document_list:
        doc_obj = Document(doc_hash[doc], doc, data_dir)
        # Replace single quotes so the content survives SQL string quoting.
        # NOTE(review): if insert_data builds SQL by string concatenation this
        # is still injectable — prefer parameterized queries; confirm.
        content = doc_obj.read().replace("'", '"')
        data_list = [str(doc_hash[doc]), content]
        mydb.insert_data(table_name, key_list, data_list)
        del doc_obj
    mydb.close()