from my_class.Document import Document
from modules import json_io


def init_docs(document_list, doc_foldr='data/'):
    """Wrap every raw file in a Document and build name<->id lookup tables."""
    doc_id = 1
    documents = []
    doc_hash = {}
    id_hash = {}
    for doc in document_list:
        documents.append(Document(doc_id, doc, doc_foldr))
        doc_hash[doc] = doc_id
        id_hash[doc_id] = doc
        doc_id += 1
    # Persist both mappings so later stages can resolve ids without re-scanning.
    json_io.write_json('output/doc_hash.json', doc_hash)
    json_io.write_json('output/id_hash.json', id_hash)
    return documents
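# Hedged usage sketch (not part of the original module): assuming init_docs()
# is wired to get_docs_list() from doc_preprocessing, as the other scripts do,
# the preprocessing step would look roughly like this. The 'data/' folder is
# just the default argument above.
if __name__ == '__main__':
    from doc_preprocessing import get_docs_list  # assumed location of the helper
    docs = init_docs(get_docs_list('data/'), 'data/')
    print '%d documents initialised' % len(docs)
    # output/doc_hash.json now maps file name -> id; output/id_hash.json the reverse.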
    return zh_en_group, en, zh  # tail of split_zh_en(), whose body is truncated above


def lan_output(foldr, file_name, content):
    with open(foldr + file_name, 'w') as f:
        f.write(content.encode('utf-8'))


if __name__ == '__main__':
    #result = split_zh_en(sys.argv[1].decode('utf-8'))
    if len(sys.argv) >= 4:
        doc_input = sys.argv[1]
        en_output = sys.argv[2]
        zh_output = sys.argv[3]
    else:
        doc_input = 'output/processed_data/'
        en_output = 'output/en_doc/'
        zh_output = 'output/zh_doc/'

    document_list = get_docs_list(doc_input)
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        # Accumulate the split text per document and write each language once;
        # calling lan_output() inside the line loop would overwrite the file
        # on every line, leaving only the last one.
        en_content = ''
        zh_content = ''
        for line in doc_obj.get_lines():
            result, en, zh = split_zh_en(line.decode('utf-8'))
            en_content += en
            zh_content += zh
        lan_output(en_output, doc, en_content)
        lan_output(zh_output, doc, zh_content)
        del doc_obj
        doc_id += 1
import sys

from my_class.Document import Document
from my_class.Tokenizer import Tokenizer
from doc_preprocessing import get_docs_list
from modules import json_io
from modules import csv_io

if __name__ == '__main__':
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/en_doc/'

    document_list = get_docs_list(doc_input)
    tokenizer = Tokenizer()
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        # Tokenize, drop stop words, keep digits as-is and stem everything else.
        normalize_tokens = []
        for line in doc_obj.get_lines():
            tokens = tokenizer.to_tokens(line.decode('utf-8'))
            for token in tokens:
                if tokenizer.is_stop_word(token):
                    continue
                elif token.isdigit():
                    normalize_tokens.append(token.encode('utf-8'))
                else:
                    token = tokenizer.stemming(token)
                    normalize_tokens.append(token.encode('utf-8'))
        csv_io.write_csv('output/en_tokens/' + doc, [normalize_tokens])
        del doc_obj
        doc_id += 1
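# Illustrative stand-in (not the project's Tokenizer): a stub with the same
# three-method interface, only to show how one line flows through the branches
# above. Its stop-word list and "stemming" rule are invented for the demo.
class StubTokenizer(object):
    stop_words = set(['the', 'a', 'of'])

    def to_tokens(self, text):
        return text.lower().split()

    def is_stop_word(self, token):
        return token in self.stop_words

    def stemming(self, token):
        # crude placeholder for a real stemmer
        return token[:-1] if token.endswith('s') else token

# Run over u'The cats saw 3 dogs', the loop above would keep ['cat', 'saw', '3', 'dog'].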
import sys

from my_class.Document import Document
from my_class.DataDB import DataDB  # assumed path: imported like Document elsewhere
from modules import json_io
from doc_preprocessing import get_docs_list

if __name__ == '__main__':
    if len(sys.argv) >= 2:
        data_dir = sys.argv[1]
    else:
        data_dir = 'output/processed_data/'
    # The database settings always come from config.json; only the data folder
    # is overridable on the command line.
    config = json_io.read_json('config.json')[u'database']

    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(data_dir)
    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'], config[u'encoding'], "")

    table_name = "documents"
    key_list = ['doc_id', 'content']
    for doc in document_list:
        doc_obj = Document(doc_hash[doc], doc, data_dir)
        # Swap single quotes for double quotes so the content does not break the SQL insert.
        content = doc_obj.read().replace("'", '"')
        data_list = [str(doc_hash[doc]), content]
        mydb.insert_data(table_name, key_list, data_list)
        del doc_obj
    mydb.close()
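# The script above only assumes a two-column "documents" table matching
# key_list. A minimal sketch of that table, using the sqlite3 stdlib module
# instead of the project's DataDB wrapper (the column types are assumptions):
import sqlite3

def create_documents_table(db_path):
    conn = sqlite3.connect(db_path)
    conn.execute('CREATE TABLE IF NOT EXISTS documents ('
                 'doc_id INTEGER PRIMARY KEY, '
                 'content TEXT)')
    conn.commit()
    conn.close()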
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import sys

from my_class.Document import Document
from doc_preprocessing import get_docs_list
from modules import csv_io


def n_gram(content, n):
    tokens = []
    for i in range(len(content) - n + 1):
        tokens.append(content[i:i + n].encode('utf-8'))
    return tokens


if __name__ == '__main__':
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/zh_doc/'

    document_list = get_docs_list(doc_input)
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        content = doc_obj.read().decode('utf-8')
        tokens = n_gram(content, 2)
        csv_io.write_csv('output/zh_tokens/' + doc, [tokens])
        del doc_obj
        doc_id += 1
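# Quick check of the sliding-window behaviour (illustrative only, not part of
# the original script): a 4-character string yields len - n + 1 = 3 overlapping
# bigrams, each UTF-8 encoded. The sample string is arbitrary.
assert n_gram(u'中文斷詞', 2) == [u'中文'.encode('utf-8'),
                                  u'文斷'.encode('utf-8'),
                                  u'斷詞'.encode('utf-8')]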