return zh_en_group, en, zh def lan_output(foldr, file_name, content): with open(foldr+file_name, 'w') as f: f.write(content.encode('utf-8')) if __name__=='__main__': #result = split_zh_en(sys.argv[1].decode('utf-8')) if len(sys.argv) >= 4: doc_input = sys.argv[1] en_output = sys.argv[2] zh_output = sys.argv[3] else: doc_input = 'output/processed_data/' en_output = 'output/en_doc/' zh_output = 'output/zh_doc/' document_list = get_docs_list(doc_input) for doc in document_list: doc_id = 1 doc_obj = Document(doc_id, doc, doc_input) for line in doc_obj.get_lines(): result, en, zh = split_zh_en(line.decode('utf-8')) lan_output(en_output, doc, en) lan_output(zh_output, doc, zh) del doc_obj doc_id += 1
# BUG FIX: `sys` is used below but was not imported in this chunk; importing
# it here is harmless if it is also imported elsewhere in the file.
import sys

from doc_preprocessing import get_docs_list
from modules import json_io
from modules import csv_io

# NOTE(review): Tokenizer and Document are referenced below but not imported
# in this chunk — assumed to be in scope from earlier in the file; confirm.

if __name__ == '__main__':
    # Input directory of English documents: argv override or project default.
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/en_doc/'

    document_list = get_docs_list(doc_input)
    tokenizer = Tokenizer()
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        # Tokenize every line: drop stop words, keep pure-digit tokens
        # verbatim, stem everything else; store tokens UTF-8 encoded.
        normalize_tokens = []
        for line in doc_obj.get_lines():
            for token in tokenizer.to_tokens(line.decode('utf-8')):
                if tokenizer.is_stop_word(token):
                    # BUG FIX (minor): original assigned token = "" here and
                    # fell through without appending — a dead assignment.
                    # Skipping directly is behavior-identical and clearer.
                    continue
                if not token.isdigit():
                    token = tokenizer.stemming(token)
                normalize_tokens.append(token.encode('utf-8'))
        # One CSV row per document containing all normalized tokens.
        csv_io.write_csv('output/en_tokens/' + doc, [normalize_tokens])
        del doc_obj
        doc_id += 1