    for term in terms:
        if term not in term_idf:
            term_idf[term] = []
        term_idf[term].append(term)
    return term_idf

if __name__ == '__main__':
    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
    else:
        input_dir = 'output/en_tf/'

    config = json_io.read_json('config.json')[u'database']
    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(input_dir)
    mydb = DataDB( config[u'dbtype'], config[u'host'], config[u'dbname'], \
            config[u'username'], config[u'password'], config[u'encoding'], "")

    try:
        term_id = mydb.select('SELECT id FROM terms order by id desc limit 1;')[0][0]
    except IndexError, e:
        term_id = 1

    # Get idf
    term_doc_list = idf(input_dir)
    term_hash = {}
    doc_number = len(document_list)
    for term, doc in term_doc_list.iteritems():
        term = term.replace("'", "")
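# Hedged sketch (not from the original source): given the term -> documents
# mapping returned by idf() above and the total document count doc_number,
# the classic inverse document frequency weight can be computed as log(N/df).
# The helper name compute_idf_weights is hypothetical.
import math

def compute_idf_weights(term_doc_list, doc_number):
    weights = {}
    for term, docs in term_doc_list.items():
        df = len(set(docs))  # number of distinct documents containing the term
        weights[term] = math.log(float(doc_number) / df) if df else 0.0
    return weights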
    return zh_en_group, en, zh

def lan_output(folder, file_name, content):
    with open(folder + file_name, 'w') as f:
        f.write(content.encode('utf-8'))

if __name__ == '__main__':
    #result = split_zh_en(sys.argv[1].decode('utf-8'))
    if len(sys.argv) >= 4:
        doc_input = sys.argv[1]
        en_output = sys.argv[2]
        zh_output = sys.argv[3]
    else:
        doc_input = 'output/processed_data/'
        en_output = 'output/en_doc/'
        zh_output = 'output/zh_doc/'

    document_list = get_docs_list(doc_input)
    # doc_id must be initialized outside the loop, otherwise it resets to 1
    # for every document and the increment below has no effect.
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        for line in doc_obj.get_lines():
            result, en, zh = split_zh_en(line.decode('utf-8'))
            lan_output(en_output, doc, en)
            lan_output(zh_output, doc, zh)
        del doc_obj
        doc_id += 1
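# Hedged sketch (assumption, not the project's actual split_zh_en): a common
# way to separate Chinese from English text is to test each character against
# the CJK Unified Ideographs range and collect the two scripts separately.
def split_zh_en_sketch(text):
    en_chars, zh_chars = [], []
    for ch in text:
        if u'\u4e00' <= ch <= u'\u9fff':   # CJK Unified Ideographs
            zh_chars.append(ch)
        else:
            en_chars.append(ch)
    return u''.join(en_chars), u''.join(zh_chars)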
import sys
import os

from my_class.Document import Document
from my_class.Tokenizer import Tokenizer
from doc_preprocessing import get_docs_list
from modules import json_io
from modules import csv_io

if __name__ == '__main__':
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/en_doc/'

    document_list = get_docs_list(doc_input)
    tokenizer = Tokenizer()
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        # tokenize
        normalize_tokens = []
        for line in doc_obj.get_lines():
            tokens = tokenizer.to_tokens(line.decode('utf-8'))
            for token in tokens:
                if tokenizer.is_stop_word(token):
                    token = ""
                elif token.isdigit():
                    normalize_tokens.append(token.encode('utf-8'))
                else:
                    token = tokenizer.stemming(token)
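# Hedged sketch (assumption): the project's Tokenizer class is not shown, but
# the loop above suggests it provides stop-word filtering and stemming. The
# same normalization can be illustrated with NLTK (requires running
# nltk.download('stopwords') once beforehand).
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def normalize(tokens):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    normalized = []
    for token in tokens:
        if token.lower() in stop_words:
            continue                      # drop stop words
        elif token.isdigit():
            normalized.append(token)      # keep digits unchanged
        else:
            normalized.append(stemmer.stem(token))
    return normalized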
import sys

from my_class.DataDB import DataDB
from my_class.Document import Document
from modules import json_io
from doc_preprocessing import get_docs_list

if __name__ == '__main__':
    if len(sys.argv) >= 2:
        data_dir = sys.argv[1]
    else:
        data_dir = 'output/processed_data/'

    # Database settings always come from config.json; only the data directory
    # is taken from the command line.
    config = json_io.read_json('config.json')[u'database']
    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(data_dir)
    mydb = DataDB( config[u'dbtype'], config[u'host'], config[u'dbname'], \
            config[u'username'], config[u'password'], config[u'encoding'], "")

    table_name = "documents"
    key_list = ['doc_id', 'content']
    for doc in document_list:
        doc_obj = Document(doc_hash[doc], doc, data_dir)
        # Replace single quotes so the content can be embedded in the SQL string.
        content = doc_obj.read().replace("'", '"')
        data_list = [str(doc_hash[doc]), content]
        mydb.insert_data(table_name, key_list, data_list)
        del doc_obj
    mydb.close()
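# Hedged note (assumption): DataDB.insert_data presumably builds the INSERT
# statement by string concatenation, which is why single quotes are replaced
# above. With a DB-API driver such as psycopg2, the same insert could use
# placeholders so the document content needs no manual escaping. The `conn`
# object and helper name below are hypothetical.
def insert_document(conn, doc_id, content):
    with conn.cursor() as cur:
        cur.execute(
            "INSERT INTO documents (doc_id, content) VALUES (%s, %s)",
            (doc_id, content))
    conn.commit()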
    json_io.write_json(output_path + doc + '.json', term_tf)

def to_db(mydb, term_id, document_list, doc_hash, input_dir):
    for doc in document_list:
        terms_tf = json_io.read_json(input_dir + doc)
        for term, tf in terms_tf.iteritems():
            term = term.replace("'", "")
            if len(term) > 255:
                term = term[:254]
            sql = "INSERT INTO doc_lookups (doc_id,title,tf,term_id) VALUES (" \
                    + "'" + str(doc_hash[doc[:-5]]) + "','" + doc[:-5] + "','" \
                    + str(tf) + "','" + str(term_id[term]) + "');"
            mydb.exe_sql(sql)

if __name__ == '__main__':
    # doc_hash.json is a JSON file, so read it with json_io rather than csv_io.
    doc_hash = json_io.read_json('output/doc_hash.json')
    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
        output_dir = 'output/zh_tf/'
    else:
        input_dir = 'output/en_tokens/'
        output_dir = 'output/en_tf/'

    document_list = get_docs_list(input_dir)
    for doc in document_list:
        terms = csv_io.read_csv(input_dir + doc)
        tf(terms, output_dir)
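# Hedged sketch (assumption): the tf() function called above is not shown
# here; a plain term-frequency count over a token list can be built with
# collections.Counter. The helper name compute_tf is hypothetical.
from collections import Counter

def compute_tf(tokens):
    return dict(Counter(tokens))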