term_idf[term] = [] term_idf[term].append(term) return term_idf if __name__=='__main__': if len(sys.argv) > 1: input_dir = sys.argv[1] else: input_dir = 'output/en_tf/' config = json_io.read_json('config.json')[u'database'] doc_hash = json_io.read_json('output/doc_hash.json') document_list = get_docs_list(input_dir) mydb = DataDB( config[u'dbtype'], config[u'host'], config[u'dbname'], \ config[u'username'], config[u'password'], config[u'encoding'], "") try: term_id = mydb.select('SELECT id FROM terms order by id desc limit 1;')[0][0] except IndexError, e: term_id = 1 #Get idf term_doc_list = idf(input_dir) term_hash= {} doc_number = len(document_list) for term, doc in term_doc_list.iteritems(): term = term.replace("'", "") if len(term) > 255: term = term[:254]
from my_class.Document import Document
from modules import json_io
from doc_preprocessing import get_docs_list


def main(argv):
    """Insert the documents found in a data directory into the database.

    argv: command-line argument list. ``argv[1]``, when present, is the
        directory to read documents from; otherwise
        'output/processed_data/' is used.

    Database connection settings are read from the 'database' section of
    config.json. The value stored in the ``doc_id`` column for each
    document is looked up in output/doc_hash.json.
    """
    if len(argv) >= 2:
        data_dir = argv[1]
    else:
        data_dir = 'output/processed_data/'

    # The settings must always come from config.json. The previous code
    # also assigned the command-line path to `config`, which is a plain
    # string and cannot be indexed with the u'dbtype'/u'host'/... keys
    # used below.
    config = json_io.read_json('config.json')[u'database']
    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(data_dir)

    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'],
                  config[u'encoding'], "")
    table_name = "documents"
    key_list = ['doc_id', 'content']

    # Close the connection even when reading or inserting a document fails.
    try:
        for doc in document_list:
            doc_obj = Document(doc_hash[doc], doc, data_dir)
            # Single quotes are swapped for double quotes before insertion.
            # NOTE(review): presumably insert_data builds the SQL text by
            # string concatenation; if it supports bound parameters, use
            # those instead of rewriting the content -- TODO confirm.
            content = doc_obj.read().replace("'", '"')
            data_list = [str(doc_hash[doc]), content]
            mydb.insert_data(table_name, key_list, data_list)
    finally:
        mydb.close()


if __name__ == '__main__':
    main(sys.argv)
from modules import json_io
from doc_preprocessing import get_docs_list


def main(argv):
    """Insert the documents found in a data directory into the database.

    argv: command-line argument list. ``argv[1]``, when present, is the
        directory to read documents from; otherwise
        'output/processed_data/' is used.

    Database connection settings are read from the 'database' section of
    config.json. The value stored in the ``doc_id`` column for each
    document is looked up in output/doc_hash.json.
    """
    if len(argv) >= 2:
        data_dir = argv[1]
    else:
        data_dir = 'output/processed_data/'

    # The settings must always come from config.json. The previous code
    # also assigned the command-line path to `config`, which is a plain
    # string and cannot be indexed with the u'dbtype'/u'host'/... keys
    # used below.
    config = json_io.read_json('config.json')[u'database']
    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(data_dir)

    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'],
                  config[u'encoding'], "")
    table_name = "documents"
    key_list = ['doc_id', 'content']

    # Close the connection even when reading or inserting a document fails.
    try:
        for doc in document_list:
            doc_obj = Document(doc_hash[doc], doc, data_dir)
            # Single quotes are swapped for double quotes before insertion.
            # NOTE(review): presumably insert_data builds the SQL text by
            # string concatenation; if it supports bound parameters, use
            # those instead of rewriting the content -- TODO confirm.
            content = doc_obj.read().replace("'", '"')
            data_list = [str(doc_hash[doc]), content]
            mydb.insert_data(table_name, key_list, data_list)
    finally:
        mydb.close()


if __name__ == '__main__':
    main(sys.argv)
term_idf[term].append(term) return term_idf if __name__ == '__main__': if len(sys.argv) > 1: input_dir = sys.argv[1] else: input_dir = 'output/en_tf/' config = json_io.read_json('config.json')[u'database'] doc_hash = json_io.read_json('output/doc_hash.json') document_list = get_docs_list(input_dir) mydb = DataDB( config[u'dbtype'], config[u'host'], config[u'dbname'], \ config[u'username'], config[u'password'], config[u'encoding'], "") try: term_id = mydb.select( 'SELECT id FROM terms order by id desc limit 1;')[0][0] except IndexError, e: term_id = 1 #Get idf term_doc_list = idf(input_dir) term_hash = {} doc_number = len(document_list) for term, doc in term_doc_list.iteritems(): term = term.replace("'", "") if len(term) > 255: term = term[:254]