term_idf[term] = []
            term_idf[term].append(term)
    return term_idf

# Script entry point: connects to the database described in config.json and
# walks the terms returned by idf() for the chosen input directory.
if __name__=='__main__':

    # The first command-line argument, when present, overrides the default
    # input directory.
    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
    else:
        input_dir = 'output/en_tf/'

    # Database settings are read from the 'database' section of config.json.
    config = json_io.read_json('config.json')[u'database']
    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(input_dir)

    # DataDB is defined elsewhere; the trailing "" is passed through as its
    # last positional argument (meaning not visible here -- TODO confirm).
    mydb = DataDB( config[u'dbtype'], config[u'host'], config[u'dbname'], \
            config[u'username'], config[u'password'], config[u'encoding'], "")

    # Look up the highest id currently stored in the terms table. An
    # IndexError while indexing the result (presumably an empty result set --
    # confirm against DataDB.select) falls back to 1.
    try:
        term_id = mydb.select('SELECT id FROM terms order by id desc limit 1;')[0][0]
    except IndexError, e:
        term_id = 1

    # Compute the per-term data with idf() and count the input documents.
    term_doc_list = idf(input_dir)
    term_hash= {}
    doc_number = len(document_list)
    for term, doc in term_doc_list.iteritems():
        # Drop single quotes from the term (presumably so it can be embedded
        # in a quoted SQL literal -- TODO confirm).
        term = term.replace("'", "")
        # NOTE(review): terms longer than 255 characters are cut to 254,
        # while a term of exactly 255 is kept as is -- confirm whether
        # [:255] was intended.
        if len(term) > 255:
            term = term[:254]
        
Example #2
0
from my_class.Document import Document
from modules import json_io
from doc_preprocessing import get_docs_list

# Script entry point: inserts the content of every processed document into
# the "documents" table, keyed by the id recorded in output/doc_hash.json.
if __name__ == '__main__':
    # The first command-line argument, when present, overrides the default
    # directory holding the processed documents.
    if len(sys.argv) >= 2:
        data_dir = sys.argv[1]
    else:
        data_dir = 'output/processed_data/'

    # The database settings always come from config.json. The command-line
    # argument is a directory path (a plain string) and cannot be indexed
    # with the settings keys used below, so it must not be used as `config`.
    config = json_io.read_json('config.json')[u'database']

    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(data_dir)

    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'],
                  config[u'encoding'], "")

    table_name = "documents"
    key_list = ['doc_id', 'content']

    # Close the connection even when reading or inserting a document fails.
    try:
        for doc in document_list:
            doc_obj = Document(doc_hash[doc], doc, data_dir)
            # NOTE(review): single quotes are swapped for double quotes
            # before the insert, which alters the stored text; presumably a
            # hand-rolled SQL escape -- prefer parameterized inserts if
            # DataDB supports them.
            content = doc_obj.read().replace("'", '"')
            data_list = [str(doc_hash[doc]), content]

            mydb.insert_data(table_name, key_list, data_list)
            # Drop the reference before the next document is loaded.
            del doc_obj
    finally:
        mydb.close()
Example #3
0
from modules import json_io
from doc_preprocessing import get_docs_list


# Script entry point: stores the content of each processed document in the
# "documents" table under the id found in output/doc_hash.json.
if __name__ == '__main__':
    # An optional first command-line argument selects the directory of
    # processed documents; otherwise the default location is used.
    if len(sys.argv) >= 2:
        data_dir = sys.argv[1]
    else:
        data_dir = 'output/processed_data/'

    # Load the database settings from config.json in every case. The
    # command-line argument is only a directory path; using it as `config`
    # would make the keyed lookups below fail with TypeError.
    config = json_io.read_json('config.json')[u'database']

    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(data_dir)

    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'],
                  config[u'encoding'], "")

    table_name = "documents"
    key_list = ['doc_id', 'content']

    # The finally clause guarantees the connection is closed on any failure.
    try:
        for doc in document_list:
            doc_obj = Document(doc_hash[doc], doc, data_dir)
            # NOTE(review): replacing ' with " changes the stored text;
            # looks like an ad-hoc SQL escape -- confirm and prefer
            # parameterized inserts if DataDB offers them.
            content = doc_obj.read().replace("'", '"')
            data_list = [str(doc_hash[doc]), content]

            mydb.insert_data(table_name, key_list, data_list)
            # Release the document object before moving to the next one.
            del doc_obj
    finally:
        mydb.close()
Example #4
0
            term_idf[term].append(term)
    return term_idf


# Script entry point: connects to the database described in config.json and
# iterates over the terms returned by idf() for the chosen input directory.
if __name__ == '__main__':

    # Use the first command-line argument as the input directory when one is
    # given; otherwise fall back to the default.
    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
    else:
        input_dir = 'output/en_tf/'

    # Database settings live under the 'database' key of config.json.
    config = json_io.read_json('config.json')[u'database']
    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(input_dir)

    # DataDB is defined elsewhere; the final "" argument's meaning is not
    # visible here -- TODO confirm.
    mydb = DataDB( config[u'dbtype'], config[u'host'], config[u'dbname'], \
            config[u'username'], config[u'password'], config[u'encoding'], "")

    # Read the largest id present in the terms table. If indexing the result
    # raises IndexError (presumably because no row came back -- confirm
    # against DataDB.select), start from 1.
    try:
        term_id = mydb.select(
            'SELECT id FROM terms order by id desc limit 1;')[0][0]
    except IndexError, e:
        term_id = 1

    # Compute the per-term data with idf() and count the input documents.
    term_doc_list = idf(input_dir)
    term_hash = {}
    doc_number = len(document_list)
    for term, doc in term_doc_list.iteritems():
        # Remove single quotes from the term (presumably to keep it safe
        # inside a quoted SQL literal -- TODO confirm).
        term = term.replace("'", "")
        # NOTE(review): a term over 255 characters is shortened to 254,
        # whereas one of exactly 255 is left alone -- confirm whether
        # [:255] was intended.
        if len(term) > 255:
            term = term[:254]