Example #1
import sys

from my_class.DataDB import DataDB
from modules import json_io
from doc_preprocessing import get_docs_list


def idf(input_dir):
    # The head of this function is truncated in the original example; the
    # signature and the outer loop over documents are reconstructed from the
    # call site below. Each file in input_dir is assumed to hold a
    # term -> tf JSON map (see the tf example at the end of this page).
    term_idf = {}
    for doc in get_docs_list(input_dir):
        terms = json_io.read_json(input_dir + doc).keys()
        for term in terms:
            if term not in term_idf:
                term_idf[term] = []
            # Record the document, not the term itself, so each value is the
            # list of documents a term appears in (its document frequency)
            term_idf[term].append(doc)
    return term_idf

if __name__ == '__main__':

    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
    else:
        input_dir = 'output/en_tf/'

    config = json_io.read_json('config.json')[u'database']
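    # doc_hash maps each document file name to its integer id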
    doc_hash = json_io.read_json('output/doc_hash.json')
    document_list = get_docs_list(input_dir)

    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'], config[u'encoding'], "")

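    # Resume term numbering from the largest id already stored in the terms
    # table; fall back to 1 when the table is empty.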
    try:
        term_id = mydb.select('SELECT id FROM terms ORDER BY id DESC LIMIT 1;')[0][0]
    except IndexError:
        term_id = 1

    # Get idf: map every term to the list of documents that contain it
    term_doc_list = idf(input_dir)
    term_hash = {}
    doc_number = len(document_list)
    for term, docs in term_doc_list.iteritems():
        term = term.replace("'", "")  # strip quotes so the term is safe to embed in SQL
Example #2

import sys

from my_class.Document import Document
from doc_preprocessing import get_docs_list


def split_zh_en(line):
    # The body of this function (and this example's header) is truncated on
    # the original page. This is a minimal reconstruction inferred from the
    # call sites below: it separates a mixed-language line into its English
    # and Chinese parts, grouping consecutive runs of characters by language.
    zh_en_group = []
    en, zh = u'', u''
    for char in line:
        if u'\u4e00' <= char <= u'\u9fff':  # CJK Unified Ideographs block
            zh += char
            lang = u'zh'
        else:
            en += char
            lang = u'en'
        if zh_en_group and zh_en_group[-1][0] == lang:
            zh_en_group[-1][1] += char  # extend the current same-language run
        else:
            zh_en_group.append([lang, char])  # start a new run
    return zh_en_group, en, zh


def lan_output(folder, file_name, content):
    with open(folder + file_name, 'w') as f:
        f.write(content.encode('utf-8'))

if __name__ == '__main__':
    #result = split_zh_en(sys.argv[1].decode('utf-8'))
    if len(sys.argv) >= 4:
        doc_input = sys.argv[1]
        en_output = sys.argv[2]
        zh_output = sys.argv[3]
    else:
        doc_input = 'output/processed_data/'
        en_output = 'output/en_doc/'
        zh_output = 'output/zh_doc/'


    document_list = get_docs_list(doc_input)
    doc_id = 1  # initialize once, outside the loop (it was reset on every document)
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        en_content, zh_content = u'', u''
        for line in doc_obj.get_lines():
            result, en, zh = split_zh_en(line.decode('utf-8'))
            en_content += en
            zh_content += zh
        # Write each language once per document: the original called
        # lan_output inside the line loop, and its 'w' mode truncated the
        # file each time, so only the last line survived.
        lan_output(en_output, doc, en_content)
        lan_output(zh_output, doc, zh_content)
        del doc_obj
        doc_id += 1
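
# A quick sanity check of the reconstructed split_zh_en above (the grouping
# format is an assumption, not part of the original source):
#   groups, en, zh = split_zh_en(u'hello 世界 again')
#   en     -> u'hello  again'
#   zh     -> u'世界'
#   groups -> [[u'en', u'hello '], [u'zh', u'世界'], [u'en', u' again']]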
Example #3
import sys
import os

from my_class.Document import Document
from my_class.Tokenizer import Tokenizer
from doc_preprocessing import get_docs_list
from modules import json_io
from modules import csv_io

if __name__ == '__main__':
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/en_doc/'

    document_list = get_docs_list(doc_input)
    tokenizer = Tokenizer()
    doc_id = 1
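    # For each document: tokenize line by line, drop stop words, keep digit
    # tokens verbatim, and stem everything else.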
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        # tokenize
        normalize_tokens = []
        for line in doc_obj.get_lines():
            tokens = tokenizer.to_tokens(line.decode('utf-8'))
            for token in tokens:
                if tokenizer.is_stop_word(token):
                    continue  # drop stop words instead of keeping an empty string
                elif token.isdigit():
                    normalize_tokens.append(token.encode('utf-8'))
                else:
                    token = tokenizer.stemming(token)
                    # The example is truncated here; appending the stemmed
                    # token is the natural completion of this branch.
                    normalize_tokens.append(token.encode('utf-8'))
Example #5
import sys

from my_class.DataDB import DataDB
from my_class.Document import Document
from modules import json_io
from doc_preprocessing import get_docs_list

if __name__ == '__main__':
    if len(sys.argv) >= 2:
        data_dir = sys.argv[1]
    else:
        data_dir = 'output/processed_data/'
    # Read the database settings from config.json in both cases; the original
    # also assigned sys.argv[1] to config, which would index a plain string
    # when config[u'dbtype'] etc. are looked up below.
    config = json_io.read_json('config.json')[u'database']

    doc_hash = json_io.read_json('output/doc_hash.json')

    document_list = get_docs_list(data_dir)

    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'], config[u'encoding'], "")

    table_name = "documents"
    key_list = ['doc_id', 'content']

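    # Insert each document's raw content under its hashed id; single quotes
    # are replaced with double quotes so the naive SQL string stays valid.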
    for doc in document_list:
        doc_obj = Document(doc_hash[doc], doc, data_dir)
        content = doc_obj.read().replace("'", '"')
        data_list = [str(doc_hash[doc]), content]

        mydb.insert_data(table_name, key_list, data_list)
        del doc_obj
    mydb.close()
Example #6
import sys

from modules import json_io
from modules import csv_io
from doc_preprocessing import get_docs_list


def tf(doc, terms, output_path):
    # The head of this function is truncated in the original example; only
    # its last line survives. The counting loop and the signature (the
    # original call site passed no doc argument, yet doc is used here) are
    # reconstructed, assuming csv_io.read_csv yields a flat token list.
    term_tf = {}
    for term in terms:
        term_tf[term] = term_tf.get(term, 0) + 1
    # Write one term -> frequency map per document
    json_io.write_json(output_path + doc + '.json', term_tf)


def to_db(mydb, term_id, document_list, doc_hash, input_dir):
    for doc in document_list:
        terms_tf = json_io.read_json(input_dir + doc)
        for term, tf_value in terms_tf.iteritems():  # tf_value avoids shadowing tf()
            term = term.replace("'", "")  # strip quotes that would break the SQL literal
            if len(term) > 255:
                term = term[:255]  # truncate to the column width
            # doc names end in '.json'; doc[:-5] strips the extension
            sql = "INSERT INTO doc_lookups (doc_id, title, tf, term_id) VALUES ('" \
                    + str(doc_hash[doc[:-5]]) + "', '" + doc[:-5] + "', '" \
                    + str(tf_value) + "', '" + str(term_id[term]) + "');"
            mydb.exe_sql(sql)
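
# Note: building SQL by string concatenation (and stripping quotes above) is
# fragile. If the underlying driver were exposed, a parameterized query with
# placeholders would be the safer choice; DataDB.exe_sql is assumed here to
# accept only a raw SQL string.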


if __name__ == '__main__':
    # doc_hash.json is JSON, so read it with json_io (the original used
    # csv_io.read_csv on it in both branches)
    doc_hash = json_io.read_json('output/doc_hash.json')
    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
        output_dir = 'output/zh_tf/'
    else:
        input_dir = 'output/en_tokens/'
        output_dir = 'output/en_tf/'

    document_list = get_docs_list(input_dir)
    for doc in document_list:
        terms = csv_io.read_csv(input_dir + doc)
        tf(doc, terms, output_dir)