Code example #1
File: doc_to_db.py Project: wsxbjx/search-engine
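This script reads every preprocessed document, looks up its ID in doc_hash.json, and inserts the (doc_id, content) pair into a documents table through the project's DataDB wrapper.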
import sys

from my_class.Document import Document
from my_class.DataDB import DataDB  # assumed path; the DataDB import is missing from the original listing
from modules import json_io
from doc_preprocessing import get_docs_list


if __name__ == '__main__':
    if len(sys.argv) >= 2:
        data_dir = sys.argv[1]
    else:
        data_dir = 'output/processed_data/'
    # The original also assigned sys.argv[1] to config, leaving it a bare
    # string; the key lookups below need the parsed JSON section instead.
    config = json_io.read_json('config.json')[u'database']

    doc_hash = json_io.read_json('output/doc_hash.json')

    document_list = get_docs_list(data_dir)
    
    mydb = DataDB(config[u'dbtype'], config[u'host'], config[u'dbname'],
                  config[u'username'], config[u'password'], config[u'encoding'], "")
  
    table_name = "documents"
    key_list = ['doc_id', 'content']

    for doc in document_list:
        doc_obj = Document(doc_hash[doc], doc, data_dir)
        # Replace single quotes in the content, presumably so the text does
        # not break the SQL string quoting inside insert_data.
        content = doc_obj.read().replace("'", '"')
        data_list = [str(doc_hash[doc]), content]

        mydb.insert_data(table_name, key_list, data_list)
        del doc_obj
    mydb.close()
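For context, the script expects config.json to contain a database section with the six keys consumed by the DataDB constructor above. A minimal sketch with placeholder values (only the key names come from the code; every value is made up):

import json

# Hypothetical config.json contents: the key names match the DataDB(...)
# call above; the values here are placeholders, not the project's defaults.
example_config = {
    'database': {
        'dbtype': 'postgresql',
        'host': 'localhost',
        'dbname': 'search_engine',
        'username': 'user',
        'password': 'secret',
        'encoding': 'utf8',
    }
}

print(json.dumps(example_config, indent=4))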
Code example #2
File: zh_n_gram.py Project: wsxbjx/search-engine
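This script splits each Chinese document in the input directory into overlapping character bigrams and writes one token CSV per document.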
# -*- coding: utf-8 -*-

import sys
from my_class.Document import Document
from doc_preprocessing import get_docs_list
from modules import csv_io


def n_gram(content, n):
    # Slide an n-character window over the text and collect each window
    # as a UTF-8 byte string (the script runs under Python 2).
    tokens = []
    for i in range(len(content) - n + 1):
        tokens.append(content[i:i + n].encode('utf-8'))
    return tokens


if __name__ == '__main__':
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/zh_doc/'

    document_list = get_docs_list(doc_input)
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        content = doc_obj.read().decode('utf-8')
        tokens = n_gram(content, 2)
        csv_io.write_csv('output/zh_tokens/' + doc, [tokens])
        del doc_obj
        doc_id += 1
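A quick sketch of what n_gram returns for a short string (Python 2, matching the encode/decode calls above; the sample text is made up):

text = u'搜索引擎'
bigrams = n_gram(text, 2)  # three tokens: 搜索, 索引, 引擎, as UTF-8 byte strings
print(bigrams)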