return zh_en_group, en, zh def lan_output(foldr, file_name, content): with open(foldr+file_name, 'w') as f: f.write(content.encode('utf-8')) if __name__=='__main__': #result = split_zh_en(sys.argv[1].decode('utf-8')) if len(sys.argv) >= 4: doc_input = sys.argv[1] en_output = sys.argv[2] zh_output = sys.argv[3] else: doc_input = 'output/processed_data/' en_output = 'output/en_doc/' zh_output = 'output/zh_doc/' document_list = get_docs_list(doc_input) for doc in document_list: doc_id = 1 doc_obj = Document(doc_id, doc, doc_input) for line in doc_obj.get_lines(): result, en, zh = split_zh_en(line.decode('utf-8')) lan_output(en_output, doc, en) lan_output(zh_output, doc, zh) del doc_obj doc_id += 1
# BUG FIX: `sys` is used below but was not imported in this chunk; importing
# it here is harmless if it is also imported elsewhere in the file.
import sys

from doc_preprocessing import get_docs_list
from modules import json_io
from modules import csv_io

# NOTE(review): Tokenizer and Document are referenced below but not imported
# in this chunk — assumed to be in scope from earlier in the file; confirm.

if __name__ == '__main__':
    # Input directory of English documents: argv override or project default.
    if len(sys.argv) > 1:
        doc_input = sys.argv[1]
    else:
        doc_input = 'output/en_doc/'

    document_list = get_docs_list(doc_input)
    tokenizer = Tokenizer()
    doc_id = 1
    for doc in document_list:
        doc_obj = Document(doc_id, doc, doc_input)
        # Tokenize every line: drop stop words, keep pure-digit tokens
        # verbatim, stem everything else; store tokens UTF-8 encoded.
        normalize_tokens = []
        for line in doc_obj.get_lines():
            for token in tokenizer.to_tokens(line.decode('utf-8')):
                if tokenizer.is_stop_word(token):
                    # BUG FIX (minor): original assigned token = "" here and
                    # fell through without appending — a dead assignment.
                    # Skipping directly is behavior-identical and clearer.
                    continue
                if not token.isdigit():
                    token = tokenizer.stemming(token)
                normalize_tokens.append(token.encode('utf-8'))
        # One CSV row per document containing all normalized tokens.
        csv_io.write_csv('output/en_tokens/' + doc, [normalize_tokens])
        del doc_obj
        doc_id += 1