Example #1
0
def load_moby_dick_analysis():
    """For each configured sum threshold, find the optimal window size over
    the Moby Dick token stream and persist one InformationValueResult.

    Relies on module-level ``sum_thresholds`` and ``window_sizes``; creates
    the source Document on first run and reuses the stored copy afterwards.
    """
    tokens = get_moby_dick_tokens()
    raw = gutenberg.raw('melville-moby_dick.txt')

    # EAFP: try the insert first; fall back to the already-stored document
    # when the unique name raises a duplicate-key error.
    try:
        moby_dick_doc = Document(url='gutenberg',
                                 name='moby dick',
                                 text=raw,
                                 month='Jan',
                                 year='1851')
        odm_session.flush()
    except DuplicateKeyError:
        moby_dick_doc = Document.query.get(name='moby dick')

    for sum_threshold in sum_thresholds:
        log.info("Trying analysis for threshold = %s" % sum_threshold)
        best = get_optimal_window_size(tokens, window_sizes, 20,
                                       sum_threshold=sum_threshold)[1]
        best_dict = best.encode()
        best_window = best_dict['window_size']
        log.debug("Best result = %s" % best_window)

        # One result row per threshold; flushed immediately so partial
        # progress survives a failure on a later threshold.
        InformationValueResult(window_size=best_window,
                               threshold=sum_threshold,
                               document=moby_dick_doc,
                               iv_words=best_dict['top_words'],
                               max_iv=best_dict['max_iv'],
                               sum_iv=best_dict['sum_iv'])
        odm_session.flush()
Example #2
0
def get_analysis_of_the_mind_document():
    """Build and flush the Document for tests/analysis_of_the_mind.txt.

    Returns:
        The flushed Document, with a tokenizer hook that lower-cases the
        alphanumeric, punctuation-cleaned tokens produced by tokenize().
    """
    # Read the file exactly once. The original called f.read() a second
    # time after the Document was built; at EOF that returned "" into a
    # dead local. open() replaces the Python-2-only file() builtin.
    with open("tests/analysis_of_the_mind.txt") as f:
        raw_text = f.read()

    document = Document(url='tests/analysis_of_the_mind.txt',
                        name='Analysis of the mind',
                        text=raw_text,
                        month='Dec',
                        year='1921')

    # Document builds its tokens through this hook; the wrapper enforces
    # only_alphanum/clean_punctuation and lower-cases every token.
    def tokenizer_wrapper(raw_text):
        return map(
            str.lower,
            tokenize(raw_text, only_alphanum=True, clean_punctuation=True))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
Example #3
0
def get_origin_of_species_document():
    """Build and flush the Document for tests/origin.txt.

    Returns:
        The flushed Document, with a tokenizer hook that lower-cases the
        alphanumeric, punctuation-cleaned tokens produced by tokenize().
    """
    # Read the file exactly once. The original called f.read() a second
    # time after the Document was built; at EOF that returned "" into a
    # dead local. open() replaces the Python-2-only file() builtin.
    with open("tests/origin.txt") as f:
        raw_text = f.read()

    document = Document(url='tests/origin.txt',
                        name='Origin of species',
                        text=raw_text,
                        month='Nov',
                        year='1859')

    # Document builds its tokens through this hook; the wrapper enforces
    # only_alphanum/clean_punctuation and lower-cases every token.
    def tokenizer_wrapper(raw_text):
        return map(
            str.lower,
            tokenize(raw_text, only_alphanum=True, clean_punctuation=True))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
Example #4
0
def wget(url):
    """Fetch *url* over HTTP and persist its body as a Document.

    The URL doubles as both the document's url and its name; the new
    Document is flushed to the session before returning.
    """
    log.info('Trying to fetch url: %s' % url)
    page = requests.get(url)
    Document(url=url,
             name=url,
             text=page.text)
    odm_session.flush()
def get_moby_dick_document():
    """Create and flush the Moby Dick Document from the Gutenberg corpus.

    Returns:
        The flushed Document, with a tokenizer hook attached that
        lower-cases the stringified, alphanumeric, punctuation-cleaned
        tokens produced by tokenize().
    """
    moby_dick = gutenberg.raw('melville-moby_dick.txt')
    # PEP 8: no spaces around '=' in keyword arguments (the original had
    # them); formatting now matches the other Document constructions.
    document = Document(
        url='melville-moby_dick.txt',
        name='Moby dick',
        text=moby_dick,
        month='Oct',
        year=1851
    )

    # document uses tokenizer func for create tokens, since we need to enforce
    # only_alphanum and clean_punct we need a wrapper
    def tokenizer_wrapper(raw_text):
        return map(str.lower,
                   map(str, tokenize(raw_text, only_alphanum=True,
                                     clean_punctuation=True)))

    document.tokenizer = tokenizer_wrapper

    odm_session.flush()

    return document
Example #6
0
def get_moby_dick_document():
    """Create and flush the Moby Dick Document from the Gutenberg corpus.

    Returns:
        The flushed Document, with a tokenizer hook attached that
        lower-cases the stringified, alphanumeric, punctuation-cleaned
        tokens produced by tokenize().
    """
    raw = gutenberg.raw('melville-moby_dick.txt')
    document = Document(
        url='melville-moby_dick.txt',
        name='Moby dick',
        text=raw,
        month='Oct',
        year=1851,
    )

    # Document builds its tokens through this hook; the wrapper is needed
    # to enforce only_alphanum/clean_punctuation and lower-case each token.
    def tokenizer_wrapper(raw_text):
        tokens = tokenize(raw_text, only_alphanum=True,
                          clean_punctuation=True)
        return map(str.lower, map(str, tokens))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
Example #7
0
 def process_item(self, item, spider):
     """Scrapy pipeline hook: persist the scraped *item* as a Document.

     Returns the item on success so downstream pipeline stages run;
     returns None (dropping the item) when a duplicate already exists.
     """
     # item fields arrive as strings from the spider; year is coerced to int.
     Document(
             name=item['name'],
             url=item['url'],
             text=item['text'],
             month=item['month'],
             year=int(item['year']),
             )
     try:
         # Flush raises DuplicateKeyError when the document's unique key
         # (presumably its name/url — confirm against the schema) exists.
         odm_session.flush()
     except DuplicateKeyError:
         log.msg('Duplicate found', level=log.WARNING)
         return
     log.msg('Document saved', level=log.INFO)
     return item
Example #8
0
def populate_database():
    """Load lenin_work.json and insert every work as a Document.

    Works whose unique key already exists raise DuplicateKeyError on
    flush and are skipped with a log message.
    """
    log.info("Populating database...")
    # open() replaces the Python-2-only file() builtin (removed in Py3);
    # the file is fully consumed by json.load, so the loop can run after
    # the with-block closes it.
    with open('lenin_work.json') as f:
        raw_works = json.load(f)
    log.info("Inserting %s works to database..." % (len(raw_works)))
    for raw_work in raw_works:
        try:
            Document(url=raw_work['url'],
                     text=raw_work['text'],
                     name=raw_work['name'],
                     month=raw_work['month'],
                     year=raw_work['year'])
            odm_session.flush()
        except DuplicateKeyError:
            log.info("Duplicate found skipping...")
    log.info("Done")