Example #1
0
def load_moby_dick_analysis():
    """For each configured sum threshold, find the optimal window size over
    the Moby Dick token stream and persist one InformationValueResult.

    Relies on module-level ``sum_thresholds`` and ``window_sizes``; creates
    the source Document on first run and reuses the stored copy afterwards.
    """
    tokens = get_moby_dick_tokens()
    raw = gutenberg.raw('melville-moby_dick.txt')

    # EAFP: try the insert first; fall back to the already-stored document
    # when the unique name raises a duplicate-key error.
    try:
        moby_dick_doc = Document(url='gutenberg',
                                 name='moby dick',
                                 text=raw,
                                 month='Jan',
                                 year='1851')
        odm_session.flush()
    except DuplicateKeyError:
        moby_dick_doc = Document.query.get(name='moby dick')

    for sum_threshold in sum_thresholds:
        log.info("Trying analysis for threshold = %s" % sum_threshold)
        best = get_optimal_window_size(tokens, window_sizes, 20,
                                       sum_threshold=sum_threshold)[1]
        best_dict = best.encode()
        best_window = best_dict['window_size']
        log.debug("Best result = %s" % best_window)

        # One result row per threshold; flushed immediately so partial
        # progress survives a failure on a later threshold.
        InformationValueResult(window_size=best_window,
                               threshold=sum_threshold,
                               document=moby_dick_doc,
                               iv_words=best_dict['top_words'],
                               max_iv=best_dict['max_iv'],
                               sum_iv=best_dict['sum_iv'])
        odm_session.flush()
Example #2
0
def get_analysis_of_the_mind_document():
    """Build and flush the Document for tests/analysis_of_the_mind.txt.

    Returns:
        The flushed Document, with a tokenizer hook that lower-cases the
        alphanumeric, punctuation-cleaned tokens produced by tokenize().
    """
    # Read the file exactly once. The original called f.read() a second
    # time after the Document was built; at EOF that returned "" into a
    # dead local. open() replaces the Python-2-only file() builtin.
    with open("tests/analysis_of_the_mind.txt") as f:
        raw_text = f.read()

    document = Document(url='tests/analysis_of_the_mind.txt',
                        name='Analysis of the mind',
                        text=raw_text,
                        month='Dec',
                        year='1921')

    # Document builds its tokens through this hook; the wrapper enforces
    # only_alphanum/clean_punctuation and lower-cases every token.
    def tokenizer_wrapper(raw_text):
        return map(
            str.lower,
            tokenize(raw_text, only_alphanum=True, clean_punctuation=True))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
Example #3
0
def get_origin_of_species_document():
    """Build and flush the Document for tests/origin.txt.

    Returns:
        The flushed Document, with a tokenizer hook that lower-cases the
        alphanumeric, punctuation-cleaned tokens produced by tokenize().
    """
    # Read the file exactly once. The original called f.read() a second
    # time after the Document was built; at EOF that returned "" into a
    # dead local. open() replaces the Python-2-only file() builtin.
    with open("tests/origin.txt") as f:
        raw_text = f.read()

    document = Document(url='tests/origin.txt',
                        name='Origin of species',
                        text=raw_text,
                        month='Nov',
                        year='1859')

    # Document builds its tokens through this hook; the wrapper enforces
    # only_alphanum/clean_punctuation and lower-cases every token.
    def tokenizer_wrapper(raw_text):
        return map(
            str.lower,
            tokenize(raw_text, only_alphanum=True, clean_punctuation=True))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
Example #4
0
def wget(url):
    """Fetch *url* over HTTP and persist its body as a Document.

    The URL doubles as both the document's url and its name; the new
    Document is flushed to the session before returning.
    """
    log.info('Trying to fetch url: %s' % url)
    page = requests.get(url)
    Document(url=url,
             name=url,
             text=page.text)
    odm_session.flush()
def get_moby_dick_document():
    """Create and flush the Moby Dick Document from the Gutenberg corpus.

    Returns:
        The flushed Document, with a tokenizer hook attached that
        lower-cases the stringified, alphanumeric, punctuation-cleaned
        tokens produced by tokenize().
    """
    moby_dick = gutenberg.raw('melville-moby_dick.txt')
    # PEP 8: no spaces around '=' in keyword arguments (the original had
    # them); formatting now matches the other Document constructions.
    document = Document(
        url='melville-moby_dick.txt',
        name='Moby dick',
        text=moby_dick,
        month='Oct',
        year=1851
    )

    # document uses tokenizer func for create tokens, since we need to enforce
    # only_alphanum and clean_punct we need a wrapper
    def tokenizer_wrapper(raw_text):
        return map(str.lower,
                   map(str, tokenize(raw_text, only_alphanum=True,
                                     clean_punctuation=True)))

    document.tokenizer = tokenizer_wrapper

    odm_session.flush()

    return document
Example #6
0
def get_moby_dick_document():
    """Create and flush the Moby Dick Document from the Gutenberg corpus.

    Returns:
        The flushed Document, with a tokenizer hook attached that
        lower-cases the stringified, alphanumeric, punctuation-cleaned
        tokens produced by tokenize().
    """
    raw = gutenberg.raw('melville-moby_dick.txt')
    document = Document(
        url='melville-moby_dick.txt',
        name='Moby dick',
        text=raw,
        month='Oct',
        year=1851,
    )

    # Document builds its tokens through this hook; the wrapper is needed
    # to enforce only_alphanum/clean_punctuation and lower-case each token.
    def tokenizer_wrapper(raw_text):
        tokens = tokenize(raw_text, only_alphanum=True,
                          clean_punctuation=True)
        return map(str.lower, map(str, tokens))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
Example #7
0
 def process_item(self, item, spider):
     """Scrapy pipeline hook: persist the scraped *item* as a Document.

     Returns the item on success so downstream pipeline stages run;
     returns None (dropping the item) when a duplicate already exists.
     """
     # item fields arrive as strings from the spider; year is coerced to int.
     Document(
             name=item['name'],
             url=item['url'],
             text=item['text'],
             month=item['month'],
             year=int(item['year']),
             )
     try:
         # Flush raises DuplicateKeyError when the document's unique key
         # (presumably its name/url — confirm against the schema) exists.
         odm_session.flush()
     except DuplicateKeyError:
         log.msg('Duplicate found', level=log.WARNING)
         return
     log.msg('Document saved', level=log.INFO)
     return item
Example #8
0
def populate_database():
    """Load lenin_work.json and insert every work as a Document.

    Works whose unique key already exists raise DuplicateKeyError on
    flush and are skipped with a log message.
    """
    log.info("Populating database...")
    # open() replaces the Python-2-only file() builtin (removed in Py3);
    # the file is fully consumed by json.load, so the loop can run after
    # the with-block closes it.
    with open('lenin_work.json') as f:
        raw_works = json.load(f)
    log.info("Inserting %s works to database..." % (len(raw_works)))
    for raw_work in raw_works:
        try:
            Document(url=raw_work['url'],
                     text=raw_work['text'],
                     name=raw_work['name'],
                     month=raw_work['month'],
                     year=raw_work['year'])
            odm_session.flush()
        except DuplicateKeyError:
            log.info("Duplicate found skipping...")
    log.info("Done")