Example #1
def _get_window_size_analysis(window_size):
    try:
        document = __document  # module-level global, set up outside this function (see the sketch after this example)
        log.info("Checking window_size = %s" % window_size)
        # Check whether a result for this document/window size already exists
        if InformationValueResult.query.find({
                "window_size": window_size,
                "document_id": document._id
        }).count() > 0:
            log.warning('Result already found in database')
            return

        iv_words = __information_value_calculator.information_value(
            window_size)
        try:
            InformationValueResult(window_size=window_size,
                                   document_id=document._id,
                                   iv_words=iv_words)
            log.info("Storing results for document %s, window_size %s" %
                     (document.name, window_size))
            odm_session.flush()
        except DuplicateKeyError:
            log.warning('Result already found')
        return (window_size,
                WindowAnalysis(window_size,
                               iv_words,
                               number_of_words=__number_of_words))
    except WindowSizeTooLarge:
        return None
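Example #1 reads __document, __information_value_calculator and __number_of_words as module-level globals instead of taking them as parameters, which is the usual way to share per-worker state with a multiprocessing pool. A minimal sketch of how those globals might be wired up, assuming a pool-based driver; only the three global names come from the example, everything else is hypothetical:

import multiprocessing

def _init_worker(document, calculator, number_of_words):
    # Hypothetical pool initializer: publish the shared objects as module
    # globals so _get_window_size_analysis can read them in each worker.
    global __document, __information_value_calculator, __number_of_words
    __document = document
    __information_value_calculator = calculator
    __number_of_words = number_of_words

# Illustrative fan-out over candidate window sizes:
# pool = multiprocessing.Pool(initializer=_init_worker,
#                             initargs=(document, calculator, number_of_words))
# results = pool.map(_get_window_size_analysis, window_sizes)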
Example #2
def load_moby_dick_analysis():
    tokens = get_moby_dick_tokens()
    text = gutenberg.raw('melville-moby_dick.txt')
    try:
        moby_dick_doc = Document(
            url='gutenberg',
            name='moby dick',
            text=text,
            month='Jan',
            year='1851'
            )
        odm_session.flush()
    except DuplicateKeyError:
        moby_dick_doc = Document.query.get(name='moby dick')

    for sum_threshold in sum_thresholds:
        log.info("Trying analysis for threshold = %s" % sum_threshold)
        analysis = get_optimal_window_size(
            tokens, window_sizes, 20, sum_threshold=sum_threshold)[1]
        analysis_dict = analysis.encode()
        window_size = analysis_dict['window_size']

        log.debug("Best result = %s" % window_size)
        InformationValueResult(
            window_size=window_size,
            threshold=sum_threshold,
            document=moby_dick_doc,
            iv_words=analysis_dict['top_words'],
            max_iv=analysis_dict['max_iv'],
            sum_iv=analysis_dict['sum_iv']
        )
        odm_session.flush()
Example #3
def wget(url):
    log.info('Trying to fetch url: %s' % url)
    response = requests.get(url)
    Document(
        url=url,
        name=url,
        text=response.text,
    )
    odm_session.flush()
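All of the examples on this page share the same unstated setup: an ODM session called odm_session, a mapped Document class, and pymongo's DuplicateKeyError, which is raised at flush() time when an insert hits a unique index. The session/query style matches the Ming ODM, so that is what the sketch below assumes; the field names are taken from the examples, while the connection string and the choice of unique index are placeholders:

from pymongo.errors import DuplicateKeyError

from ming import create_datastore, schema
from ming.odm import ThreadLocalODMSession, MappedClass, FieldProperty

# Placeholder connection string.
odm_session = ThreadLocalODMSession(
    bind=create_datastore('mongodb://localhost:27017/documents_db'))

class Document(MappedClass):
    class __mongometa__:
        session = odm_session
        name = 'documents'
        # The unique index is what turns a repeated insert into a
        # DuplicateKeyError at flush() time (indexes still need to be
        # ensured, e.g. via ming.odm.Mapper.ensure_all_indexes()).
        unique_indexes = [('name',)]

    _id = FieldProperty(schema.ObjectId)
    url = FieldProperty(str)
    name = FieldProperty(str)
    text = FieldProperty(str)
    month = FieldProperty(str)
    # The examples pass year as both str and int; int is assumed here.
    year = FieldProperty(int)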
Example #4
def _delete_non_best_analysis(document, window_sizes):
    # window_sizes is accepted but unused in this version.
    best_res = document.get_information_value_result()
    log.info("Best window size for %s: %s" %
             (document.name, best_res.window_size))
    log.info("Removing other results from mongo")
    for one_res in document.results:
        if one_res.window_size != best_res.window_size:
            log.info("Removing window size %s" % one_res.window_size)
            one_res.delete()
    log.info("Flushing")
    odm_session.flush()
Example #5
def process_item(self, item, spider):
    Document(
        name=item['name'],
        url=item['url'],
        text=item['text'],
        month=item['month'],
        year=int(item['year']),
    )
    try:
        odm_session.flush()
    except DuplicateKeyError:
        log.msg('Duplicate found', level=log.WARNING)
        return
    log.msg('Document saved', level=log.INFO)
    return item
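Example #5 is a Scrapy item pipeline method (the scrapy.log API it uses predates Scrapy 1.0). Wiring it into a project would look roughly like the following; the module and class names are hypothetical:

# pipelines.py -- hypothetical module layout
class MongoDocumentPipeline(object):
    def process_item(self, item, spider):
        # ... body as shown in Example #5 ...
        pass

# settings.py -- pre-1.0 Scrapy takes a list of pipeline paths
# (newer releases use a dict mapping each path to an order value).
ITEM_PIPELINES = ['myproject.pipelines.MongoDocumentPipeline']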
Example #6
def populate_database():
    log.info("Populating database...")
    with open('lenin_work.json') as f:
        raw_works = json.load(f)
        log.info("Inserting %s works to database..." % (len(raw_works)))
        for raw_work in raw_works:
            try:
                Document(url=raw_work['url'],
                         text=raw_work['text'],
                         name=raw_work['name'],
                         month=raw_work['month'],
                         year=raw_work['year'])
                odm_session.flush()
            except DuplicateKeyError:
                log.info("Duplicate found skipping...")
        log.info("Done")
Example #7
def get_analysis_of_the_mind_document():
    with file("tests/analysis_of_the_mind.txt") as f:
        document = Document(url='tests/analysis_of_the_mind.txt',
                            name='Analysis of the mind',
                            text=f.read(),
                            month='Dec',
                            year='1921')
        raw_text = f.read()

        def tokenizer_wrapper(raw_text):
            return map(
                str.lower,
                tokenize(raw_text, only_alphanum=True, clean_punctuation=True))

        document.tokenizer = tokenizer_wrapper
        odm_session.flush()
        return document
Example #8
def get_origin_of_species_document():
    with file("tests/origin.txt") as f:
        document = Document(url='tests/origin.txt',
                            name='Origin of species',
                            text=f.read(),
                            month='Nov',
                            year='1859')
        raw_text = f.read()

        def tokenizer_wrapper(raw_text):
            return map(
                str.lower,
                tokenize(raw_text, only_alphanum=True, clean_punctuation=True))

        document.tokenizer = tokenizer_wrapper
        odm_session.flush()
        return document
Example #9
def get_moby_dick_document():
    moby_dick = gutenberg.raw('melville-moby_dick.txt')
    document = Document(url='melville-moby_dick.txt',
                        name='Moby dick',
                        text=moby_dick,
                        month='Oct',
                        year=1851)

    # The document builds its tokens via a tokenizer function; to enforce
    # only_alphanum and clean_punctuation we wrap the project's tokenize().
    # The inner map(str, ...) coerces tokens to plain str before lowercasing.
    def tokenizer_wrapper(raw_text):
        return map(
            str.lower,
            map(str,
                tokenize(raw_text, only_alphanum=True,
                         clean_punctuation=True)))

    document.tokenizer = tokenizer_wrapper

    odm_session.flush()

    return document
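Read together with Example #2, the intended flow appears to be: build the Document, tokenize its text, then scan candidate window sizes for the best information-value result. A hypothetical driver along those lines; the window_sizes list and the 0.05 threshold are illustrative values, while get_optimal_window_size and its positional 20 are taken from Example #2:

document = get_moby_dick_document()
tokens = document.tokenizer(document.text)

window_sizes = range(100, 5001, 100)  # illustrative candidates
result = get_optimal_window_size(tokens, window_sizes, 20,
                                 sum_threshold=0.05)
analysis = result[1]  # Example #2 picks the analysis out of index 1
print analysis.encode()['window_size']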
Example #10
def reset_senses():
    log.info("Resetting all senses")
    # Every Document touched here is marked dirty in the ODM session;
    # the single flush below writes them all back to MongoDB.
    for doc in Document.query.find():
        doc.related_sense = {}
    odm_session.flush()