def load_moby_dick_analysis():
    tokens = get_moby_dick_tokens()
    text = gutenberg.raw('melville-moby_dick.txt')
    try:
        moby_dick_doc = Document(
            url='gutenberg',
            name='moby dick',
            text=text,
            month='Jan',
            year='1851',
        )
        odm_session.flush()
    except DuplicateKeyError:
        # The document already exists, so reuse the stored copy.
        moby_dick_doc = Document.query.get(name='moby dick')

    for sum_threshold in sum_thresholds:
        log.info("Trying analysis for threshold = %s" % sum_threshold)
        analysis = get_optimal_window_size(tokens, window_sizes, 20,
                                           sum_threshold=sum_threshold)[1]
        analysis_dict = analysis.encode()
        window_size = analysis_dict['window_size']
        log.debug("Best result = %s" % window_size)
        InformationValueResult(
            window_size=window_size,
            threshold=sum_threshold,
            document=moby_dick_doc,
            iv_words=analysis_dict['top_words'],
            max_iv=analysis_dict['max_iv'],
            sum_iv=analysis_dict['sum_iv'],
        )
        odm_session.flush()
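# Hedged note (assumption, not from the original source): load_moby_dick_analysis()
# depends on module-level names such as window_sizes, sum_thresholds,
# get_moby_dick_tokens() and get_optimal_window_size(), which live elsewhere in
# the project. The placeholder values below only illustrate the expected shapes.
window_sizes = range(10, 110, 10)   # candidate window sizes to try
sum_thresholds = [1.0, 2.0, 3.0]    # thresholds on the summed information value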
def get_analysis_of_the_mind_document():
    with open("tests/analysis_of_the_mind.txt") as f:
        document = Document(url='tests/analysis_of_the_mind.txt',
                            name='Analysis of the mind',
                            text=f.read(),
                            month='Dec',
                            year='1921')

    # Wrap the project tokenizer so tokens come back as lower-cased
    # alphanumeric strings.
    def tokenizer_wrapper(raw_text):
        return map(
            str.lower,
            tokenize(raw_text, only_alphanum=True, clean_punctuation=True))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
def get_origin_of_species_document():
    with open("tests/origin.txt") as f:
        document = Document(url='tests/origin.txt',
                            name='Origin of species',
                            text=f.read(),
                            month='Nov',
                            year='1859')

    # Wrap the project tokenizer so tokens come back as lower-cased
    # alphanumeric strings.
    def tokenizer_wrapper(raw_text):
        return map(
            str.lower,
            tokenize(raw_text, only_alphanum=True, clean_punctuation=True))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
def wget(url):
    log.info('Trying to fetch url: %s' % url)
    response = requests.get(url)
    Document(
        url=url,
        name=url,
        text=response.text,
    )
    odm_session.flush()
def get_moby_dick_document():
    moby_dick = gutenberg.raw('melville-moby_dick.txt')
    document = Document(
        url='melville-moby_dick.txt',
        name='Moby dick',
        text=moby_dick,
        month='Oct',
        year=1851,
    )

    # The document uses a tokenizer function to create its tokens; since we
    # need to enforce only_alphanum and clean_punctuation, we wrap the
    # project tokenizer here.
    def tokenizer_wrapper(raw_text):
        return map(str.lower,
                   map(str, tokenize(raw_text, only_alphanum=True,
                                     clean_punctuation=True)))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
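# Illustrative stand-in (assumption): the project-specific tokenize() helper used
# above is not shown in these snippets. This sketch mimics what only_alphanum=True
# and clean_punctuation=True are presumed to do, i.e. keep alphanumeric tokens and
# lower-case them, so the Document.tokenizer hook can be exercised in isolation.
import re

def simple_alphanum_tokenizer(raw_text):
    # Keep runs of letters/digits only, then lower-case each token.
    return [token.lower() for token in re.findall(r'[A-Za-z0-9]+', raw_text)]

# Example: simple_alphanum_tokenizer("Call me Ishmael.") -> ['call', 'me', 'ishmael']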
def process_item(self, item, spider):
    Document(
        name=item['name'],
        url=item['url'],
        text=item['text'],
        month=item['month'],
        year=int(item['year']),
    )
    try:
        odm_session.flush()
    except DuplicateKeyError:
        log.msg('Duplicate found', level=log.WARNING)
        return
    log.msg('Document saved', level=log.INFO)
    return item
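# Hedged sketch (assumption): a scrapy.Item carrying the fields process_item()
# reads above. The class name DocumentItem is hypothetical; the field names come
# directly from the pipeline code, and scrapy.Item / scrapy.Field are the
# standard Scrapy item API.
import scrapy

class DocumentItem(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()
    month = scrapy.Field()
    year = scrapy.Field()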
def populate_database():
    log.info("Populating database...")
    with open('lenin_work.json') as f:
        raw_works = json.load(f)

    log.info("Inserting %s works to database..." % (len(raw_works)))
    for raw_work in raw_works:
        try:
            Document(url=raw_work['url'],
                     text=raw_work['text'],
                     name=raw_work['name'],
                     month=raw_work['month'],
                     year=raw_work['year'])
            odm_session.flush()
        except DuplicateKeyError:
            log.info("Duplicate found, skipping...")
    log.info("Done")
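# Assumed shape of lenin_work.json, inferred only from the keys that
# populate_database() reads above; the values are placeholders, not real data.
example_raw_works = [
    {
        "url": "http://example.org/work-1",
        "name": "Example work",
        "text": "Full text of the work ...",
        "month": "Apr",
        "year": 1917,
    },
]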