def _get_window_size_analysis(window_size):
    try:
        document = __document
        log.info("Checking window_size = %s" % window_size)
        # Check if a result for this window size already exists
        if InformationValueResult.query.find({
                "window_size": window_size,
                "document_id": document._id}).count() > 0:
            log.warning('Result already found on database')
            return
        iv_words = __information_value_calculator.information_value(window_size)
        try:
            InformationValueResult(window_size=window_size,
                                   document_id=document._id,
                                   iv_words=iv_words)
            log.info("Storing results for document %s, window_size %s" %
                     (document.name, window_size))
            odm_session.flush()
        except DuplicateKeyError:
            log.warning('Result already found')
        return (window_size,
                WindowAnalysis(window_size, iv_words,
                               number_of_words=__number_of_words))
    except WindowSizeTooLarge:
        return None
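# Usage sketch (not part of the original module): _get_window_size_analysis
# returns either (window_size, WindowAnalysis(...)) or None, where None means
# the window was too large or a result was already stored. A caller could
# sweep candidate sizes and keep only the ones that produced an analysis.
# The driver name and its window_sizes argument are assumptions for illustration.
def _sweep_window_sizes(window_sizes):
    analyses = {}
    for size in window_sizes:
        result = _get_window_size_analysis(size)
        if result is None:
            # Window too large for the document, or nothing new to compute
            continue
        size, analysis = result
        analyses[size] = analysis
    return analyses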
def load_moby_dick_analysis():
    tokens = get_moby_dick_tokens()
    text = gutenberg.raw('melville-moby_dick.txt')
    try:
        moby_dick_doc = Document(
            url='gutenberg',
            name='moby dick',
            text=text,
            month='Jan',
            year='1851',
        )
        odm_session.flush()
    except DuplicateKeyError:
        moby_dick_doc = Document.query.get(name='moby dick')

    for sum_threshold in sum_thresholds:
        log.info("Trying analysis for threshold = %s" % sum_threshold)
        analysis = get_optimal_window_size(tokens, window_sizes, 20,
                                           sum_threshold=sum_threshold)[1]
        anal_dict = analysis.encode()
        window_size = anal_dict['window_size']
        log.debug("Best result = %s" % window_size)
        InformationValueResult(
            window_size=window_size,
            threshold=sum_threshold,
            document=moby_dick_doc,
            iv_words=anal_dict['top_words'],
            max_iv=anal_dict['max_iv'],
            sum_iv=anal_dict['sum_iv'],
        )
        odm_session.flush()
def wget(url):
    log.info('Trying to fetch url: %s' % url)
    response = requests.get(url)
    Document(
        url=url,
        name=url,
        text=response.text,
    )
    odm_session.flush()
def _delete_non_best_analysis(document, window_sizes):
    best_res = document.get_information_value_result()
    log.info("Best window size for %s: %s" % (document.name, best_res.window_size))
    log.info("Removing other results from mongo")
    for one_res in document.results:
        if one_res.window_size != best_res.window_size:
            log.info("Removing window size %s" % one_res.window_size)
            one_res.delete()
    log.info("Flushing")
    odm_session.flush()
def process_item(self, item, spider):
    Document(
        name=item['name'],
        url=item['url'],
        text=item['text'],
        month=item['month'],
        year=int(item['year']),
    )
    try:
        odm_session.flush()
    except DuplicateKeyError:
        log.msg('Duplicate found', level=log.WARNING)
        return
    log.msg('Document saved', level=log.INFO)
    return item
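# Deployment sketch (assumption, not taken from the original project): a Scrapy
# item pipeline such as process_item's class only runs when it is enabled in the
# project settings. The module path 'crawler.pipelines.DocumentPipeline' and the
# priority 300 below are placeholders for illustration.
# settings.py:
#     ITEM_PIPELINES = {
#         'crawler.pipelines.DocumentPipeline': 300,
#     }
# Older Scrapy releases (matching the log.msg API used above) expect a plain
# list of pipeline class paths instead of a dict with priorities.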
def populate_database():
    log.info("Populating database...")
    with file('lenin_work.json') as f:
        raw_works = json.load(f)
    log.info("Inserting %s works to database..." % (len(raw_works)))
    for raw_work in raw_works:
        try:
            Document(url=raw_work['url'],
                     text=raw_work['text'],
                     name=raw_work['name'],
                     month=raw_work['month'],
                     year=raw_work['year'])
            odm_session.flush()
        except DuplicateKeyError:
            log.info("Duplicate found skipping...")
    log.info("Done")
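# Input shape sketch: judging from the keys read above, lenin_work.json is
# expected to hold a list of objects like the following (placeholder values,
# not real data):
# [
#   {
#     "url": "http://example.org/some-work",
#     "name": "Some work",
#     "text": "Full text of the work ...",
#     "month": "Apr",
#     "year": 1917
#   }
# ]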
def get_analysis_of_the_mind_document():
    with file("tests/analysis_of_the_mind.txt") as f:
        raw_text = f.read()
    document = Document(url='tests/analysis_of_the_mind.txt',
                        name='Analysis of the mind',
                        text=raw_text,
                        month='Dec',
                        year='1921')

    def tokenizer_wrapper(raw_text):
        return map(str.lower,
                   tokenize(raw_text, only_alphanum=True, clean_punctuation=True))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
def get_origin_of_species_document():
    with file("tests/origin.txt") as f:
        raw_text = f.read()
    document = Document(url='tests/origin.txt',
                        name='Origin of species',
                        text=raw_text,
                        month='Nov',
                        year='1859')

    def tokenizer_wrapper(raw_text):
        return map(str.lower,
                   tokenize(raw_text, only_alphanum=True, clean_punctuation=True))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
def get_moby_dick_document():
    moby_dick = gutenberg.raw('melville-moby_dick.txt')
    document = Document(
        url='melville-moby_dick.txt',
        name='Moby dick',
        text=moby_dick,
        month='Oct',
        year=1851,
    )

    # The document uses its tokenizer func to create tokens; since we need to
    # enforce only_alphanum and clean_punctuation, we wrap tokenize here.
    def tokenizer_wrapper(raw_text):
        return map(str.lower,
                   map(str, tokenize(raw_text, only_alphanum=True,
                                     clean_punctuation=True)))

    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
def reset_senses():
    log.info("Resetting all senses")
    for doc in Document.query.find():
        doc.related_sense = {}
    odm_session.flush()