def main(enron_data_dir):
    """Benchmark Microsearch indexing and searching and print the timings.

    Rebuilds the index directory ``/tmp/enron_index`` from scratch, indexes
    at most 1200 files matched by ``*/*/*/*.`` beneath ``enron_data_dir``,
    then runs the search pass. For each phase the total wall-clock time and
    the per-item average are printed.

    Args:
        enron_data_dir: Root directory that the glob pattern is joined onto.
    """
    data_dir = '/tmp/enron_index'
    # Remove any index left behind at this path; removal errors are ignored.
    shutil.rmtree(data_dir, ignore_errors=True)

    ms = microsearch.Microsearch(data_dir)

    print("Collecting the emails...")
    globby = os.path.join(enron_data_dir, '*/*/*/*.')
    # Cap the corpus at the first 1200 matches.
    all_emails = glob.glob(globby)[:1200]

    print("Starting indexing {0} docs...".format(len(all_emails)))
    start_time = time.time()
    per_doc_times = index_emails(ms, all_emails, enron_data_dir)
    time_to_index = time.time() - start_time

    # An empty result (e.g. the glob matched nothing) must not crash the
    # report with ZeroDivisionError; report a 0.0 average instead.
    doc_count = len(per_doc_times)
    per_doc_avg = sum(per_doc_times) / doc_count if doc_count else 0.0

    print("Indexing complete.")
    print("Total time taken: {:.03f} seconds".format(time_to_index))
    print("Avg time per doc: {:.03f} seconds".format(per_doc_avg))

    print("Starting searching...")
    start_time = time.time()
    per_search_times = search_emails(ms)
    time_to_search = time.time() - start_time

    # Same guard for the search phase.
    search_count = len(per_search_times)
    per_search_avg = (
        sum(per_search_times) / search_count if search_count else 0.0
    )

    print("Searching complete.")
    print("Total time taken: {:.03f} seconds".format(time_to_search))
    print("Avg time per query: {:.03f} seconds".format(per_search_avg))
def setUp(self):
    """Create a fresh index location and the two engines used by the tests.

    After the parent ``setUp`` runs, ``self.base`` is set to the
    ``microsearch_tests`` directory under ``/tmp``, that directory tree is
    removed, and ``self.micro`` / ``self.unhashed_micro`` are constructed
    with the same base path.
    """
    index_root = os.path.join('/tmp', 'microsearch_tests')

    super(MicrosearchTestCase, self).setUp()
    self.base = index_root

    # Clear out whatever is at the path; errors during removal are ignored.
    shutil.rmtree(index_root, ignore_errors=True)

    # Both engines are given the same base path.
    self.micro = microsearch.Microsearch(index_root)
    self.unhashed_micro = UnhashedMicrosearch(index_root)
from django.shortcuts import render from data_contender.urls_data import urls from data_contender.models import ContentderData from bs4 import BeautifulSoup as soup from urllib.request import urlopen as uReq import microsearch from django.core.paginator import Paginator import timeit # Create your views ms = microsearch.Microsearch('C:/Users/acer/BrainDigit(Inter)/search_data') def get_data(request): template_name = 'complete_download.html' for url in urls: uClient = uReq(url) page_html = uClient.read() uClient.close() #print(contentder_url) print(url) page_soup = soup(page_html, "html.parser") rows = page_soup.find_all("div", {"class": "cRow"}) for row in rows: component_text = '' print(row) html_data = str(row) components = row.find_all('div', {'class': 'editor-component'}) for component in components: try: data_type = component.attrs['data-type']