Code Example #1
File: enron_bench.py Project: scarsman/microsearch
import glob
import os
import shutil
import time

import microsearch

# index_emails() and search_emails() are helper functions defined
# elsewhere in enron_bench.py (see the sketch after this example).


def main(enron_data_dir):
    data_dir = '/tmp/enron_index'
    shutil.rmtree(data_dir, ignore_errors=True)
    ms = microsearch.Microsearch(data_dir)

    print("Collecting the emails...")
    globby = os.path.join(enron_data_dir, '*/*/*/*.')
    all_emails = glob.glob(globby)[:1200]

    print("Starting indexing {0} docs...".format(len(all_emails)))
    start_time = time.time()
    per_doc_times = index_emails(ms, all_emails, enron_data_dir)
    time_to_index = time.time() - start_time

    per_doc_avg = sum(per_doc_times) / len(per_doc_times)

    print("Indexing complete.")
    print("Total time taken: {:.03f} seconds".format(time_to_index))
    print("Avg time per doc: {:.03f} seconds".format(per_doc_avg))

    print("Starting searching...")
    start_time = time.time()
    per_search_times = search_emails(ms)
    time_to_search = time.time() - start_time

    per_search_avg = sum(per_search_times) / len(per_search_times)

    print("Searching complete.")
    print("Total time taken: {:.03f} seconds".format(time_to_search))
    print("Avg time per query: {:.03f} seconds".format(per_search_avg))
Code Example #2
    def setUp(self):
        super(MicrosearchTestCase, self).setUp()
        self.base = os.path.join('/tmp', 'microsearch_tests')
        shutil.rmtree(self.base, ignore_errors=True)

        self.micro = microsearch.Microsearch(self.base)
        self.unhashed_micro = UnhashedMicrosearch(self.base)
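
This test fixture references an UnhashedMicrosearch class that is not shown here. In microsearch, terms are routed to on-disk segment files through a hash_name() method; a test helper like this one plausibly overrides it so the segment files keep human-readable names. A sketch under that assumption (the override below is a guess, not microsearch's shipped test code):

import microsearch


class UnhashedMicrosearch(microsearch.Microsearch):
    def hash_name(self, term, length=6):
        # Bypass hashing so on-disk segment names match the raw terms,
        # which makes the test fixtures easy to inspect.
        return term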
Code Example #3
from django.shortcuts import render
from data_contender.urls_data import urls
from data_contender.models import ContentderData
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import microsearch
from django.core.paginator import Paginator
import timeit
# Create your views here.
ms = microsearch.Microsearch('C:/Users/acer/BrainDigit(Inter)/search_data')


def get_data(request):
    template_name = 'complete_download.html'
    for url in urls:
        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()
        #print(contentder_url)
        print(url)

        page_soup = soup(page_html, "html.parser")
        rows = page_soup.find_all("div", {"class": "cRow"})
        for row in rows:
            component_text = ''
            print(row)
            html_data = str(row)
            components = row.find_all('div', {'class': 'editor-component'})
            for component in components:
                try:
                    data_type = component.attrs['data-type']
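
The view above is truncated mid-scrape by the source page. For illustration only (not the snippet's actual continuation), here is one way the collected component text could be fed into the module-level ms index and searched back through a paginated view; the function names, template name, and query parameters are hypothetical:

def index_page_text(url, texts):
    # Hypothetical helper: join the extracted component strings and
    # index them under the page URL as the document ID.
    ms.index(url, {'text': ' '.join(texts)})


def search_view(request):
    # Hypothetical search endpoint; the 'q' and 'page' GET parameters
    # are assumptions for this sketch.
    query = request.GET.get('q', '')
    results = ms.search(query)['results'] if query else []

    paginator = Paginator(results, 10)  # 10 hits per page
    page = paginator.get_page(request.GET.get('page'))
    return render(request, 'search_results.html', {'page': page})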