Code example #1
import os
import xml.etree.ElementTree as ET

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StemmingAnalyzer, CharsetFilter
from whoosh.support.charset import charset_table_to_dict, default_charset


def createSearchableData():
    # accent/case folding via the Sphinx-style charset table, chained after stemming
    charmap = charset_table_to_dict(default_charset)
    custom_analyzers = StemmingAnalyzer() | CharsetFilter(charmap)

    schema = Schema(title=TEXT(stored=True, field_boost=3.0),
                    ID=ID(stored=True, unique=True),
                    url=TEXT(stored=True),
                    textdata=TEXT(stored=True, analyzer=custom_analyzers, field_boost=0.8))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    ix = create_in("indexdir",schema)
    writer = ix.writer()

    path = os.path.relpath("/dump/dump_grande.xml", start="/")
    root = ET.parse(path)
    xml_data = {}
    for item in root.iter():
        if item.tag == 'root':
            continue  # nothing to index on the root element
        elif item.tag == 'row' and len(xml_data) > 0:
            writer.add_document(title=xml_data['title'], ID=xml_data['id'], url=xml_data['url'], textdata=xml_data['text'])
            xml_data = {}
        else:
            xml_data[item.tag] = item.text

    # flush the last accumulated row before committing
    if xml_data:
        writer.add_document(title=xml_data['title'], ID=xml_data['id'], url=xml_data['url'], textdata=xml_data['text'])
    writer.commit()
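
A minimal sketch of how the "indexdir" index built above could then be queried; the query string is illustrative, and the field names come from the schema defined in the example:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("indexdir")
with ix.searcher() as searcher:
    # parse against the analyzed "textdata" field and print stored metadata
    query = QueryParser("textdata", ix.schema).parse("example query")
    for hit in searcher.search(query, limit=10):
        print(hit["title"], hit["url"])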
Code example #2
import pathlib
from os.path import join

from whoosh import scoring
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser, AndGroup
from whoosh.analysis import StemmingAnalyzer
from whoosh.support.charset import charset_table_to_dict, default_charset


def make_search_service(search_text):
  charmap = charset_table_to_dict(default_charset)
  custom_analyzers = StemmingAnalyzer()

  index_path = join(pathlib.Path(__file__).parent.parent.absolute(), 'indexdir')
  myindex = open_dir(index_path)
  qp = MultifieldParser(["title", "textdata"], schema=myindex.schema, group=AndGroup, fieldboosts={'title': 3.0, 'textdata': 0.8})
  qstring = search_text
  q = qp.parse(qstring)

  results_list = []

  myWeighting = scoring.MultiWeighting(scoring.BM25F(textdata_B=0.5), textdata=scoring.Frequency(), title=scoring.BM25F(title_B=2.0))
  with myindex.searcher(weighting=myWeighting) as s:
    results = s.search(q, limit=30, terms=True)

    # "did you mean" suggestion and "results for" fallback
    corrected = s.correct_query(q, qstring)
    did_you_mean = ""
    result_for = ""
    if corrected.query != q:
      if len(results) < 1:
        results = s.search(qp.parse(corrected.string), limit=30, terms=True)
        result_for = corrected.string
      else:
        did_you_mean = corrected.string


    # query expansion: search again with the key terms of the top hits and merge the results
    keywords = [keyword for keyword, score in results.key_terms("textdata", docs=3, numterms=5)]
    if keywords:
      query_keyword = qp.parse(" ".join(keywords))
      results_keyword = s.search(query_keyword, limit=30, terms=True)
      results.upgrade_and_extend(results_keyword)

    #sorting
    key_sort = lambda result: result.score
    results = sorted(results, key=key_sort, reverse=True)

    
    for ris in results:
      result = {}
      result['title'] = ris['title']
      result['url'] = ris['url']
      result['id'] = ris['ID']
      result['highlight'] = ris.highlights("textdata")
      results_list.append(result)


    # used to compute precision and recall
    id_results = [ris['id'] for ris in results_list[:10]]

    return {
      'search_text': search_text,
      'results': results_list, 
      'did_you_mean': did_you_mean,
      'result_for': result_for,
      'results_ids': id_results
    }
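
A hedged usage sketch of the service above; the query text is illustrative and the keys mirror the dict returned by the function:

response = make_search_service("information retrieval")
if response['did_you_mean']:
    print("Did you mean:", response['did_you_mean'])
for hit in response['results'][:5]:
    print(hit['title'], hit['url'])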
Code example #3
File: test_analysis.py Project: gsadikin/whoosh
def test_charset_pickeability():
    from whoosh.support import charset
    charmap = charset.charset_table_to_dict(charset.default_charset)
    ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap)
    _ = dumps(ana, -1)

    ana = analysis.CharsetTokenizer(charmap)
    _ = dumps(ana, -1)
Code example #4
File: test_analysis.py Project: CuteCha/dssm-theano
def test_charset_pickeability():
    from whoosh.support import charset
    charmap = charset.charset_table_to_dict(charset.default_charset)
    ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap)
    _ = dumps(ana, -1)

    ana = analysis.CharsetTokenizer(charmap)
    _ = dumps(ana, -1)
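
For context, a small sketch (sample text is illustrative) of what the chained CharsetFilter in these tests does at analysis time: tokens are lowercased by the analyzer and then folded through the charmap, which is one reason the pipeline needs to stay pickleable (e.g. for multiprocess indexing):

from whoosh import analysis
from whoosh.support import charset

charmap = charset.charset_table_to_dict(charset.default_charset)
ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap)
print([t.text for t in ana(u"Café Déjà Vu")])  # accented tokens come out folded, e.g. ['cafe', 'deja', 'vu']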
Code example #5
File: __init__.py Project: Bouska/memopol2
def MemopolAnal():
    # fold accents/case with the charset table built from default_charset
    charmap = charset_table_to_dict(default_charset)
    return anal.RegexTokenizer(r"\w+") | anal.LowercaseFilter() | anal.CharsetFilter(charmap)
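
A brief sketch of how such an analyzer factory is typically wired into a schema; the field name is illustrative:

from whoosh.fields import Schema, TEXT

schema = Schema(name=TEXT(stored=True, analyzer=MemopolAnal()))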
Code example #6
File: iTunesXML.py Project: athoune/ShareMyTunes
#!/usr/bin/env python

import urllib
from urlparse import urlparse
import os.path

from whoosh.support.charset import charset_table_to_dict, default_charset
from genshi.input import XMLParser, START, TEXT, END

__author__ = "mlecarme"

no_accent = charset_table_to_dict(default_charset)

"""
[TODO] indexing artworks
"""
class ItunesParser:
	"""
	Event based iTunes XML parser
	"""
	def __init__(self, path):
		self.stream = XMLParser(open(path,'r'))
	def __iter__(self):
		ouvrant = None
		indentation = 0
		tracks = False
		valeur = False
		piste = {}
		albums = set()
		artists = set()
		for kind, data, pos in self.stream:
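
The snippet is truncated before no_accent is actually used; a likely application, shown here only as an assumption, is folding track metadata with unicode.translate(), which is the same way Whoosh's CharsetFilter applies such a table:

def fold(text):
    # hypothetical helper: fold accents/case via the charset table
    # (characters outside the table may be dropped by translate)
    return text.translate(no_accent)

print(fold(u"Björk"))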
Code example #7
File: indexing.py Project: rahulkant/i-search
import os
import ssl
import urllib.error
from urllib.request import urlopen

from bs4 import BeautifulSoup
from whoosh import index
from whoosh.fields import Schema, TEXT, ID, STORED, KEYWORD
from whoosh.analysis import StemmingAnalyzer, CharsetFilter, StopFilter
from whoosh.support.charset import charset_table_to_dict, default_charset


def indexer():
    charmap = charset_table_to_dict(default_charset)
    my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap) | StopFilter()

    schema = Schema(url=ID(stored=True),
                    title=TEXT(stored=True),
                    content=TEXT(stored=True,
                                 analyzer=my_analyzer,
                                 spelling=True),
                    data=STORED,
                    tags=KEYWORD(stored=True),
                    extension=TEXT(stored=True))

    if not os.path.exists("everywhere"):
        os.mkdir("everywhere")
    if not os.path.exists("pdf"):
        os.mkdir("pdf")
    if not os.path.exists("doc"):
        os.mkdir("doc")
    if not os.path.exists("tar"):
        os.mkdir("tar")
    if not os.path.exists("jpg"):
        os.mkdir("jpg")
    if not os.path.exists("forms"):
        os.mkdir("forms")

    i_a = index.create_in("everywhere", schema)
    writer_a = i_a.writer()

    i_b = index.create_in("pdf", schema)
    writer_b = i_b.writer()

    i_c = index.create_in("doc", schema)
    writer_c = i_c.writer()

    i_d = index.create_in("tar", schema)
    writer_d = i_d.writer()

    i_e = index.create_in("jpg", schema)
    writer_e = i_e.writer()

    i_f = index.create_in("forms", schema)
    writer_f = i_f.writer()

    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    specific = [
        '.jpg', '.exe', '.pdf', '.doc', '.zip', '.xls', 'pptx', 'docx', 'r.gz',
        '.iso', 'jpeg', '.gif', '.png'
    ]
    ignore = ['calendar', 'events', 'mailto']
    with open('intranet/crawled.txt', "r") as fp:
        num = 0
        for line in fp:
            num = num + 1
            print("Extracting link" + str(num))
            line = line.replace('\n', '')
            #if line[-4: ] not in specific:
            if all(item not in line.lower() for item in ignore):
                try:
                    if all(item not in line.lower() for item in specific):
                        print(line)
                        # fetch the page with the relaxed SSL context created above
                        html = urlopen(line, context=ctx)
                        soup = BeautifulSoup(html, "html.parser")
                        for script in soup(["script", "style"]):
                            script.extract()

                        try:
                            heading = soup.title.string
                        except AttributeError:
                            # fall back to the URL when the page has no <title>
                            heading = line
                        #print (str(heading))

                        try:
                            content = soup.body.get_text()

                        except AttributeError:
                            content = ""

                        tags = ""
                        try:
                            for h in soup.findAll(
                                ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']):
                                tags = tags + " " + h.string
                        except:
                            pass
                    else:
                        #pattern = re.compile('[\W_]+')
                        heading = line
                        #heading = pattern.sub(' ',heading)
                        #re.sub(r'[\W_]+','', heading)
                        #heading = heading.split()
                        content = line.split()
                        tags = ""

                    title = str(heading)
                    #print (title)
                    tags = str(tags)
                    content = str(content)
                    #print ("content")
                    url = str(line)
                    extension = str(line[-4:])
                    writer_a.add_document(url=url,
                                          title=title,
                                          data=content,
                                          content=content,
                                          tags=tags,
                                          extension=extension)
                    if "pdf" in line.lower():
                        writer_b.add_document(url=url,
                                              title=title,
                                              data=content,
                                              content=content,
                                              tags=tags,
                                              extension=extension)
                        print("added to pdf")
                    elif (".doc"
                          in line.lower()) or (".ppt" in line.lower()) or (
                              ".xls" in line.lower()) or (
                                  "docx" in line.lower()) or (".ppt"
                                                              in line.lower()):
                        writer_c.add_document(url=url,
                                              title=title,
                                              data=content,
                                              content=content,
                                              tags=tags,
                                              extension=extension)
                        print("added to doc")
                    elif (".exe"
                          in line.lower()) or (".iso" in line.lower()) or (
                              ".zip" in line.lower()) or ("r.gz"
                                                          in line.lower()):
                        writer_d.add_document(url=url,
                                              title=title,
                                              data=content,
                                              content=content,
                                              tags=tags,
                                              extension=extension)
                        print("added to tar")
                    elif (".jpeg"
                          in line.lower()) or (".jpg" in line.lower()) or (
                              ".gif" in line.lower()) or (".png"
                                                          in line.lower()):
                        writer_e.add_document(url=url,
                                              title=title,
                                              data=content,
                                              content=content,
                                              tags=tags,
                                              extension=extension)
                        print("added to jpg")
                    elif "form" in line.lower():
                        writer_f.add_document(url=url,
                                              title=title,
                                              data=content,
                                              content=content,
                                              tags=tags,
                                              extension=extension)
                        print("added to form")
                    else:
                        print("adding to everywhere")

                    #writer_a.add_document(url=url, title=title, data=content, content=content, tags=tags)
                    print("added To whoosh")
                except urllib.error.HTTPError:
                    print("HTTP Error")
                    #	test = "True"
                except (ConnectionResetError, urllib.error.URLError):
                    print("Connection Reset Fail")
            else:
                print("ignored this url")
        writer_a.commit()
        writer_b.commit()
        writer_c.commit()
        writer_d.commit()
        writer_e.commit()
        writer_f.commit()
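
A hedged sketch of querying one of the per-type indexes created above; the query text is illustrative, and the fields read back are the stored ones from the schema:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("pdf")
with ix.searcher() as searcher:
    q = QueryParser("content", ix.schema).parse("annual report")
    for hit in searcher.search(q, limit=10):
        print(hit["title"], hit["url"], hit["extension"])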
Code example #8
import os
import json
import string
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.writing import AsyncWriter
from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer, CharsetFilter
from whoosh.support.charset import default_charset, charset_table_to_dict

charmap = charset_table_to_dict(default_charset)


def processText(text):
    """ Questo metodo si occupa di processare il testo prima di inserirlo nell'index.
        Nello specifico, elimina i caratteri di punteggiatura e scarta le parole lunghe solo una lettera.
        Inoltre, elimina anche le stopwords, esegue lo stemming delle parole e normalizza
        le lettere accentate ed altre lettere in testo appartenente all'ASCII
        :rtype: list
    """

    #(, filterStopwords=False, stemming=False, normalizeAccents=False, minLength=1)

    # tokenization
    # tokens = nltk.wordpunct_tokenize(text)

    # tokenizer = RegexTokenizer()

    # if stemming:
    # 	if filterStopwords:
    # 		analyzer = StemmingAnalyzer()
    # 	else:
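
The snippet above is cut off inside the commented-out block. Based on the docstring, a minimal sketch of what processText could return, using only the module-level imports already shown; the exact pipeline in the original file may differ:

def processText(text):
    # StemmingAnalyzer tokenizes on word characters (dropping punctuation), lowercases,
    # removes stopwords and stems; CharsetFilter(charmap) folds accented letters to ASCII;
    # minsize=2 discards one-letter words, as described in the docstring
    analyzer = StemmingAnalyzer(minsize=2) | CharsetFilter(charmap)
    return [token.text for token in analyzer(text)]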