Code example #1
(i.e. law in most cases) may contain. 

law_name -- name of the document. Searchable and stored.
law_body -- the intro and articles of a law. Searchable only.
law_num_date -- the number of the law and the exact date. Searchable and stored.
pub_year -- the year of the Official Gazette publication. Sortable and stored.
article_one -- title and first few sentences of article one. Stored only for displaying in search results.

"""

schema = Schema(
    law_name=TEXT(analyzer=lang_ana, stored=True),
    law_body=TEXT(analyzer=lang_ana),
    law_num_date=ID(stored=True),

    # A doc can have multiple
    agency_tag=KEYWORD(stored=True),
    content_type_tag=KEYWORD(stored=True),
    pub_year=NUMERIC(sortable=True, stored=True),
    article_one_title=STORED,
    article_one_str=STORED)

# CREATE AN INDEX
"""
The documents will be stored according to the defined schema.
Fields that are indexed can be "searched." Some fields can be 
stored without being indexed... just so they show up in search results.
"""

# To create (or open existing) index directory
if os.path.exists("scripts/indexdir"):
Code example #2
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)
            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
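Rough usage illustration (an assumption, not part of the original backend): with a configured Haystack connection whose Whoosh backend uses the build_schema() above, the returned tuple can be inspected like this:

# Sketch only; assumes Haystack's 'default' connection is configured
# and this backend is registered for it.
from haystack import connections

backend = connections['default'].get_backend()
index_fields = connections['default'].get_unified_index().all_searchfields()
content_field_name, whoosh_schema = backend.build_schema(index_fields)
print(content_field_name, sorted(whoosh_schema.names()))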
Code example #3
    if i[1] >= 10:
        bookdict_all_sort_upper10.append(i)

remove_words = set({
    "(", ")", "(", ")", "[", "]", "「", "」", "+", "-", "*", "$", "'", '"', "、",
    ".", "”", "’", ":", ";", "_", "/", "?", "!", "。", ",", "=", "=", " ", '『',
    '』'
})

# sudachi tokenizer setup
tokenizer_obj = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.C

# Disable the stoplist on content so single-character tokens are allowed
schema = Schema(title=TEXT(stored=True),
                content=TEXT(stored=True,
                             analyzer=StandardAnalyzer(stoplist=None)),
                count=NUMERIC(stored=True, sortable=True))
if not os.path.exists("heroku_index"):
    os.mkdir("heroku_index")
ix = create_in("heroku_index", schema)

# Build the index
writer = ix.writer()
for num in range(len(bookdict_all_sort_upper10)):
    titlewords = set([
        m.surface() for m in tokenizer_obj.tokenize(
            bookdict_all_sort_upper10[num][0], mode)
    ])
    titlewords = titlewords.union(
        set([
            m.normalized_form() for m in tokenizer_obj.tokenize(
Code example #4
def main():
    file_content_doc1 = open("rural_min.txt").read()
    file_content_doc2 = open("science_min.txt").read()
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")
        if option == "1":

            sent_tokenize_list1 = sent_tokenize(file_content_doc1,
                                                language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2,
                                                language='english')
            if not os.path.exists("index_task3_min"):
                os.mkdir("index_task3_min")

            my_analyzer = RegexTokenizer() | StopFilter() | LowercaseFilter() | Lemmatizer()
            pos_tagger = RegexTokenizer() | StopFilter() | LowercaseFilter() | PosTagger()
            wordnetsyn1 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets()
            wordnetsyn2 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets1()
            wordnetsyn3 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets2()
            wordnetsyn4 = RegexTokenizer() | StopFilter() | LowercaseFilter() | WordNetSynsets3()

            schema = Schema(id=ID(stored=True, unique=True),
                            standard=TEXT(stored=True,
                                          analyzer=StandardAnalyzer()),
                            stem_text=TEXT(stored=True,
                                           analyzer=StemmingAnalyzer()),
                            lemma=TEXT(stored=True, analyzer=my_analyzer),
                            pos_text=TEXT(stored=True, analyzer=pos_tagger),
                            hypernym=TEXT(stored=True, analyzer=wordnetsyn1),
                            hyponym=TEXT(stored=True, analyzer=wordnetsyn2),
                            holonym=TEXT(stored=True, analyzer=wordnetsyn3),
                            meronyms=TEXT(stored=True, analyzer=wordnetsyn4),
                            dependency=TEXT(analyzer=DependencyParser()))

            ix = index.create_in("index_task3_min", schema)
            writer = ix.writer()

            for sentence in sent_tokenize_list1:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            writer.commit()

            print_index_details(ix)

            print("\n\n Index created with various features as its fields")

        elif option == "2":
            ix = index.open_dir("index_task3")

            with ix.searcher(weighting=whoosh.scoring.BM25F()) as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query_text = MultifieldParser([
                    "standard", "stem_text", "lemma", "pos_text", "hyponym",
                    "meronyms", "hypernym", "holonym"
                ],
                                              schema=ix.schema,
                                              group=og).parse(q)
                results = searcher.search(query_text, limit=10)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["standard"], sep=":")
                    print("\n")

        elif option == "3":
            print("\n Goodbye")
            sys.exit(0)
            option = None
        else:
            print("\n Not valid choice try again...!")
Code example #5
File: web.py  Project: gaosaroma/Web-Design
import os
import shutil

import pymongo
from flask import Flask, render_template, request, jsonify
from whoosh.fields import Schema, TEXT, ID, KEYWORD
from whoosh.index import create_in, open_dir
from whoosh.qparser import MultifieldParser
# jieba ships with built-in integration for whoosh
from jieba.analyse import ChineseAnalyzer

# Build the Chinese analyzer
analyzer = ChineseAnalyzer()

# Search over url, title, tags, note and article,
# then fetch the full records from the database by the matched _id.
schema = Schema(
    nid=ID(unique=True, stored=True),
    title=TEXT(phrase=False),
    tags=KEYWORD(lowercase=True, commas=True, scorable=True),
    people=KEYWORD(lowercase=True, commas=True, scorable=True),
)


# Create the on-disk index
def init_search():
    if os.path.exists("indexdir"):
        shutil.rmtree('indexdir')
    os.mkdir("indexdir")
    create_in("indexdir", schema)
    return open_dir("indexdir")


# Initialize the search index
ix = init_search()
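A rough usage sketch (not part of the original file) for the index returned by init_search() above: add a document, then search it with the already-imported MultifieldParser.

# Sketch only: field names come from the schema defined above.
writer = ix.writer()
writer.add_document(nid=u"1",
                    title=u"Whoosh full-text search notes",
                    tags=u"python,whoosh,search",
                    people=u"alice")
writer.commit()

parser = MultifieldParser(["title", "tags", "people"], schema=ix.schema)
with ix.searcher() as searcher:
    for hit in searcher.search(parser.parse(u"whoosh")):
        print(hit["nid"])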
Code example #6
File: views.py  Project: ag-gipp/docker-receval
    def get_absolute_url(self):
        return reverse('recommendations') + '?seed=%s' % self.paper_id

    def get_title(self):
        return self.title

    def set_rank(self, rank):
        self.rank = rank
        return self


paper_schema = Schema(
    paper_id=ID(stored=True),
    title=TEXT(stored=True),
    abstract=TEXT(analyzer=StemmingAnalyzer()),
    paper_url=TEXT(),
    aspect_tasks=KEYWORD,
    aspect_methods=KEYWORD,
    aspect_datasets=KEYWORD,
)

if settings.ASPECT_KNN_WHOOSH_INDEX_PATH and os.path.exists(settings.ASPECT_KNN_WHOOSH_INDEX_PATH):
    ix = index.open_dir(settings.ASPECT_KNN_WHOOSH_INDEX_PATH)  #'/Users/maos01/Desktop/special-docembeds-release-files/output/pwc/whoosh_index'
else:
    ix = None


# Load vector models
generic_vecs = (KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_GENERIC_W2V_PATH,
                                                  limit=settings.ASPECT_KNN_LIMIT)
                if settings.ASPECT_KNN_GENERIC_W2V_PATH and os.path.exists(settings.ASPECT_KNN_GENERIC_W2V_PATH)
                else None)  #  '/Users/maos01/Downloads/specter.1k.w2v.txt'
task_vecs = (KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_TASK_W2V_PATH,
                                               limit=settings.ASPECT_KNN_LIMIT)
             if settings.ASPECT_KNN_TASK_W2V_PATH and os.path.exists(settings.ASPECT_KNN_TASK_W2V_PATH)
             else None)
method_vecs = (KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_METHOD_W2V_PATH,
                                                 limit=settings.ASPECT_KNN_LIMIT)
               if settings.ASPECT_KNN_METHOD_W2V_PATH and os.path.exists(settings.ASPECT_KNN_METHOD_W2V_PATH)
               else None)
Code example #7
File: GilIndexer.py  Project: gilwalzer/ir-kgram
import string

from nltk import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from whoosh import index
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED

p_s = PorterStemmer()
s_s = SnowballStemmer("english")
kgram_numbers = [4, 5]  #int(sys.argv[1])
translate_tab = string.maketrans("", "")

stops = stopwords.words("english")
stopset = set()
for each in stops:
    stopset.add(each)

my_schema = Schema(docId=ID(stored=True),
                   title=TEXT(stored=True),
                   body=TEXT(),
                   tags=KEYWORD(stored=True))


def strip_content(content):
    sb = ""
    title = ""
    c = content.split("\n")
    for line in c:
        if line != "\n" and line != "" and len(line) > 1:
            title = line.rstrip()
            print(title)
            break
    for line in c:
        try:
            q = line.decode('ascii')
Code example #8
    def test_build_attrs(self):
        schema = Schema()
        adapter = SAAdapter(SANotIndexable, schema)
        assert not adapter.indexable
        assert adapter.doc_attrs == {}

        adapter = SAAdapter(Entity, schema)
        assert adapter.indexable == False

        adapter = SAAdapter(SubclassEntityIndexable, schema)
        assert adapter.indexable
        assert set(adapter.doc_attrs) == {
            'object_key',
            'id',
            'name',
            'slug',
            'object_type',
            'text',
            'created_at',
            'updated_at',
            'name_prefix',
            'owner',
            'owner_name',
            'creator_name',
            'creator',
            'allowed_roles_and_users',
            'tag_ids',
            'tag_text',
        }
        assert all(callable(f)
                   for f in six.itervalues(adapter.doc_attrs))

        assert set(schema.names()) == {
            'object_key',
            'id',
            'object_type',
            'name',
            'slug',
            'text',
            'created_at',
            'updated_at',
            'name_prefix',
            'owner',
            'owner_name',
            'creator_name',
            'creator',
            'allowed_roles_and_users',
            'tag_ids',
            'tag_text',
        }

        schema = Schema(id=NUMERIC(numtype=int,
                                   bits=64,
                                   signed=False,
                                   stored=True,
                                   unique=True), )
        adapter = SAAdapter(Indexable, schema)
        assert adapter.indexable
        assert set(adapter.doc_attrs) == {'id', 'text', 'num', 'name'}
        assert all(callable(f)
                   for f in six.itervalues(adapter.doc_attrs))

        assert set(schema.names()) == {'id', 'text', 'num', 'name'}
        assert isinstance(schema['text'], TEXT)
        assert isinstance(schema['num'], NUMERIC)
Code example #9
          countryOfOrigin text,
          overview text)''')
c.close()

# initialise sentic net
sn = SenticNet()
# stems, removes accents (so e.g. "cafe" matches "café" and "facade" matches "façade") and removes stopwords
hsn_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | StopFilter()

SCHEMA = Schema(
    filename=ID(unique=True, stored=True, analyzer=hsn_analyzer),
    content=TEXT(analyzer=hsn_analyzer, spelling=True),
    price=NUMERIC(sortable=True, stored=True),
    rating=NUMERIC(sortable=True, stored=True),
    noOfReviews=NUMERIC(sortable=True, stored=True),
    savings=NUMERIC(sortable=True, stored=True),
    percentageSavings=NUMERIC(sortable=True, stored=True),
    review=TEXT(analyzer=hsn_analyzer, spelling=True),
    productDesc=TEXT(stored=True),
    reviewPolarity=NUMERIC(sortable=True, stored=True),
    countryOfOrigin=TEXT(sortable=True, stored=True),
    overview=TEXT(stored=True),
)


# function to check if a string contains a float
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False
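Since content and review are indexed with spelling=True, the index can also power "did you mean" suggestions. A minimal sketch (an assumption, not from the original file) using Whoosh's correct_query, given an index ix built from SCHEMA:

from whoosh.qparser import QueryParser

# Sketch only: `ix` is assumed to be an index created with SCHEMA above.
with ix.searcher() as searcher:
    qp = QueryParser("content", schema=ix.schema)
    qstring = "chocolte biscuts"
    query = qp.parse(qstring)
    corrected = searcher.correct_query(query, qstring)
    if corrected.query != query:
        print("Did you mean:", corrected.string)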
Code example #10
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser
from whoosh.query import Term
from whoosh.fields import Schema, TEXT, ID, KEYWORD

from corpint.core import project
from corpint.model.entity import Entity

schema = Schema(uid=ID(stored=True),
                fingerprint=TEXT,
                country=KEYWORD,
                name=TEXT(stored=True))


class EntityIndex(object):
    def __init__(self):
        storage = RamStorage()
        self.index = storage.create_index(schema)

    def build(self):
        project.log.info("Building entity search index...")
        writer = self.index.writer()
        q = Entity.find_by_origins(origins=[])
        q = q.filter(Entity.active == True)  # noqa
        count = 0
        for entity in q:
            for fp in entity.fingerprints:
                writer.add_document(uid=entity.uid,
                                    fingerprint=fp,
                                    country=entity.country,
                                    name=entity.name)
Code example #11
def get_response_schema():
    return Schema(link_tema=ID(stored=True),
                  fecha=DATETIME(stored=True),
                  texto=TEXT(stored=True),
                  autor=TEXT(stored=True))
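A short illustrative sketch (not part of the original) of using this schema, in particular filtering on the fecha DATETIME field with a date-range query:

import datetime

from whoosh.filedb.filestore import RamStorage
from whoosh.query import DateRange

# Sketch only: in-memory index built from the schema above.
ix = RamStorage().create_index(get_response_schema())
with ix.writer() as writer:
    writer.add_document(link_tema=u"tema-1",
                        fecha=datetime.datetime(2021, 5, 1, 12, 0),
                        texto=u"Texto de la respuesta",
                        autor=u"usuario")

with ix.searcher() as searcher:
    q = DateRange("fecha",
                  datetime.datetime(2021, 1, 1),
                  datetime.datetime(2022, 1, 1))
    print([hit["link_tema"] for hit in searcher.search(q)])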
Code example #12
from bs4 import BeautifulSoup
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser
import re, os, codecs, sys


# Filter out elements inside style/script/[document]/head/title tags, as well as HTML comments
def visible(element):
    if element.parent.name in [
            'style', 'script', '[document]', 'head', 'title'
    ]:
        return False
    elif re.match('<!--.*-->', element.encode('utf-8')):
        return False
    return True


schema = Schema(id=ID(stored=True), content=TEXT(stored=True))
dir = os.listdir('sample')
ix = create_in("database", schema)
for l in dir:
    print(l)
    p = "sample/" + l
    html = codecs.open(p, "r", "utf-8").read()
    soup = BeautifulSoup(html, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(visible, texts)
    s = u''
    for elem in visible_texts:
        if (elem != u''):
            s += elem.strip(" \n\t\r") + " "
    writer = ix.writer()
    writer.add_document(id=l.strip(".html").decode('utf-8'), content=s)
Code example #13
def test_build_attrs_2() -> None:
    schema = Schema()
    adapter = SAAdapter(Entity, schema)
    assert adapter.indexable == False
Code example #14
def test_build_attrs_1() -> None:
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    assert not adapter.indexable
    assert adapter.doc_attrs == {}
Code example #15
File: whooshfdw.py  Project: shirou/whooshfdw
    def __init__(self, options, columns):
        super(WhooshFDW, self).__init__(options, columns)
        self.columns = columns
        self.indexdir = options["indexdir"]
        self.schema = Schema(title=NGRAM(stored=True))
Code example #16
File: movie_store.py  Project: tonyjhuang/ferriswheel
from whoosh.fields import Schema, KEYWORD, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

SCHEMA = Schema(title=TEXT(stored=True), keywords=KEYWORD)


class MovieStore:
    """Interface for searching movies by keyword."""
    def __init__(self, data_source):
        self.index = RamStorage().create_index(SCHEMA)
        self.data_source = data_source

    def initialize(self):
        writer = self.index.writer()
        for doc in self.data_source.get_documents():
            writer.add_document(**doc)
        writer.commit()

    def query_for_titles(self, keywords):
        with self.index.searcher() as searcher:
            query = QueryParser("keywords",
                                self.index.schema).parse(" ".join(keywords))
            return map(lambda res: str(res['title']), searcher.search(query))
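A small usage sketch (not in the original module), assuming a data source whose get_documents() yields dicts matching SCHEMA:

# Hypothetical data source for illustration.
class ListDataSource:
    def __init__(self, docs):
        self.docs = docs

    def get_documents(self):
        return self.docs


store = MovieStore(ListDataSource([
    {"title": u"The Matrix", "keywords": u"scifi hacker dystopia"},
    {"title": u"Spirited Away", "keywords": u"animation spirits bathhouse"},
]))
store.initialize()
print(list(store.query_for_titles(["scifi"])))  # -> ['The Matrix']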
Code example #17
File: practicaWHB.py  Project: sergioperez1998/AII
def get_schema():
    return Schema(titulo=TEXT(stored=True),
                  fecha=DATETIME(stored=True),
                  enlace=TEXT(stored=True),
                  resumen=TEXT(stored=True),
                  nombrefichero=ID(stored=True))
Code example #18
File: Indexer.py  Project: Mrnmap/ALLInfo
    publisher.channel = "Script"

    config_section = 'Indexer'

    p = Process(config_section)

    # Indexer configuration - index dir and schema setup
    baseindexpath = join(os.environ['AIL_HOME'],
                         p.config.get("Indexer", "path"))
    indexRegister_path = join(os.environ['AIL_HOME'],
                              p.config.get("Indexer", "register"))
    indexertype = p.config.get("Indexer", "type")
    INDEX_SIZE_THRESHOLD = int(p.config.get("Indexer", "index_max_size"))
    if indexertype == "whoosh":
        schema = Schema(title=TEXT(stored=True),
                        path=ID(stored=True, unique=True),
                        content=TEXT)
        if not os.path.exists(baseindexpath):
            os.mkdir(baseindexpath)

        # create the index register if not present
        time_now = int(time.time())
        if not os.path.isfile(indexRegister_path):  # indexes are not organised
            print("Indexes are not organized")
            print("moving all files in folder 'old_index' ")
            #move all files to old_index folder
            move_index_into_old_index_folder(baseindexpath)
            print("Creating new index")
            #create all_index.txt
            with open(indexRegister_path, 'w') as f:
                f.write(str(time_now))
Code example #19
import flask
from flask import request, jsonify

from whoosh import index, scoring
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.qparser import QueryParser
from whoosh.analysis import SimpleAnalyzer, CharsetFilter, KeywordAnalyzer
#from whoosh.support.charset import accent_map

# ===== GLOBAL VARIABLES ========

file_dir = "files"
index_dir = "index"
response = []
schema_name = "search"
schema = Schema(path=ID(unique=True, stored=True),
                time=STORED,
                content=TEXT(analyzer=KeywordAnalyzer()))

#my_analyzer = SimpleAnalyzer() | CharsetFilter(accent_map)
#schema_fuzzy = Schema(path=ID(unique=True, stored=True), time=STORED, content=TEXT(analyzer=my_analyzer))

# ========= REST API =============
app = flask.Flask(__name__)
app.config["DEBUG"] = True


# Home
@app.route('/', methods=['GET'])
def home():
    return "<h1>Home</h1><p>Try to pass search terms with: http://127.0.0.1:5000/search?q=comma,separated,search,terms</p>"
Code example #20
import json
import os

from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.fields import Schema

# Create the schema; stored=True means the value can be returned with search results
schema = Schema(link=ID(stored=True),      # link to the file
                filetype=ID(stored=True))  # file type


def getFiles(path):
    file = open(path, 'r')
    content = file.read()
    files = json.JSONDecoder(strict=False).decode(content)
    return files


def createIndex():
    global ix
    if not os.path.exists("index_for_imag"):
        os.mkdir("index_for_imag")
        ix = create_in("index_for_imag", schema)
    else:
        ix = open_dir('index_for_imag')
    writer = ix.writer()
    # 电光学院 (school) images
    paths = ['/Users/lww/PycharmProjects/WebSearchingSystem/Spider/history_images.json']
    for path in paths:
Code example #21
def get_schema():
    return Schema(remitente=TEXT(stored=True),
                  destinatarios=KEYWORD(stored=True),
                  asunto=TEXT(stored=True),
                  contenido=TEXT(stored=True))
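A quick illustrative sketch (not in the original) of this schema in use, querying the space-separated destinatarios KEYWORD field for a single recipient:

from whoosh.filedb.filestore import RamStorage
from whoosh.query import Term

# Sketch only: in-memory index built from the schema above.
ix = RamStorage().create_index(get_schema())
with ix.writer() as writer:
    writer.add_document(remitente=u"ana",
                        destinatarios=u"luis maria",
                        asunto=u"Reunion del lunes",
                        contenido=u"Nos vemos a las diez")

with ix.searcher() as searcher:
    print([hit["asunto"] for hit in searcher.search(Term("destinatarios", u"luis"))])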
Code example #22
from whoosh.qparser import MultifieldParser
from whoosh.query import And, Every, Term

from galaxy import exceptions
from galaxy.exceptions import ObjectNotFound
from galaxy.util.search import parse_filters

log = logging.getLogger(__name__)

schema = Schema(
    id=NUMERIC(stored=True),
    name=TEXT(field_boost=1.7, stored=True),
    description=TEXT(field_boost=1.5, stored=True),
    long_description=TEXT(stored=True),
    homepage_url=TEXT(stored=True),
    remote_repository_url=TEXT(stored=True),
    repo_owner_username=TEXT(stored=True),
    categories=KEYWORD(stored=True, commas=True, scorable=True),
    times_downloaded=STORED,
    approved=STORED,
    last_updated=STORED,
    repo_lineage=STORED,
    full_last_updated=STORED)


class RepoWeighting(scoring.BM25F):
    """
    Affect the BM25F scoring model through the final method.
    source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ
    """
    use_final = True
Code example #23
class Index:
    schema = Schema(content=TEXT(stored=True, analyzer=Analyzer()),
                    user=ID(stored=True),
                    mentionsUsers=KEYWORD(stored=True),
                    mentionsRoles=KEYWORD(stored=True),
                    time=DATETIME)

    def __init__(self,
                 dir,
                 authorIds={},
                 context=None,
                 start=True,
                 baseDir=None):
        if not os.path.isdir(dir):
            os.mkdir(dir)

        if not list(os.listdir(dir)):
            self.ix = create_in(dir, Index.schema)

        if not baseDir:
            baseDir = os.path.join(os.path.split(dir)[0])

        self.ix = whoosh.index.open_dir(dir)
        self.searchers = []
        self.failedDir = os.path.join(baseDir, "failed")
        utils.ensureDir(self.failedDir)
        self.incomingDir = os.path.join(baseDir, "incoming")
        utils.ensureDir(self.incomingDir)
        self.indexer = threading.Thread(target=Index.indexLoop, args=[self])
        self.logger = open(os.path.join(baseDir, "index.log"), "a")
        self.stopping = False
        if start:
            self.startIndexer()
        self.counts = {}

    def getCounts(self, uid):
        if uid in self.counts:
            return self.counts[uid]
        else:
            with self.getSearcher() as searcher:
                userNode = whoosh.query.Term("user",
                                             uid)  # userId in the user field

                results = searcher.search(userNode)
                self.counts[uid] = len(results)
                return len(results)

    def getLast(self, uid, number):
        with self.getSearcher() as searcher:
            userNode = whoosh.query.Term("user",
                                         uid)  # userId in the user field

            results = searcher.search(
                userNode,
                #sortedby="time",
                limit=number)
            return deduper(results, dedupe=True)

    def startIndexer(self):
        self.indexer.start()
        self.stopping = False

    def __del__(self):
        self.stopping = True
        if self.indexer.is_alive():
            self.indexer.join()

    def log(self, text):
        print(text)
        self.logger.write(text)
        self.logger.write("\n")
        self.logger.flush()

    def indexLoop(self):
        print("Beginning index loop")
        writer = self.ix.writer()
        while not self.stopping:
            path = None
            try:
                for file in os.listdir(self.incomingDir):
                    self.log("indexing {0}\n".format(file))
                    path = os.path.join(self.incomingDir, file)
                    with open(path, "r", encoding="utf-8") as f:
                        for line in f:
                            doc = json.loads(line)
                            ts = doc["timestamp"]
                            t = datetime.datetime.fromtimestamp(ts)
                            userid = "{0}".format(doc["user"])
                            writer.add_document(
                                content=doc["content"].strip(),
                                user=userid,
                                mentionsUsers=",".join(doc["mentions"]),
                                mentionsRoles=",".join(doc["role_mentions"]),
                                time=t)
                    writer.commit()
                    self.log("committed {0}\n".format(file))
                    writer = self.ix.writer()
                    for i in range(0, 5):
                        try:
                            os.remove(path)
                            break
                        except:
                            pass
                    self.searchers = []
                    self.getSearcher()
                else:
                    time.sleep(10)

            except Exception as e:
                print(str(e))
                self.log(str(e))
                try:
                    if path:
                        if not os.path.isdir(self.failedDir):
                            os.mkdir(self.failedDir)
                        shutil.move(path, os.path.join(self.failedDir, file))
                except Exception as ee:
                    print(str(ee))
                    self.log(str(ee))
                    raise

    class ScopedSearcher:
        def __init__(self, parent, **kwargs):
            self.parent = parent
            self.handle = None
            self.args = kwargs
            self.time = time.time()

        def __enter__(self):
            try:
                self.handle = self.parent.searchers.pop()
                if (time.time() - self.handle.time) > 60:
                    self.handle = self.handle.refresh()
                    setattr(self.handle, 'time', time.time())
                    self.parent.log("Refreshed handle")
                return self.handle
            except:
                self.handle = self.parent.ix.searcher(**self.args)
                setattr(self.handle, 'time', time.time())
                return self.handle

        def __exit__(self, a, b, c):
            self.parent.searchers.append(self.handle)
            self.handle = None

    def getSearcher(self, **kwargs):
        return Index.ScopedSearcher(self, **kwargs)

    def queryStats(self, text, expand=False, timer=NoTimer()):
        """
            Returns a sorted tuple of (count, userName)
        """
        with timer.sub_timer("query-stats") as t:
            with self.getSearcher() as searcher:
                from whoosh.qparser import QueryParser
                if expand:
                    qp = QueryParser("content",
                                     schema=self.ix.schema,
                                     termclass=whoosh.query.Variations)
                else:
                    qp = QueryParser("content", schema=self.ix.schema)
                q = qp.parse(text)

                with t.sub_timer("searcher.search") as s:
                    results = searcher.search(q, limit=100000)

                with t.sub_timer("results") as s:
                    counts = defaultdict(lambda: 0)

                    with s.sub_timer("counts") as r:
                        for r in results:
                            u = r["user"]
                            counts[u] += 1

                    with s.sub_timer("reverse") as r:
                        counts = [(count, id) for id, count in counts.items()
                                  if count > 0]
                        sc = reversed(sorted(counts))
                        return [v for v in sc]

    def deDupeResults(self, text, ret):
        exists = set([text.lower()])
        i = len(ret) - 1
        while i >= 0:
            r = ret[i]
            if not r[1].lower() in exists:
                exists.add(r[1].lower())
            else:
                del ret[i]
            i = i - 1
        return ret

    def queryLong(self, text, max=3, user=None, expand=False, timer=NoTimer()):
        with timer.sub_timer("query-long") as t:
            for attempt in range(0, 3):
                with t.sub_timer(attempt) as s:
                    results = self.query(text,
                                         max * (2 + attempt),
                                         user,
                                         expand=(expand or (attempt > 0)),
                                         timer=t,
                                         dedupe=True)
                    ret = list(results)

                    if len(ret) >= max:
                        ret = ret[:max]
                        break
            return ret

    def queryUserOrI(self,
                     text,
                     max=3,
                     userId=None,
                     userName=None,
                     expand=False,
                     dedupe=False):
        with self.getSearcher(weighting=whoosh.scoring.TF_IDF) as searcher:
            from whoosh.qparser import QueryParser
            qp = QueryParser("content", schema=self.ix.schema)
            i_node = qp.parse("I")
            i_node.fieldname = "content"  # "I" in content

            userNode = whoosh.query.Term("user",
                                         userId)  # userId in the user field

            user_i_node = whoosh.query.And([userNode])  #, i_node])

            userTextNode = qp.parse(userName)
            userTextNode.fieldname = "content"

            subjectNode = whoosh.query.Or([userTextNode, user_i_node])

            qp2 = QueryParser("content",
                              schema=self.ix.schema,
                              termclass=whoosh.query.Variations)
            textNode = qp2.parse(text)
            textNode.fieldname = "content"

            q = whoosh.query.And([textNode, subjectNode])

            results = searcher.search(q, limit=max)

            return Results(results)

    def query(self,
              text,
              max=3,
              user=None,
              expand=False,
              userNames=[],
              dedupe=False,
              timer=Timer("index.query")):
        """ text: the main text query of the content. expand=bool applies to this. 
                  if user or userNames are supplied, text is restricted to content (else no field res)
            user: id of a user to restrict to
            userNames: ORed with 'user', but a text search in content :/
                  """
        with timer.sub_timer("query") as ot:
            with self.getSearcher(weighting=whoosh.scoring.TF_IDF) as searcher:
                with ot.sub_timer("inner-q") as t:
                    with t.sub_timer("query-parse") as s:
                        if expand:
                            qp = QueryParser("content",
                                             schema=self.ix.schema,
                                             termclass=whoosh.query.Variations)
                        else:
                            qp = QueryParser("content", schema=self.ix.schema)
                        nonExpandQP = QueryParser("content",
                                                  schema=self.ix.schema)

                        userNodes = []
                        # Massive spaghetti here
                        textNode = qp.parse(text)
                        textNode.fieldname = "content"
                        if user:
                            userNode = whoosh.query.Term("user", user)
                            userNodes.append(userNode)
                        if userNames:
                            q2 = nonExpandQP.parse(" OR ".join(userNames))
                            q2.field = "content"
                            userNodes.append(q2)
                        q = textNode
                        if userNodes:
                            u = whoosh.query.Or(userNodes)
                            q = whoosh.query.And([q, u])

                    with t.sub_timer("searcher.search") as s:
                        results = searcher.search(q, limit=max)

                    return deduper(results, dedupe=dedupe)

    async def collect_terms(self,
                            t,
                            usernames,
                            corpusThresh,
                            freq,
                            minScore,
                            corpusSize,
                            filters={},
                            timer=NoTimer()):
        with timer.sub_timer("collect_terms outer") as t_:
            with self.getSearcher() as s:
                ret = []
                q = whoosh.query.Term("content", t)
                for u in usernames:
                    uq = filters.get(u, whoosh.query.Term("user", u))
                    with t_.sub_timer("search") as tt_:
                        res = s.search(q, limit=100000000, filter=uq)

                    with t_.sub_timer("length") as tt_:
                        occs = res.scored_length()

                    with t_.sub_timer("counting") as tt_:
                        if occs and occs > corpusThresh * freq:

                            score = (occs / self.getCounts(u)) / (freq /
                                                                  corpusSize)
                            if score > 0:
                                score = math.log(score) * 10
                                if score > minScore:
                                    ret.append((u, t, score))
                return ret

    async def terms_async(self,
                          usernames,
                          corpusThresh=0.6,
                          corpusNorm=False,
                          minScore=450,
                          timer=NoTimer()):
        ret = []
        with timer.sub_timer("getCounts") as t:
            totalCounts = {u: self.getCounts(u) for u in usernames}
        num = re.compile(r"^\d+$")

        with timer.sub_timer("getReader") as t:
            reader = self.ix.reader()

        with timer.sub_timer("numDocs") as t:
            numDocs = reader.doc_count()

        with timer.sub_timer("initFilters") as t:
            with self.getSearcher() as s:
                filters = {
                    u: s.search(whoosh.query.Term("user", u))
                    for u in usernames
                }

        with timer.sub_timer("termLoop") as t_:
            for t in reader.field_terms("content"):
                if num.match(t):
                    continue
                if len(t) < 3:
                    continue
                with t_.sub_timer("termFreq") as t__:
                    freq = reader.frequency("content", t)

                if freq > 50 and freq < numDocs / 100:
                    try:
                        ret += await self.collect_terms(t,
                                                        usernames,
                                                        corpusThresh,
                                                        freq,
                                                        minScore,
                                                        numDocs,
                                                        filters=filters,
                                                        timer=t_)
                    except Exception as e:
                        print(
                            "Error while iterating through terms: {0}".format(
                                e))
        return ret

    def terms(self,
              usernames,
              corpusThresh=0.6,
              corpusNorm=False,
              minScore=450):
        ret = []
        totalCounts = {u: self.getCounts(u) for u in usernames}
        num = re.compile(r"^\d+$")
        reader = self.ix.reader()
        numDocs = reader.doc_count()

        for t in reader.field_terms("content"):
            if num.match(t):
                continue
            if len(t) < 3:
                continue
            freq = reader.doc_frequency("content", t)
            if freq > 50 and freq < numDocs / 100:
                #  print("{0}: {1}".format(t, freq))
                with self.getSearcher() as s:
                    q = whoosh.query.Term("content", t)
                    uq = whoosh.query.Or(
                        [whoosh.query.Term("user", u) for u in usernames])
                    qry = whoosh.query.And([q, uq])
                    res = s.search(qry, groupedby="user")
                    res.estimated_length()
                    d = res.groups("user")
                    for u, ary in d.items():
                        if len(ary) > corpusThresh * freq:
                            score = 1000000 * len(ary) / totalCounts[u] / freq
                            if score > minScore:
                                ret.append((u, t, score))
                                break
        return ret

    def countUserMentionsOthers(self, uid):
        with self.getSearcher() as s:
            uq = whoosh.query.Term("mentionsUsers", "*")

            qp = QueryParser("mentionsUsers", schema=self.ix.schema)

            tq = qp.parse("*")

            uq = whoosh.query.Term("user", uid)

            q = whoosh.query.And([uq, tq])
            res = s.search(q, limit=10000000)
            return res

    def getMentionGraph(self, coreUsers: list):
        qp = QueryParser("mentionsUsers", schema=self.ix.schema)
        tq = qp.parse("*")  # something in mentionUsers
        ret = defaultdict(lambda: defaultdict(int))
        with self.getSearcher() as s:
            for uid in coreUsers:
                uq = whoosh.query.Term("user", uid)  # limit to uid

                q = whoosh.query.And([uq, tq])
                res = s.search(q, limit=10000000)
                thisUserContrib = ret[uid]
                for r in res:
                    mentions = r["mentionsUsers"]
                    mentions = mentions.split(",")
                    for m in mentions:
                        thisUserContrib[m] += 1
        return ret

    def whoMentions(self, target: str, names: set):
        if type(names) != set:
            names = set(names)
        with self.getSearcher() as s:
            q = whoosh.query.Or(
                [whoosh.query.Term("content", n) for n in names])
            uq = whoosh.query.Term("mentionsUsers", target)
            qry = whoosh.query.Or([q, uq])
            res = s.search(qry, limit=10000000)

            counts = defaultdict(int)
            for r in res:
                counts[r["user"]] += 1

            return counts

    def getTimes(self, userId):
        import whoosh.sorting
        from datetime import datetime, timedelta

        uq = whoosh.query.Term("user", userId)

        end = datetime.utcnow()
        end = datetime(end.year, end.month, end.day)
        gap = timedelta(hours=6)
        start = end - 180 * gap

        facet = whoosh.sorting.DateRangeFacet("time", start, end, gap)
        with self.getSearcher() as s:
            r = s.search(uq, groupedby=facet)

        g = r.groups()

        return g
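A brief usage sketch for the Index class above (illustrative only; the directory and user id are made up, and the module's own helpers such as utils, deduper, Analyzer and the timers are assumed to be importable):

# Sketch only: open/create the index without starting the background indexer.
idx = Index("message_index", start=False)
hits = idx.query("hello world", max=5, user="1234567890", dedupe=True)
for hit in hits:
    print(hit)
print(idx.getCounts("1234567890"))  # number of indexed messages for that user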
Code example #24
# Note: Has to be run from Code folder

# Decrypt files
import decrypt  # imported by module name: Python imports use the module name, not the "decrypt.py" filename

# Create/open index
import os.path
from whoosh.fields import Schema, TEXT, ID, KEYWORD
from whoosh import index

schema = Schema(content=TEXT, path=ID(stored=True), tags=KEYWORD(scorable=True))

#print(os.getcwd())
homeDir = os.path.realpath(__file__).replace('Code/search.py','')
if not os.path.exists('Index'):
    os.mkdir('Index')
    ix = index.create_in("Index", schema)
else:
    ix = index.open_dir("Index")


# Add new files to index
import json
database = os.path.realpath(__file__).replace('Code/search.py','database.json')
writer = ix.writer()

with open(database,'r') as f:
    data = json.load(f)
    for item in data:
        if item['indexed'] == False:
        
Code example #25
from galaxy.exceptions import ObjectNotFound
import logging
log = logging.getLogger(__name__)

eggs.require("Whoosh")
import whoosh.index
from whoosh import scoring
from whoosh.fields import Schema, STORED, TEXT
from whoosh.qparser import MultifieldParser

schema = Schema(id=STORED,
                name=TEXT(field_boost=1.7, stored=True),
                description=TEXT(field_boost=1.5, stored=True),
                long_description=TEXT(stored=True),
                homepage_url=TEXT(stored=True),
                remote_repository_url=TEXT(stored=True),
                repo_owner_username=TEXT(stored=True),
                times_downloaded=STORED,
                approved=STORED,
                last_updated=STORED,
                full_last_updated=STORED)


class RepoWeighting(scoring.BM25F):
    """
    Affect the BM25F scoring model through the final method.
    source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ
    """
    use_final = True

    def final(self, searcher, docnum, score):
Code example #26
from galaxy.webapps.tool_shed import config
from galaxy.webapps.tool_shed import model
from galaxy.tools.loader_directory import load_tool_elements_from_path

from galaxy import eggs
eggs.require( "SQLAlchemy" )
eggs.require( "Whoosh" )
from whoosh.filedb.filestore import FileStorage
from whoosh.fields import Schema, STORED, TEXT

repo_schema = Schema(
    id=STORED,
    name=TEXT( stored=True ),
    description=TEXT( stored=True ),
    long_description=TEXT( stored=True ),
    homepage_url=TEXT( stored=True ),
    remote_repository_url=TEXT( stored=True ),
    repo_owner_username=TEXT( stored=True ),
    times_downloaded=STORED,
    approved=STORED,
    last_updated=STORED,
    full_last_updated=STORED )

tool_schema = Schema(
    name=TEXT( stored=True ),
    description=TEXT( stored=True ),
    owner=TEXT( stored=True ),
    id=TEXT( stored=True ),
    help=TEXT( stored=True ),
    version=TEXT( stored=True),
    repo_owner_username=TEXT( stored=True ),
    repo_id=STORED )
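A short sketch (an assumption, not from the Galaxy source) of turning the two schemas above into on-disk indexes with the imported FileStorage:

import os

# Sketch only: the directory name is made up for illustration.
index_dir = "whoosh_index"
if not os.path.exists(index_dir):
    os.makedirs(index_dir)
storage = FileStorage(index_dir)
repo_index = storage.create_index(repo_schema, indexname="repos")
tool_index = storage.create_index(tool_schema, indexname="tools")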
Code example #27
import os, os.path
import re
import pickle
from whoosh.analysis import StemmingAnalyzer
from whoosh import index
from whoosh.fields import Schema, ID, TEXT, STORED, IDLIST


if not os.path.exists('indexdir'):
    os.mkdir("indexdir")

schema = Schema(name=TEXT(stored=True),
                award_list=TEXT(stored=True),
                track_list=TEXT(stored=True),
                wikilink=TEXT(stored=True))

ix = index.create_in("indexdir", schema)
writer = ix.writer()


#loading dictionaries
DICT_artist_awards = open("FINAL_artist_awards_dict.pkl", "rb")
artist_awards = pickle.load(DICT_artist_awards)
DICT_artist_awards.close()

DICT_artist_tracks = open("FINAL_artist_tracks_dict.pkl", "rb")
artist_tracks = pickle.load(DICT_artist_tracks)
DICT_artist_tracks.close()

DICT_wikipage = open("FINAL_wikipage_dict.pkl", "rb")
wikipage_links = pickle.load(DICT_wikipage)
DICT_wikipage.close()


print('LEN - awards > ', len(artist_awards))
Code example #28
from solvertools.wordlist import WORDS
from solvertools.normalize import slugify, sanitize
from solvertools.util import data_path, corpus_path
from whoosh.fields import Schema, ID, TEXT, KEYWORD, NUMERIC
from whoosh.analysis import StandardAnalyzer
from whoosh.index import create_in
import nltk
import os
from tqdm import tqdm

schema = Schema(slug=ID,
                text=TEXT(stored=True, analyzer=StandardAnalyzer()),
                definition=TEXT(stored=True, analyzer=StandardAnalyzer()),
                length=NUMERIC)


def init_search_index():
    nltk.download('wordnet')
    from nltk.corpus import wordnet
    get_synset = wordnet._synset_from_pos_and_offset

    def get_adjacent(synset):
        return [
            name for pointer_tuples in synset._pointers.values()
            for pos, offset in pointer_tuples
            for name in get_synset(pos, offset).lemma_names()
        ]

    os.makedirs(data_path('search'), exist_ok=True)
    ix = create_in(data_path('search'), schema)
    writer = ix.writer(procs=4)
Code example #29
import cgi

from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app

from whoosh import store
from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
from whoosh.index import getdatastoreindex
from whoosh.qparser import QueryParser, MultifieldParser
import logging

SEARCHSCHEMA = Schema(content=TEXT(stored=True))


class MainPage(webapp.RequestHandler):
    def get(self):
        self.response.out.write('<html><body>')
        self.response.out.write("""
          <form action="/search" method="get">
            <div><input name="query" type="text" value=""><input type="submit" value="Search"></div>
          </form>
        </body>
      </html>""")

        # Write the submission form and the footer of the page
        self.response.out.write("""
          <form action="/sign" method="post">
            <div><textarea name="content" rows="3" cols="60"></textarea></div>
            <div><input type="submit" value="Sign Guestbook"></div>
          </form>
        </body>
Code example #30
File: ir.py  Project: nicysneiros/graph-dblp
    'note': TEXT(stored=True),
    'cdrom': TEXT(stored=True),
    'cite': TEXT(stored=True),
    'pages': TEXT(stored=True),
    'volume': TEXT(stored=True),
    'number': TEXT(stored=True),
    'journal': TEXT(stored=True),
    'publisher': TEXT(stored=True),
    'booktitle': TEXT(stored=True),
    'isbn': TEXT(stored=True),
    'series': TEXT(stored=True),
    'school': TEXT(stored=True),
    'type': TEXT(stored=True)
}

schema = Schema(**fields)
indexdir = tempfile.mkdtemp()
ix = create_in(indexdir, schema)
writer = ix.writer()


def add_document(doc):
    # NOTE: this early return makes the code below unreachable
    # (document indexing is effectively a no-op as written).
    return
    attrs = {}
    for attrname in fields.keys():
        if hasattr(doc, attrname) and doc.__getattribute__(attrname):
            attrs[attrname] = doc.__getattribute__(attrname)
    writer.add_document(**attrs)


def commit():