Example #1
import re, json
import os
from config import db_path, indexdir_path
from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT, NUMERIC, NGRAMWORDS
from whoosh.index import create_in
from jieba.analyse import ChineseAnalyzer
analyzer = ChineseAnalyzer()

if not os.path.exists(indexdir_path):
    os.mkdir(indexdir_path)

schema = Schema(artist_name=TEXT(stored=True, analyzer=analyzer),
                music_name=TEXT(stored=True, analyzer=analyzer),
                album_name=TEXT(stored=True, analyzer=analyzer),
                lyrics=TEXT(stored=True, analyzer=analyzer),
                comment_num=NUMERIC(stored=True, sortable=True),
                listen_num=NUMERIC(stored=True, sortable=True))

ix = create_in(indexdir_path, schema)
writer = ix.writer()

with open('All_Songs.json', 'r') as songs:
    song_list = songs.readlines()
    index = 0
    for i in song_list:
        try:
            if not 'TaiheHot' in i:
                continue
            b = json.loads(i)

            SongName = b['SongName']
Example #2
    repository=TEXT(stored=True, analyzer=ICASEIDANALYZER),
    path=TEXT(stored=True, analyzer=PATHANALYZER),
    content=FieldType(format=Characters(),
                      analyzer=ANALYZER,
                      scorable=True,
                      stored=True),
    modtime=STORED(),
    extension=TEXT(stored=True, analyzer=PATHANALYZER))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(analyzer=EMAILADDRANALYZER),
    # this field preserves case of repository name for exact matching
    # and unique-ness in index table
    repository_rawname=ID(unique=True),
    repository=ID(stored=True, analyzer=ICASEIDANALYZER),
    author=TEXT(stored=True, analyzer=EMAILADDRANALYZER),
    message=FieldType(format=Characters(),
                      analyzer=ANALYZER,
                      scorable=True,
                      stored=True),
    parents=TEXT(),
    added=TEXT(analyzer=PATHANALYZER),
    removed=TEXT(analyzer=PATHANALYZER),
    changed=TEXT(analyzer=PATHANALYZER),
Example #3
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        initial_key_count = len(schema_fields)
        content_field_name = ''
        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)

            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)
            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )
        return (content_field_name, Schema(**schema_fields))
Example #4
from whoosh.analysis import StemmingAnalyzer
from whoosh import index, qparser
import json

from paths import here_path, merged_dir_path, top_dir
from movies import movies, WhichMovie, name_dict, movie_dict
from load_files import yarn_file_paths, parsed_scripts_file_paths, fandom_links_file_path

character_links = json.load(fandom_links_file_path.open('r', encoding='UTF-8'))
index_dir_path = here_path / "indexdir"

if not index_dir_path.exists():
	index_dir_path.mkdir()

schema = Schema(
	movie=NUMERIC(stored=True),
	character=NUMERIC(stored=True),
	quote=KEYWORD(stored=True),
)

ix = index.create_in(index_dir_path, schema)
writer = ix.writer()

print("Building index")

for script_file in parsed_scripts_file_paths:
	print(f'Building index for file "{script_file.relative_to(top_dir)}"')
	movie: WhichMovie = name_dict[script_file.stem]

	script_data = json.load(script_file.open('r', encoding="UTF-8"))
	print(f"Indexing ({movie}): ", end="")
Example #5
"""Module for searching the toolshed repositories"""
import logging

import whoosh.index
from whoosh import scoring
from whoosh.fields import KEYWORD, NUMERIC, Schema, STORED, TEXT
from whoosh.qparser import MultifieldParser
from whoosh.query import And, Every, Term

from galaxy import exceptions
from galaxy.exceptions import ObjectNotFound
from galaxy.util.search import parse_filters

log = logging.getLogger(__name__)

schema = Schema(id=NUMERIC(stored=True),
                name=TEXT(field_boost=1.7, stored=True),
                description=TEXT(field_boost=1.5, stored=True),
                long_description=TEXT(stored=True),
                homepage_url=TEXT(stored=True),
                remote_repository_url=TEXT(stored=True),
                repo_owner_username=TEXT(stored=True),
                categories=KEYWORD(stored=True, commas=True, scorable=True),
                times_downloaded=STORED,
                approved=STORED,
                last_updated=STORED,
                repo_lineage=STORED,
                full_last_updated=STORED)
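
# Not part of the Galaxy source: a minimal sketch of how this boosted schema might be
# searched with the MultifieldParser imported above; index_dir and text are hypothetical.
def _example_search(index_dir, text):
    ix = whoosh.index.open_dir(index_dir)
    parser = MultifieldParser(["name", "description", "long_description"], schema=schema)
    with ix.searcher() as searcher:
        return [hit["name"] for hit in searcher.search(parser.parse(text), limit=10)]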


class RepoWeighting(scoring.BM25F):
Example #6
def media_rebuild():
    print datetime.datetime.now()
    print 'media_rebuild'
    media_db = mysql_new.BaseDB(config.MYSQL_DEFINE_MEDIA)
    schema = Schema(movieid=ID(stored=True, unique=True),
                    title=TEXT(stored=True,
                               analyzer=analyzer_zhongwen,
                               field_boost=2.0),
                    pinyin_title=TEXT(stored=True,
                                      analyzer=analyzer_pinyin,
                                      field_boost=2.0),
                    director=KEYWORD(stored=True),
                    year=NUMERIC(stored=True, sortable=True),
                    score=NUMERIC(stored=True, sortable=True),
                    area=KEYWORD(stored=True),
                    description=TEXT(stored=True, field_boost=1.5),
                    pinyin_description=TEXT(stored=True, field_boost=1.0),
                    actor=KEYWORD(stored=True, field_boost=1.0),
                    pinyin_actor=TEXT(stored=True, field_boost=1.0),
                    genres=KEYWORD(stored=True, field_boost=1.0),
                    pinyin_genres=TEXT(stored=True, field_boost=1.0),
                    type=NUMERIC(stored=True),
                    source=NUMERIC(stored=True))
    SQL = '''SELECT `movieid`, `title`, `type`, `actor`, `genres`, `director`, `douban_score`, `introduction` as description, `year` FROM `media_info` WHERE `status`=1 AND type in ('movie', 'tv', 'teleplay', 'anime')
          '''
    res = media_db.query(SQL, ())
    if not res:
        return
    for info in res:
        if info.get('type') == 'movie':
            info['type'] = 1
        elif info.get('type') == 'teleplay':
            info['type'] = 2
        elif info.get('type') == 'tv':
            info['type'] = 3
        elif info.get('type') == 'anime':
            info['type'] = 4
        else:
            continue
    index_path = os.path.join(config.index_root_dir, 'media')
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    #ix = create_in(index_path, schema=schema)
    storage = FileStorage(index_path)
    ix = storage.open_index()
    writer = ix.writer()
    for info in res:
        pinyin_title = ' '.join(lazy_pinyin(info.get('title').decode('utf8')))
        pinyin_description = ' '.join(
            lazy_pinyin(info.get('description').decode('utf8')))
        pinyin_actor = ''.join(info.get('actor', '').strip().split('/'))
        pinyin_actor = ' '.join(lazy_pinyin(pinyin_actor.decode('utf8')))
        pinyin_genres = ''.join(info.get('genres', '').strip().split('/'))
        pinyin_genres = ' '.join(lazy_pinyin(pinyin_genres.decode('utf8')))
        actor = ';'.join(info.get('actor', '').strip().split('/'))
        area = ';'.join(info.get('area', '').strip().split('/'))
        director = ';'.join(info.get('director', '').strip().split('/'))
        genres = ';'.join(info.get('genres', '').strip().split('/'))

        writer.add_document(movieid=info.get('movieid').decode('utf8'),
                            title=info.get('title').decode('utf8'),
                            pinyin_title=pinyin_title,
                            type=info.get('type'),
                            actor=actor.decode('utf8'),
                            pinyin_actor=pinyin_actor,
                            genres=genres.decode('utf8'),
                            pinyin_genres=pinyin_genres,
                            director=director.decode('utf8'),
                            score=info.get('douban_score'),
                            description=info.get('description').decode('utf8'),
                            pinyin_description=pinyin_description,
                            area=area.decode('utf8'),
                            year=info.get('year'))
    writer.commit(mergetype=writing.CLEAR)
Example #7
from ngrams import segment
from datetime import datetime
from urllib.parse import urlparse, urljoin
from lxml.html import document_fromstring
from lxml.html.clean import Cleaner

from whoosh import index
from whoosh.fields import Schema, TEXT, ID, NUMERIC, KEYWORD, NGRAMWORDS
from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField

schema = Schema(title=TEXT(stored=True),
                url=ID(stored=True, unique=True),
                desc=ID(stored=True),
                description=TEXT(stored=True),
                rank=NUMERIC(stored=True, numtype=float),
                raw=TEXT,
                content=TEXT,
                keywords=KEYWORD,
                internal_links=TEXT,
                external_links=TEXT,
                ngramwords=NGRAMWORDS)

_ix = None


class XTRExcetion:
    pass


def _is_etree(tree):
Example #8
# QUESTIONS TO DISCUSS (IN REPORT)
# 1. Do we want to keep duplicates or stick with unique values?
# KEEP DUPLICATES, BECAUSE THE EFFECT OF CHANGING FREQUENCIES IS UNPREDICTABLE WITH ESTABLISHED BM25.
# 2. Do we want to remove hyperlinks? YES: THEY WOULD MAKE ALREADY LONG DOCUMENTS EVEN LONGER AND ADD NO CONTENT TO THE TABLE (WE DO NOT WANT THAT). ENTITIES ARE ALREADY INCLUDED, AND THE ABBREVIATION CLUTTERS THE RESULT.
# 3. Do we want to lemmatize/stem? (seems logical to do) CURRENTLY TURNED OFF ON CLAUDIA'S SIDE.
# 4. Why do we get better NDCG results without cleaning?
# 5. Which or group to use (OR or AND)?

# Example cleaned field: 
# https://en.wikipedia.org/wiki/Charlotte_Hornets_all-time_roster
# Denotes players who are currently on the Bobcats roster Denotes players who are currently on the Bobcats roster Denotes players who are currently on the Bobcats roster Denotes players who are currently on the Bobcats roster Denotes players who are currently on the Bobcats roster No No Jersey number Jersey number Pos Position Position G Basketball positions Basketball positions F Basketball positions C Basketball positions Pts Point basketball Point basketball Reb Rebound basketball Ast Assist basketball

SCHEMA = Schema(
    id = TEXT(stored=True),
    headers = TEXT(stored=True),
    numCols = NUMERIC(stored=True),
    page_title = TEXT(stored=True),
    numDataRows = NUMERIC(stored=True),
    section_title = TEXT(stored=True),
    numHeaderRows = NUMERIC(stored=True),
    caption = TEXT(stored=True),
    body = TEXT(stored=True),
    titles = TEXT(stored=True),
    caption_and_headers = TEXT(stored=True),
    all_concatenated = TEXT(stored = True))
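
# Sketch (not part of the original script) illustrating question 5 above: the same
# query text parsed with the default AND group versus an OR group over the
# concatenated table text field.
from whoosh.qparser import QueryParser, OrGroup

def parse_both_ways(query_text):
    and_query = QueryParser("all_concatenated", schema=SCHEMA).parse(query_text)
    or_query = QueryParser("all_concatenated", schema=SCHEMA, group=OrGroup).parse(query_text)
    return and_query, or_query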

def to_fields(identifier, table):
    fields = {}

    # Text values.
    fields['headers'] = clean_list(table['title'])
Example #9
    def test_build_attrs(self):
        schema = Schema()
        adapter = SAAdapter(SANotIndexable, schema)
        assert not adapter.indexable
        assert adapter.doc_attrs == {}

        adapter = SAAdapter(Entity, schema)
        assert adapter.indexable == False

        adapter = SAAdapter(SubclassEntityIndexable, schema)
        assert adapter.indexable
        assert set(adapter.doc_attrs) == {
            'object_key',
            'id',
            'name',
            'slug',
            'object_type',
            'text',
            'created_at',
            'updated_at',
            'name_prefix',
            'owner',
            'owner_name',
            'creator_name',
            'creator',
            'allowed_roles_and_users',
            'tag_ids',
            'tag_text',
        }
        assert all(callable(f)
                   for f in adapter.doc_attrs.itervalues())

        assert set(schema.names()) == {
            'object_key',
            'id',
            'object_type',
            'name',
            'slug',
            'text',
            'created_at',
            'updated_at',
            'name_prefix',
            'owner',
            'owner_name',
            'creator_name',
            'creator',
            'allowed_roles_and_users',
            'tag_ids',
            'tag_text',
        }

        schema = Schema(id=NUMERIC(numtype=int,
                                   bits=64,
                                   signed=False,
                                   stored=True,
                                   unique=True), )
        adapter = SAAdapter(Indexable, schema)
        assert adapter.indexable
        assert set(adapter.doc_attrs) == {'id', 'text', 'num', 'name'}
        assert all(callable(f)
                   for f in adapter.doc_attrs.itervalues())

        assert set(schema.names()) == {'id', 'text', 'num', 'name'}
        assert isinstance(schema['text'], TEXT)
        assert isinstance(schema['num'], NUMERIC)
"""

import os
from pymongo import MongoClient
from bson.objectid import ObjectId
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, ID, NUMERIC, KEYWORD, TEXT
from whoosh.analysis import LanguageAnalyzer

# connect to MongoDB database
client = MongoClient('localhost:27017')
db = client.GregueriasData

es_ana = LanguageAnalyzer("es")  # Whoosh analyzer for Spanish

schema = Schema(id=NUMERIC(stored=True),
                text=TEXT(analyzer=es_ana, stored=True),
                wc=NUMERIC(stored=True),
                tags=KEYWORD(stored=True),
                x=NUMERIC(stored=True),
                y=NUMERIC(stored=True))

# Create index dir if it does not exist
if not os.path.exists("../whoosh_index"):
    os.mkdir("../whoosh_index")

index = create_in("../whoosh_index", schema)

# Fill index with posts from DB
posts = db.Greguerias.find()
writer = index.writer()
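
# Sketch of how the indexing loop might continue. The MongoDB document keys used
# below ("text", "tags", "x", "y") are assumptions, not taken from the original script.
for i, post in enumerate(posts):
    writer.add_document(id=i,
                        text=post.get("text", ""),
                        wc=len(post.get("text", "").split()),
                        tags=post.get("tags", ""),
                        x=post.get("x", 0),
                        y=post.get("y", 0))
writer.commit()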
Example #11
if createCategory:
    if (not os.path.isfile(thumbImage)):
        print 'could not locate category thumb image'
        sys.exit(0)
    shutil.move(thumbImage, os.path.join(os.getcwd(), 'static/categories/', (refName + os.path.splitext(thumbImage)[1])))
    thumbImage = os.path.join('static/categories/', (refName + os.path.splitext(thumbImage)[1]))
    print thumbImage

# get the search engine, creating it if not already created
dbDir = os.path.join('searchdb/', refName)
if not os.path.exists(dbDir):
    if not createCategory:
        print 'please specify category description and image for new categories'
        sys.exit(0)
    os.makedirs(dbDir)
    schema = Schema(quote=TEXT(stored=True), reference=NUMERIC(stored=True))
    ix = create_in(dbDir, schema)
else:
    ix = open_dir(dbDir)
writer = ix.writer()


# database connection
db = MySQLdb.connect('127.0.0.1', 'quoteinsert', 'password', 'gifmaker')
cursor = db.cursor(MySQLdb.cursors.DictCursor)


# insert the category and media into the database
categoryID, mediaID = addCategoryToDatabase(fullName, description, thumbImage, refName, mediaName, vidFile, subFile)
#lastQuoteReference, categoryReference = addCategoryToDatabase(refName, vidFile, fullName, thumbImage, description)
Example #12
import os, sys, logging

log = logging.getLogger()

from config import settings
from whoosh.fields import Schema, TEXT, ID, DATETIME, KEYWORD, NUMERIC
from whoosh.index import create_in

item_schema = Schema(
    id = NUMERIC(stored=True, unique=True),
    guid = TEXT(stored=True),
    title = TEXT(stored=True),
    text = TEXT(stored=True),
    when = DATETIME(stored=True),
    tags = KEYWORD(stored=True)
)

def setup_index():
    if not os.path.exists(settings.index):
        os.makedirs(settings.index)
        ix = create_in(settings.index, item_schema)       
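
# Hedged sketch (not in the original module): an open-or-create companion to
# setup_index(), which above only builds the index when the directory is missing.
def get_index():
    from whoosh.index import open_dir, exists_in
    if not os.path.exists(settings.index):
        os.makedirs(settings.index)
    if exists_in(settings.index):
        return open_dir(settings.index)
    return create_in(settings.index, item_schema)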
Example #13
                # case in which the type list does not contain publication and/or venue
                else:
                    # if it is an allowed type, add it to the list
                    for t in type:
                        if res.dic["type"] == t:
                            filtered_list.append(res)
        return filtered_list
    else:
        return results


if __name__ == "__main__":

    # ID treats the value as a single whole, ideal for URLs.
    # the fields that support phrasal queries are the ones with phrase=True
    schema = Schema(key=NUMERIC(stored=True), type=TEXT(stored=True), author=TEXT(stored=True, phrase=True),
                    title=TEXT(stored=True, phrase=True), year=TEXT(stored=True),
                    journal=TEXT(stored=True, phrase=True), ee=ID(stored=True), publisher=TEXT(stored=True))
    # limit on the resulting docs; the smaller it is, the faster the query is resolved
    resultLimiter = 60000
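
    # Sketch (not from the original script): a phrasal query against the "title" field,
    # capped by resultLimiter; "ix" is a hypothetical index built with this schema.
    # from whoosh.qparser import QueryParser
    # parser = QueryParser("title", schema=schema)
    # query = parser.parse('"information retrieval"')   # quoted text becomes a phrase query
    # with ix.searcher() as searcher:
    #     results = searcher.search(query, limit=resultLimiter)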

    # schema = Schema(key=NUMERIC(stored=True), type=TEXT(stored=True), author=TEXT(stored=True, phrase=True),
    #                 editor=TEXT(stored=True),
    #                 title=TEXT(stored=True, phrase=True),
    #                 booktitle=TEXT(stored=True, phrase=True), pages=TEXT(stored=True),
    #                 year=TEXT(stored=True), address=TEXT,
    #                 journal=TEXT(stored=True, phrase=True), volume=TEXT,
    #                 number=TEXT(stored=True), month=TEXT, url=ID(stored=True), ee=ID(stored=True),
    #                 cdrom=TEXT(stored=True), cite=TEXT,
    #                 publisher=TEXT(stored=True), note=TEXT(stored=True, analyzer=StemmingAnalyzer()),
    #                 crossref=TEXT(stored=True), isbn=TEXT, series=TEXT, school=TEXT,
Example #14
def populateDB():
    # variables to count the number of records we are going to store
    num_mods = 0
    num_etiquetas = 0

    # clear all the DB tables
    Mod.objects.all().delete()
    Etiqueta.objects.all().delete()

    # prepare the index schema
    schem = Schema(titulo=TEXT(stored=True),
                   descripcion=TEXT(stored=True),
                   fecha_actualizacion=DATETIME(stored=True),
                   etiquetas=TEXT(stored=True),
                   imagen=TEXT(stored=True),
                   fecha_publicacion=DATETIME(stored=True),
                   tamanyo=NUMERIC(stored=True),
                   puntuacion=NUMERIC(stored=True),
                   num_valoraciones=NUMERIC(stored=True),
                   suscriptores=NUMERIC(stored=True))
    if os.path.exists("Index"):
        shutil.rmtree("Index")
    os.mkdir("Index")
    ix = create_in("Index", schema=schem)

    # Name, Tags, Description, Publication date, Update date, Size, Score, NumberOfRatings, CreatorLink, Subscribers, Image

    f = urllib.request.urlopen(
        "https://steamcommunity.com/workshop/browse/?appid=440900&searchtext=&childpublishedfileid=0&browsesort=totaluniquesubscribers&section=readytouseitems&actualsort=totaluniquesubscribers&p=1"
    )
    s = BeautifulSoup(f, "lxml")

    num_pags = int(
        s.find("div",
               class_="workshopBrowsePagingControls").find_all("a")[2].text)
    for i in range(1, num_pags + 1):
        print("Página " + str(i))
        f = urllib.request.urlopen(
            "https://steamcommunity.com/workshop/browse/?appid=440900&searchtext=&childpublishedfileid=0&browsesort=totaluniquesubscribers&section=readytouseitems&actualsort=totaluniquesubscribers&p="
            + str(i))
        s = BeautifulSoup(f, "lxml")
        lista_mods = s.find("div", class_="responsive_page_content").find(
            "div",
            class_="workshopBrowseItems").find_all("div",
                                                   class_="workshopItem")
        lista_link_mods = []
        writer = ix.writer()
        for mod in lista_mods:
            link = mod.find("a")["href"]
            imagen = mod.find("img", class_="workshopItemPreviewImage")["src"]
            lista_link_mods.append((link, imagen))

        for link in lista_link_mods:

            print(link[0])
            # Possible workaround for 502 Bad Gateway errors and NoneType errors caused by the HTML not being fetched correctly
            # A default browser is emulated, in this case Mozilla version 5.0
            req = urllib.request.Request(link[0],
                                         headers={'User-Agent': 'Mozilla/5.0'})
            f = urllib.request.urlopen(req).read()
            #f = urllib.request.urlopen(link[0])
            s = BeautifulSoup(f, "lxml")
            # Name, Tags, Description, Publication date, Update date, Size, Score, NumberOfRatings, CreatorLink, Subscribers, Image

            titulo = s.find("div", class_="workshopItemTitle").text
            descripcion_soup = s.find("div", class_="workshopItemDescription")
            descripcion = str(descripcion_soup)
            #desc_index = descripcion_soup.text
            div_stats = s.find(
                "div", class_="detailsStatsContainerRight").find_all("div")
            tamanyo = float("".join(div_stats[0].stripped_strings).split(
                sep=" MB")[0].replace(",", ""))
            fecha_separada = "".join(
                div_stats[1].stripped_strings).split(sep=" ")
            if len(fecha_separada) == 4:
                fecha_publicacion = datetime.strptime(
                    fecha_separada[0] + " " + fecha_separada[1] + ", " +
                    str(datetime.today().year) + " @ " + fecha_separada[3],
                    "%d %b, %Y @ %I:%M%p")
            else:
                fecha_publicacion = datetime.strptime(
                    "".join(div_stats[1].stripped_strings),
                    "%d %b, %Y @ %I:%M%p")
            try:
                fecha_separada = "".join(
                    div_stats[2].stripped_strings).split(sep=" ")
                if len(fecha_separada) == 4:
                    fecha_actualizacion = datetime.strptime(
                        fecha_separada[0] + " " + fecha_separada[1] + ", " +
                        str(datetime.today().year) + " @ " + fecha_separada[3],
                        "%d %b, %Y @ %I:%M%p")
                else:
                    fecha_actualizacion = datetime.strptime(
                        "".join(div_stats[1].stripped_strings),
                        "%d %b, %Y @ %I:%M%p")
            except:
                fecha_actualizacion = None

            imagen_puntuacion = s.find(
                "div",
                class_="fileRatingDetails").img["src"].split(sep="large")
            if imagen_puntuacion[0].find('5') != -1:
                puntuacion = 5
            elif imagen_puntuacion[0].find('4') != -1:
                puntuacion = 4
            elif imagen_puntuacion[0].find('3') != -1:
                puntuacion = 3
            elif imagen_puntuacion[0].find('2') != -1:
                puntuacion = 2
            elif imagen_puntuacion[0].find('1') != -1:
                puntuacion = 1
            elif imagen_puntuacion[0].find('0') != -1:
                puntuacion = 0
            else:
                puntuacion = None

            try:
                num_valoraciones = int("".join(
                    s.find("div", class_="numRatings").stripped_strings).split(
                        sep=" ratings")[0].replace(",", ""))
            except:
                num_valoraciones = None
            link_creador = s.find(
                "div", class_="breadcrumbs").find_all("a")[2]["href"]

            imagen = link[1]

            num_suscriptores = int("".join(
                s.find("table", class_="stats_table").find_all("tr")
                [1].find_all("td")[0].stripped_strings).replace(",", ""))

            etiquetas_soup = s.find_all("div", class_="workshopTags")
            etiquetas = []
            for etiqueta in etiquetas_soup:
                aux = "".join(etiqueta.stripped_strings).split(sep=":")[0]
                etiquetas.append(aux)

            if len(etiquetas) >= 1:
                etiquetas_index = ", ".join(etiquetas)
            else:
                etiquetas_index = ""
            # store it in the DB

            lista_etiquetas_obj = []
            for etiqueta in etiquetas:
                etiqueta_obj, creado = Etiqueta.objects.get_or_create(
                    nombre=etiqueta)
                lista_etiquetas_obj.append(etiqueta_obj)
                if creado:
                    num_etiquetas = num_etiquetas + 1

            m = Mod.objects.create(titulo=titulo,
                                   descripcion=descripcion,
                                   fechaPublicacion=fecha_publicacion,
                                   fechaActualizacion=fecha_actualizacion,
                                   tamanyo=tamanyo,
                                   puntuacion=puntuacion,
                                   numeroValoraciones=num_valoraciones,
                                   linkCreador=link_creador,
                                   suscriptores=num_suscriptores,
                                   imagen=imagen)
            # add the list of tags
            for e in lista_etiquetas_obj:
                m.etiquetas.add(e)

            writer.add_document(titulo=titulo,
                                descripcion=descripcion,
                                fecha_actualizacion=fecha_actualizacion,
                                etiquetas=etiquetas_index,
                                imagen=imagen,
                                fecha_publicacion=fecha_publicacion,
                                tamanyo=tamanyo,
                                puntuacion=puntuacion,
                                num_valoraciones=num_valoraciones,
                                suscriptores=num_suscriptores)
            num_mods = num_mods + 1
        writer.commit()
    return ((num_mods, num_etiquetas))
Example #15
    def __init__(self):
        self.schema = Schema(id=NUMERIC(unique=True, stored=True),
                             name=STORED(),
                             name_tokenized=TEXT(stored=False, analyzer=WhooshConstants.tokenized_analyzer),
                             name_stemmed=TEXT(stored=False, analyzer=WhooshConstants.stem_analyzer),
                             name_normalized=TEXT(stored=False, analyzer=WhooshConstants.normalized_analyzer, field_boost=100.0))
Example #16
    def build_schema(self, fields):
        # Copied from https://github.com/django-haystack/django-haystack/blob/v2.8.1/haystack/backends/whoosh_backend.py
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ""

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = WHOOSH_ID(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ["date", "datetime"]:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == "integer":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == "float":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == "boolean":
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == "ngram":
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == "edge_ngram":
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at="start",
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=getattr(field_class, "analyzer",
                                     StemmingAnalyzer()),
                    field_boost=field_class.boost,
                    sortable=True,
                )
                schema_fields[
                    field_class.index_fieldname].field_name = field_name

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example #17
from os import makedirs

from re import compile as re_compile

from sys import stdout

from traceback import print_exc

from twisted.words.protocols.irc import CHANNEL_PREFIXES

from util import Mapping, argumentSplit, functionHelp, pastehelper
# TODO: perhaps put writer in a thread inside the new process while the batch-write is happening
#		so searches and log buffer can still be done while writing instead of filling up the interprocess
#		pipe/queue/socket/whatever while the writer is blocking

SCHEMA = Schema(id=NUMERIC(numtype=int, bits=64, stored=True, unique=True),
                timestamp=DATETIME(sortable=True, stored=True),
                nick=TEXT(stored=True),
                user=TEXT(stored=True),
                source=TEXT(stored=True),
                content=TEXT(stored=True))

OPTIONS = {
    "indexdir": (unicode, "Dir where log indexes are stored.", "logindex"),
}
REQUIRES = ("pbm_users", )
USERS_MODULE = None

SOURCE_REGEX = re_compile(r".*\bsource:.")
NICK_REGEX = re_compile(r".*\bnick:(.+)\b")
Example #18
def get_schema():
    return Schema(nome=TEXT(stored=True),
                  id=ID(stored=True),
                  lat=NUMERIC(stored=True),
                  lon=NUMERIC(stored=True))
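
# Short usage sketch (not part of the original snippet): build an index with this
# schema and run a range query on the numeric "lat" field. The directory name and
# sample document are hypothetical; note the NUMERIC fields default to numtype=int.
import os
from whoosh.index import create_in
from whoosh.query import NumericRange

def build_demo_index(index_dir="geo_index"):
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    ix = create_in(index_dir, get_schema())
    writer = ix.writer()
    writer.add_document(nome="Duomo", id="1", lat=43, lon=11)  # hypothetical record
    writer.commit()
    with ix.searcher() as searcher:
        return len(searcher.search(NumericRange("lat", 40, 45)))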
Example #19
import os.path
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, NUMERIC
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer

schema = Schema(docID=NUMERIC(stored=True), contents=TEXT)
index_dir = "index"

if not os.path.exists(index_dir):
    os.makedirs(index_dir)

ix = create_in(index_dir, schema)

writer = ix.writer()
stemmizer = LancasterStemmer()
stopWords = set(stopwords.words('english'))

with open('doc/document.txt', 'r') as f:
    text = f.read()
    docs = text.split('   /\n')[:-1]
    for doc in docs:
        br = doc.find('\n')
        docID = int(doc[:br])
        doc_text = doc[br + 1:]

        table = str.maketrans('\n?.,!', '     ')
        doc_text_nomark = doc_text.translate(table)

        new_doc_text = ''
        for word in doc_text_nomark.split(' '):
Example #20
import os, time, threading
from whoosh.fields import Schema, KEYWORD, NGRAMWORDS, NUMERIC, TEXT
from whoosh.index import create_in, open_dir
from whoosh.writing import AsyncWriter
from whoosh.qparser import QueryParser, MultifieldParser, FieldsPlugin, FieldAliasPlugin
#from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer, NgramFilter, NgramAnalyzer
#from whoosh.query import *

#https://whoosh.readthedocs.io/en/latest/quickstart.html
schema = Schema(id=NUMERIC(stored=True, unique=True, signed=False),
                category=TEXT,
                title=NGRAMWORDS(2, 20, True, 2.0),
                ingredients=KEYWORD,
                content=NGRAMWORDS(4, 20))

#TODO: Synonyme https://whoosh.readthedocs.io/en/latest/api/lang/wordnet.html
search_path = "search"
ALWAYS_REBUILD = False
min_search_length = 2

if not os.path.exists(search_path):
    os.mkdir(search_path)


def rebuild_index():
    index = create_in(search_path, schema)
    writer = index.writer()
    writer.add_document(id=0, title="Test Words", content="super nice")
    writer.add_document(id=1, title="Apple Banana Cucumber")
    writer.add_document(id=2, title="Deck Elevator Floor", category="test")
    writer.add_document(id=3, title="Pen Pineapple Apple Pen")
Example #21
class TargetSchema(SchemaClass):
    """Fultext index schema for target strings."""

    pk = NUMERIC(stored=True, unique=True)
    target = TEXT()
    comment = TEXT()
Example #22
class PercsSchema(SchemaClass):
    """ Percs index definition.

    A Whoosh index schema class.  Ultimately the main things
    we want captured:

        * The text content of each page, of each file.
          (eg. When we get a result, we want to know which
            page the result was found.)
        * The name of the person this file is about.
        * The collection in which this file is a member.

    Everything else is just metadata. ;)

        * file content type. Maybe we won't just store PDFs one day?
        * date file was added.
        * possibly the original source of the docs? eg. NSW gov.
    """
    id = ID(stored=True, unique=True)
    collection = ID(stored=True)

    # NOTE: Conceptually, filenames are treated as 'unique'
    #  in the index. However, we treat each page as
    #  the actual index 'documents', so there are multiple
    #  documents with the same filename. This works okay, but
    #  when reindexing a file, we'll need to do every page again,
    #  or identify the specific page that needs updating.
    filename = ID(stored=True)
    page = NUMERIC(stored=True)

    # Keep content hashes, so we can check for modifications.
    # e.g. incremental reindex.
    # Using ID for file_hash, so it's possible to check if a
    # given file has already been added to the index before,
    # without having to trust the filename.
    # I can't see a similar need for the page hash, so
    # sticking with a STORED field (not searchable)...
    #
    # NOTE: Can't flag either of these as unique.
    #   Every page will have the same file_hash, and
    #   potentially, there could be pages with identical
    #   content (eg. "Intentially left blank" heh...)
    file_hash = ID(stored=True)
    page_hash = STORED

    date = DATETIME
    person = TEXT(stored=True)

    # TODO: Add the person's title / seat
    #   Currently when searching for an area, we count on it
    #   being picked up by the content field.  Maybe nice to
    #   have explicit titles added, in case they've handwritten stuff.
    # title = TEXT(stored=True)

    # For now, we're only using PDFs. But, one day... ?
    content_type = ID(stored=True)

    # Can't see an immediate use for this. But easy to capture
    # and is cheap, so ... why not?
    content_length = NUMERIC(stored=True)

    # Use the stemming analyzer, so the user can search
    # for plural/singular, or the 'ing'/'ier'/'ies', etc,
    # versions of each word & still get the hit.
    #
    # Store the content, so we can access hit highlights
    # painlessly (otherwise, will need to fetch from original
    # sources again later).
    content = TEXT(analyzer=StemmingAnalyzer(), stored=True)
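
# Sketch under stated assumptions (not from the original project): each page is added
# as its own document, as the class docstring above describes, and hit highlights come
# from the stored "content" field. The index path, filenames and text are hypothetical.
#
# from whoosh.index import create_in
# from whoosh.qparser import QueryParser
#
# ix = create_in("percs_index", PercsSchema)
# writer = ix.writer()
# for page_no, page_text in enumerate(["first page text", "second page text"], start=1):
#     writer.add_document(id=u"example.pdf:%d" % page_no, filename=u"example.pdf",
#                         page=page_no, person=u"A. Person", content=page_text)
# writer.commit()
# with ix.searcher() as searcher:
#     for hit in searcher.search(QueryParser("content", ix.schema).parse("page")):
#         print(hit["filename"], hit["page"], hit.highlights("content"))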
Example #23
import unicodecsv as csv
from whoosh import index, sorting
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import Schema, STORED, NGRAMWORDS, NUMERIC
from whoosh.qparser import MultifieldParser

_schema = Schema(
    ror=STORED(),
    grid=STORED(),
    name=NGRAMWORDS(stored=False),
    aliases=NGRAMWORDS(stored=False),
    num_students=NUMERIC(int, sortable=True, stored=False),
    citation_score=NUMERIC(int, sortable=True, stored=False),
)

_index_path = 'data/ror-whoosh-index'
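
# Sketch (not from the original module): querying the n-gram name fields and sorting
# matches by the sortable citation_score column; assumes an index has already been
# written to _index_path.
def _match_org(query_text, limit=5):
    ix = index.open_dir(_index_path)
    parser = MultifieldParser(["name", "aliases"], schema=_schema)
    facet = sorting.FieldFacet("citation_score", reverse=True)
    with ix.searcher() as searcher:
        hits = searcher.search(parser.parse(query_text), limit=limit, sortedby=facet)
        return [(hit["ror"], hit["grid"]) for hit in hits]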


def _read_ror_csv_rows():
    rows = []
    with open('data/ror-metrics.csv') as ror_csv:
        reader = csv.DictReader(ror_csv)
        for row in reader:
            row['aliases'] = row['aliases'].split(
                u'###') if row['aliases'] else []
            row['num_students'] = int(
                row['num_students']) if row['num_students'] else None
            row['citation_score'] = float(
                row['citation_score']) if row['citation_score'] else None
            rows.append(row)
Example #24
    def __init__(self, index_dir, backend, user_name=None, acl_support=False, **kw):
        """
        Store params, create schemas.
        """
        self.index_dir = index_dir
        self.index_dir_tmp = index_dir + '.temp'
        self.backend = backend
        self.user_name = user_name # TODO use currently logged-in username
        self.acl_support = acl_support
        self.wikiname = u'' # TODO take from app.cfg.interwikiname
        self.ix = {}  # open indexes
        self.schemas = {}  # existing schemas

        common_fields = {
            # wikiname so we can have a shared index in a wiki farm, always check this!
            WIKINAME: ID(stored=True),
            # tokenized NAME from metadata - use this for manual searching from UI
            # TODO was: NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
            NAME: ID(stored=True, field_boost=2.0),
            # unmodified NAME from metadata - use this for precise lookup by the code.
            # also needed for wildcard search, so the original string as well as the query
            # (with the wildcard) is not cut into pieces.
            NAME_EXACT: ID(field_boost=3.0),
            # revision id (aka meta id)
            REVID: ID(unique=True, stored=True),
            # MTIME from revision metadata (converted to UTC datetime)
            MTIME: DATETIME(stored=True),
            # tokenized CONTENTTYPE from metadata
            # TODO was: CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
            CONTENTTYPE: ID(stored=True),
            # unmodified list of TAGS from metadata
            TAGS: ID(stored=True),
            LANGUAGE: ID(stored=True),
            # USERID from metadata TODO: -> user ITEMID
            USERID: ID(stored=True),
            # ADDRESS from metadata
            ADDRESS: ID(stored=True),
            # HOSTNAME from metadata
            HOSTNAME: ID(stored=True),
            # SIZE from metadata
            SIZE: NUMERIC(stored=True),
            # ACTION from metadata
            ACTION: ID(stored=True),
            # tokenized COMMENT from metadata
            COMMENT: TEXT(stored=True),
            # data (content), converted to text/plain and tokenized
            CONTENT: TEXT(stored=True),
        }

        latest_revs_fields = {
            # ITEMID from metadata - as there is only latest rev of same item here, it is unique
            ITEMID: ID(unique=True, stored=True),
            # unmodified list of ITEMLINKS from metadata
            ITEMLINKS: ID(stored=True),
            # unmodified list of ITEMTRANSCLUSIONS from metadata
            ITEMTRANSCLUSIONS: ID(stored=True),
            # tokenized ACL from metadata
            # TODO was: ACL: TEXT(analyzer=AclTokenizer(self._cfg), multitoken_query="and", stored=True),
            ACL: ID(stored=True),
        }
        latest_revs_fields.update(**common_fields)

        userprofile_fields = {
            EMAIL: ID(unique=True, stored=True),
            OPENID: ID(unique=True, stored=True),
        }
        latest_revs_fields.update(**userprofile_fields)

        all_revs_fields = {
            ITEMID: ID(stored=True),
        }
        all_revs_fields.update(**common_fields)

        latest_revisions_schema = Schema(**latest_revs_fields)
        all_revisions_schema = Schema(**all_revs_fields)

        # Define dynamic fields
        dynamic_fields = [("*_id", ID(stored=True)),
                          ("*_text", TEXT(stored=True)),
                          ("*_keyword", KEYWORD(stored=True)),
                          ("*_numeric", NUMERIC(stored=True)),
                          ("*_datetime", DATETIME(stored=True)),
                          ("*_boolean", BOOLEAN(stored=True)),
                         ]

        # Adding dynamic fields to schemas
        for glob, field_type in dynamic_fields:
            latest_revisions_schema.add(glob, field_type, glob=True)
            all_revisions_schema.add(glob, field_type, glob=True)

        # schemas are needed by query parser and for index creation
        self.schemas[ALL_REVS] = all_revisions_schema
        self.schemas[LATEST_REVS] = latest_revisions_schema
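
        # Illustrative sketch (not MoinMoin code) of what the glob fields above mean in
        # practice: any document key matching a pattern such as "*_text" is indexed with
        # the associated field type without being declared in the schema up front, e.g.:
        #
        #     s = Schema(name=ID(stored=True))
        #     s.add("*_text", TEXT(stored=True), glob=True)
        #     # a writer for s now accepts e.g. summary_text=u"indexed via the glob field"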
Example #25
import logging

import whoosh.index
from whoosh import scoring
from whoosh.fields import KEYWORD, NUMERIC, Schema, STORED, TEXT
from whoosh.qparser import MultifieldParser
from whoosh.query import And, Every, Term

from galaxy import exceptions
from galaxy.exceptions import ObjectNotFound
from galaxy.util.search import parse_filters

log = logging.getLogger(__name__)

schema = Schema(
    id=NUMERIC(stored=True),
    name=TEXT(field_boost=1.7, stored=True),
    description=TEXT(field_boost=1.5, stored=True),
    long_description=TEXT(stored=True),
    homepage_url=TEXT(stored=True),
    remote_repository_url=TEXT(stored=True),
    repo_owner_username=TEXT(stored=True),
    categories=KEYWORD(stored=True, commas=True, scorable=True),
    times_downloaded=STORED,
    approved=STORED,
    last_updated=STORED,
    repo_lineage=STORED,
    full_last_updated=STORED)


class RepoWeighting(scoring.BM25F):
Example #26
class SourceSchema(SchemaClass):
    """Fultext index schema for source and context strings."""
    pk = NUMERIC(stored=True, unique=True)
    source = TEXT()
    context = TEXT()
    location = TEXT()
Example #27
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example #28
INDEX_BASE_DIR = "index/"

stemming_analyzer = StemmingAnalyzer(cachesize=-1)
schema = Schema(url=ID(stored=True, unique=True),
                path=ID(stored=True, unique=True),
                title=TEXT(stored=True),
                title_stem=TEXT(analyzer=stemming_analyzer),
                description=TEXT(),
                description_stem=TEXT(analyzer=stemming_analyzer),
                keywords=KEYWORD(),
                keywords_stem=KEYWORD(analyzer=stemming_analyzer),
                links_in_keywords=KEYWORD(),
                links_in_keywords_stem=KEYWORD(analyzer=stemming_analyzer),
                content=TEXT(),
                content_stem=TEXT(analyzer=stemming_analyzer),
                pagerank=NUMERIC(stored=True, sortable=True))
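
# Sketch (not part of the original crawler code): querying the raw and stemmed field
# pairs together and ordering hits by the sortable "pagerank" field. Assumes an index
# has already been written under INDEX_BASE_DIR.
#
# from whoosh import index
# from whoosh.qparser import MultifieldParser
#
# ix = index.open_dir(INDEX_BASE_DIR)
# parser = MultifieldParser(["title", "title_stem", "content", "content_stem"], schema=schema)
# with ix.searcher() as searcher:
#     hits = searcher.search(parser.parse("whoosh tutorial"), sortedby="pagerank", reverse=True)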


def index_docs(docs):
    index_documents(docs)
    log_prof_data(logger)


@profile
def index_documents(docs):
    msg = "Indexing documents"
    logger.info('%s %s', MSG_START, msg)
    try:

        if not os.path.isdir(INDEX_BASE_DIR):
            logger.info(
Example #29
def schema():

    schema = Schema(person=ID(stored=True),
        debate_no=TEXT(stored=True),
        sentiment_score=NUMERIC(stored=True, sortable=True),
        tags=KEYWORD(stored=True),
        sentence=TEXT(spelling=True, analyzer=StemmingAnalyzer(), stored=True))

    FIELD_KEYWORDS = 'keywords'
    FIELD_CONTENT = 'sentences'

    if not os.path.exists("index"):
        os.mkdir("index")
    ix = create_in("index", schema)



# create list of lists
    data = []
    for row in datareader:
        data.append(row)

# delete header
    del data[0]

# create list of dictionaries (using header terms as keys)
    transcript = []
    for row in data:
        dct = {}
        dct['party'] = row[0]
        dct['debateNo'] = row[1].decode('utf-8')
        dct['sentenceNo']=row[2]
        dct['sequenceNo']=row[3]
        dct['speaker']=row[4].decode('utf-8')
        dct['text']=row[5]
        transcript.append(dct)

# fix error in transcript for second Republican debate (WALKER's lines had been assigned to TRUMP or BUSH)
    for row in transcript:
        if row['party'] == 'rep' and row['debateNo']=='02' and row['text'].startswith('WALKER'):
            row['speaker'] = u'WALKER'
            text = bytearray(row['text'])
            del text[0:7]
            row['text'] = str(text)
        #print row

#for row in transcript:
    #print row

# encode sentences as unicode
    for row in transcript:
        row['text'] = row['text'].decode('utf-8')

    rep_speakers = ['CRUZ', 'RUBIO', 'KASICH', 'CARSON', 'FIORINA', 'PAUL', 'HUCKABEE', 'WALKER','TRUMP', 'CHRISTIE', 'BUSH']
    dem_speakers = ['CLINTON', 'SANDERS', 'CHAFEE', "O'MALLEY", 'WEBB']

# filtering out moderators
    transcript_no_moderators = []
    for row in transcript:
        if row['speaker'] in rep_speakers:
            transcript_no_moderators.append(row)
        if row['speaker'] in dem_speakers:
            transcript_no_moderators.append(row)

# Opening the index back up
    ix = open_dir("index")

# creating the testbatch
    testbatch=[]
    for row in transcript_no_moderators:
        testbatch.append(row)

    writer = ix.writer()
    for row in testbatch:
        writer.add_document(person=row['speaker'], debate_no =row['debateNo'], sentence=row['text'])
    writer.commit()
# sentiment score is already in the schema, so calculate the sentiment score in this for loop and write it back out
Example #30
def get_schema():
    return Schema(id=NUMERIC(stored=True, unique=True, numtype=int),
                  title=TEXT(stored=True),
                  content=TEXT(),
                  correspondent=TEXT(stored=True))