Example No. 1
def schema():

    schema = Schema(person=ID(stored=True),
        debate_no=TEXT(stored=True),
        sentiment_score=NUMERIC(stored=True, sortable=True),
        tags=KEYWORD(stored=True),
        sentence=TEXT(spelling=True, analyzer=StemmingAnalyzer(), stored=True))

    FIELD_KEYWORDS = 'keywords'
    FIELD_CONTENT = 'sentences'

    if not os.path.exists("index"):
        os.mkdir("index")
    ix = create_in("index", schema)



# create list of lists
    data = []
    for row in datareader:
        data.append(row)

# delete header
    del data[0]

# create list of dictionaries (using header terms as keys)
    transcript = []
    for row in data:
        dct = {}
        dct['party'] = row[0]
        dct['debateNo'] = row[1].decode('utf-8')
        dct['sentenceNo']=row[2]
        dct['sequenceNo']=row[3]
        dct['speaker']=row[4].decode('utf-8')
        dct['text']=row[5]
        transcript.append(dct)

# fix error in transcript for second Republican debate (WALKER's lines had been assigned to TRUMP or BUSH)
    for row in transcript:
        if row['party'] == 'rep' and row['debateNo']=='02' and row['text'].startswith('WALKER'):
            row['speaker'] = u'WALKER'
            text = bytearray(row['text'])
            del text[0:7]
            row['text'] = str(text)
        #print row

#for row in transcript:
    #print row

# encode sentences as unicode
    for row in transcript:
        row['text'] = row['text'].decode('utf-8')

    rep_speakers = ['CRUZ', 'RUBIO', 'KASICH', 'CARSON', 'FIORINA', 'PAUL', 'HUCKABEE', 'WALKER','TRUMP', 'CHRISTIE', 'BUSH']
    dem_speakers = ['CLINTON', 'SANDERS', 'CHAFEE', "O'MALLEY", 'WEBB']

# filtering out moderators
    transcript_no_moderators = []
    for row in transcript:
        if row['speaker'] in rep_speakers:
            transcript_no_moderators.append(row)
        if row['speaker'] in dem_speakers:
            transcript_no_moderators.append(row)

# Opening the index back up
    ix = open_dir("index")

# creating the testbatch
    testbatch=[]
    for row in transcript_no_moderators:
        testbatch.append(row)

    writer = ix.writer()
    for row in testbatch:
        writer.add_document(person=row['speaker'], debate_no =row['debateNo'], sentence=row['text'])
    writer.commit()
# sentiment score is already in the schema, so calculate the sentiment score in this for loop and write it back out
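The closing comment points at the missing step. A minimal sketch of how the indexing loop above could also populate the sentiment_score field; this assumes TextBlob for polarity (no sentiment library is named in the original, so any other could be swapped in):

# Assumption: TextBlob supplies the polarity score; it is not part of the original code.
from textblob import TextBlob

writer = ix.writer()
for row in testbatch:
    polarity = TextBlob(row['text']).sentiment.polarity  # float in [-1.0, 1.0]
    # The schema declares sentiment_score as NUMERIC, which defaults to int,
    # so scale the polarity to an integer before indexing.
    writer.add_document(person=row['speaker'],
                        debate_no=row['debateNo'],
                        sentence=row['text'],
                        sentiment_score=int(round(polarity * 100)))
writer.commit()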
Example No. 2
whoosh.fields.NGRAM
TBD.
Advanced users can create their own field types.
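As a rough sketch of what that involves (the class name and the analyzer/format choices below are illustrative assumptions, not from the original text): a custom field type subclasses FieldType and sets an analyzer and a posting format.

from whoosh.fields import FieldType
from whoosh import analysis, formats

class TagField(FieldType):
    """A keyword-like field that lowercases terms and records term frequency."""
    def __init__(self, stored=False):
        self.analyzer = analysis.KeywordAnalyzer(lowercase=True)
        self.format = formats.Frequency()
        self.scorable = True
        self.stored = stored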

Creating a Schema



from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer

schema = Schema(from_addr=ID(stored=True),
                to_addr=ID(stored=True),
                subject=TEXT(stored=True),
                body=TEXT(analyzer=StemmingAnalyzer()),
                tags=KEYWORD)

If you don't pass any keyword arguments to a field's constructor, you can leave off the trailing parentheses (e.g. fieldname=TEXT instead of fieldname=TEXT()); Whoosh will instantiate the field type for you.
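For instance, a quick sketch of that shorthand (the field names here are only illustrative):

from whoosh.fields import Schema, TEXT, KEYWORD, ID

schema = Schema(title=TEXT(stored=True),  # keyword arguments, so parentheses are needed
                tags=KEYWORD,             # no arguments, so the bare class is enough
                path=ID)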

You can also choose to create a schema declaratively by subclassing SchemaClass:
from whoosh.fields import SchemaClass, TEXT, KEYWORD, ID, STORED

class MySchema(SchemaClass):
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT
    tags = KEYWORD
You can pass the class itself to the create_in() or create_index() function instead of an instance of it.
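A short sketch of what that looks like (the "indexdir" directory name is just an example):

import os
from whoosh.index import create_in

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = create_in("indexdir", MySchema)  # pass the class; Whoosh instantiates it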
Example No. 3
class Ingredient(db.Model):
    __searchable__ = ['name']
    __analyzer__ = StemmingAnalyzer()
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(250), nullable=False)
Example No. 4
class BmarkSchema(SchemaClass):
    bid = ID(unique=True, stored=True)
    description = TEXT
    extended = TEXT
    tags = KEYWORD
    readable = TEXT(analyzer=StemmingAnalyzer())
Example No. 5
try:
    from jieba.analyse import ChineseAnalyzer
except Exception as err:
    print(repr(err))
    ChineseAnalyzer = None

SITE_CFG['LANG'] = SITE_CFG.get('LANG', 'zh')

# Using jieba lib for Chinese.
if SITE_CFG['LANG'] == 'zh' and ChineseAnalyzer:
    TOR_SCHEMA = Schema(title=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                        catid=TEXT(stored=True),
                        type=TEXT(stored=True),
                        link=ID(unique=True, stored=True),
                        content=TEXT(stored=True, analyzer=ChineseAnalyzer()))
else:
    TOR_SCHEMA = Schema(title=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                        catid=TEXT(stored=True),
                        type=TEXT(stored=True),
                        link=ID(unique=True, stored=True),
                        content=TEXT(stored=True, analyzer=StemmingAnalyzer()))

WHOOSH_BASE = 'database/whoosh'
if os.path.exists(WHOOSH_BASE):
    TOR_IDX = open_dir(WHOOSH_BASE)
else:
    os.makedirs(WHOOSH_BASE)
    TOR_IDX = create_in(WHOOSH_BASE, TOR_SCHEMA)


def do_for_app(rand=True, kind='', doc_type=None):
    '''
Example No. 6
    def get_absolute_url(self):
        return reverse('recommendations') + '?seed=%s' % self.paper_id

    def get_title(self):
        return self.title

    def set_rank(self, rank):
        self.rank = rank
        return self


paper_schema = Schema(
    paper_id=ID(stored=True),
    title=TEXT(stored=True),
    abstract=TEXT(analyzer=StemmingAnalyzer()),
    paper_url=TEXT(),
    aspect_tasks=KEYWORD,
    aspect_methods=KEYWORD,
    aspect_datasets=KEYWORD,
)

if settings.ASPECT_KNN_WHOOSH_INDEX_PATH and os.path.exists(settings.ASPECT_KNN_WHOOSH_INDEX_PATH):
    ix = index.open_dir(settings.ASPECT_KNN_WHOOSH_INDEX_PATH)  #'/Users/maos01/Desktop/special-docembeds-release-files/output/pwc/whoosh_index'
else:
    ix = None


# Load vector models
generic_vecs = KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_GENERIC_W2V_PATH, limit=settings.ASPECT_KNN_LIMIT) if settings.ASPECT_KNN_GENERIC_W2V_PATH and os.path.exists(settings.ASPECT_KNN_GENERIC_W2V_PATH) else None #  '/Users/maos01/Downloads/specter.1k.w2v.txt'
task_vecs = KeyedVectors.load_word2vec_format(settings.ASPECT_KNN_TASK_W2V_PATH, limit=settings.ASPECT_KNN_LIMIT) if settings.ASPECT_KNN_TASK_W2V_PATH and os.path.exists(settings.ASPECT_KNN_TASK_W2V_PATH) else None
Example No. 7
def main():
    file_content_doc1 = open("rural_min.txt").read()
    file_content_doc2 = open("science_min.txt").read()
    option = True
    while option:
        print("""
        1. Create Index.
        2. Query Index.
        3. Exit
        """)
        option = input("Please select an option...!")
        if option == "1":

            sent_tokenize_list1 = sent_tokenize(file_content_doc1,
                                                language='english')
            sent_tokenize_list2 = sent_tokenize(file_content_doc2,
                                                language='english')
            if not os.path.exists("index_task3_min"):
                os.mkdir("index_task3_min")

            my_analyzer = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | Lemmatizer()
            pos_tagger = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | PosTagger()
            wordnetsyn1 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets()
            wordnetsyn2 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets1()
            wordnetsyn3 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets2()
            wordnetsyn4 = RegexTokenizer() | StopFilter() | LowercaseFilter(
            ) | WordNetSynsets3()

            schema = Schema(id=ID(stored=True, unique=True),
                            standard=TEXT(stored=True,
                                          analyzer=StandardAnalyzer()),
                            stem_text=TEXT(stored=True,
                                           analyzer=StemmingAnalyzer()),
                            lemma=TEXT(stored=True, analyzer=my_analyzer),
                            pos_text=TEXT(stored=True, analyzer=pos_tagger),
                            hypernym=TEXT(stored=True, analyzer=wordnetsyn1),
                            hyponym=TEXT(stored=True, analyzer=wordnetsyn2),
                            holonym=TEXT(stored=True, analyzer=wordnetsyn3),
                            meronyms=TEXT(stored=True, analyzer=wordnetsyn4),
                            dependency=TEXT(analyzer=DependencyParser()))

            ix = index.create_in("index_task3_min", schema)
            writer = ix.writer()

            for sentence in sent_tokenize_list1:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            for sentence in sent_tokenize_list2:
                writer.add_document(standard=sentence,
                                    stem_text=sentence,
                                    lemma=sentence,
                                    pos_text=sentence,
                                    hypernym=sentence,
                                    hyponym=sentence,
                                    meronyms=sentence,
                                    holonym=sentence,
                                    dependency=sentence)
            writer.commit()

            print_index_details(ix)

            print("\n\n Index created with various features as its fields")

        elif option == "2":
            ix = index.open_dir("index_task3")

            with ix.searcher(weighting=whoosh.scoring.BM25F()) as searcher:
                og = qparser.OrGroup.factory(0.5)
                q = input("\n Insert a query...!")
                query_text = MultifieldParser([
                    "standard", "stem_text", "lemma", "pos_text", "hyponym",
                    "meronyms", "hypernym", "holonym"
                ],
                                              schema=ix.schema,
                                              group=og).parse(q)
                results = searcher.search(query_text, limit=10)
                for i, hit in enumerate(results):
                    print(results.score(i), hit["standard"], sep=":")
                    print("\n")

        elif option == "3":
            print("\n Goodbye")
            sys.exit(0)
            option = None
        else:
            print("\n Not valid choice try again...!")
Example No. 8
import os.path
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.analysis import StemmingAnalyzer
from whoosh.lang.snowball import english
from whoosh.lang.porter2 import stem
# from whoosh.lang.paicehusk import PaiceHuskStemmer

# Use PorterStemmer2 Algorithm in indexing
stem_ana = StemmingAnalyzer(stemfn=stem)

schema = Schema(docID=NUMERIC(stored=True), contents=TEXT(analyzer=stem_ana))
index_dir = "index"

if not os.path.exists(index_dir):
    os.makedirs(index_dir)

ix = create_in(index_dir, schema)

writer = ix.writer()

with open('doc/document.txt', 'r') as f:
    text = f.read()
    docs = text.split('   /\n')[:-1]
    for doc in docs:
        br = doc.find('\n')
        docID = int(doc[:br])
        doc_text = doc[br + 1:]
        writer.add_document(docID=docID, contents=doc_text)

writer.commit()
Example No. 9
class Recipe(db.Model):
    __searchable__ = ['title', 'description', 'calories']
    __analyzer__ = StemmingAnalyzer()
    id = db.Column(db.Integer, primary_key=True)
    layout = db.Column(db.String(15))
    title = db.Column(db.String(100), unique=True, nullable=False)
    title_formatted = db.Column(db.String(100))
    filename = db.Column(db.String(100))
    image_path = db.Column(db.String(104))
    image_credit = db.Column(db.String(150))
    source = db.Column(db.String(150))
    description = db.Column(db.String(750))
    prep = db.Column(db.String(10))
    cook = db.Column(db.String(10))
    ready = db.Column(db.String(10))
    servings = db.Column(db.String(5))
    calories = db.Column(db.String(20))
    file_last_modified = db.Column(db.DateTime)

    tags = db.relationship('Tag',
                           secondary=recipe_tag,
                           lazy=True,
                           backref=db.backref('recipe', lazy=True))
    ingredients = db.relationship('Ingredient',
                                  secondary=recipe_ingredient,
                                  lazy=True,
                                  backref=db.backref('recipe', lazy=True))
    directions = db.relationship('Direction',
                                 secondary=recipe_direction,
                                 lazy=True,
                                 backref=db.backref('recipe', lazy=True))
    notes = db.relationship('Note',
                            secondary=recipe_note,
                            lazy=True,
                            backref=db.backref('recipe', lazy=True))

    def api_model(self):
        tags = []
        for tag in self.tags:
            tags.append(tag.name)

        ingredients = []
        for ingredient in self.ingredients:
            ingredients.append(ingredient.name)

        directions = []
        for direction in self.directions:
            directions.append(direction.name)

        notes = []
        for note in self.notes:
            notes.append(note.name)

        model = {
            'id': self.id,
            'layout': self.layout,
            'title': self.title,
            'title_formatted': self.title_formatted,
            'filename': self.filename,
            'image_path': self.image_path,
            'image_credit': self.image_credit,
            'source': self.source,
            'description': self.description,
            'prep': self.prep,
            'cook': self.cook,
            'ready': self.ready,
            'servings': self.servings,
            'calories': self.calories,
            'file_last_modified': self.file_last_modified,
            'tags': tags,
            'directions': directions,
            'ingredients': ingredients,
            'notes': notes
        }
        return model

    def __repr__(self):
        return f'<Recipe: {self.title}>'
Example No. 10
def FragmenterAnalyzer():
    ret = StemmingAnalyzer(minsize=0, stoplist=None)
    return ret
Example No. 11
from whoosh.writing import AsyncWriter

parser = argparse.ArgumentParser()
parser.add_argument("data", type=str)
parser.add_argument("-num_docs", type=int, default=None)
parser.add_argument("-threads", type=int,
                    default=1)  # seems using more than 1 thread may be broken?
parser.add_argument("-reload", action="store_true")
parser.add_argument("-migrate_url_to_text_field", action="store_true")
args = parser.parse_args()

schema = Schema(
    docid=ID(stored=True),
    url=ID(stored=True),
    title=TEXT(stored=True,
               analyzer=StemmingAnalyzer()),  # maybe no stemming here?
    body=TEXT(analyzer=StemmingAnalyzer()),
)

index_dir = "data/msmarcoidx" if args.num_docs is None else "data/quickidx"
if not os.path.exists(index_dir):
    os.mkdir(index_dir)
    index.create_in(index_dir, schema)
    args.reload = True

storage = FileStorage(index_dir)
# Open an existing index
ix = storage.open_index()

if args.migrate_url_to_text_field:
    writer = ix.writer()
Example No. 12
def search_engine( analyzer = StemmingAnalyzer(), max_res = 150, multifield_flag = 1, \
                  only_title_flag = 0, \
                  directory_containing_the_index  = r"C:\Users\claba\Desktop\DMT works\HW_1\Index_part_1", \
                  query_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv", \
                  gt_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Ground_Truth.tsv", \
                  doc_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\DOCUMENTS\\", \
                  conf_label = "Not Specified",
                  mrr_eps = .32, \
                  k_interval_for_nDCG = range(1,151)):
   
    
    ###
    ### Create a Schema 
    ###
    schema = Schema(id=ID(stored=True), \
                    title = TEXT(stored=False, analyzer=analyzer),content=TEXT(stored=False, analyzer=analyzer))
    
    ###
    ### Create an empty-Index 
    ### according to the just defined Schema ;)
    ### 
    ix = create_in(directory_containing_the_index, schema)
    
    
    ###
    ### Get the query set (reset index due to missing values in the IDs)
    ###
    query_set = pd.read_csv(query_dir, engine = "python", sep = "\t", index_col="Query_ID").reset_index()
    
    
    ###
    ### Get the ground truth (little manipulation to group by query and align IDs)
    ###
    gt_tmp = pd.read_csv(gt_dir, engine = "python", sep = "\t")
    gt_tmp = gt_tmp.groupby('Query_id')['Relevant_Doc_id'].apply(lambda x: x.tolist()).to_dict()
    gt = defaultdict(list)
    j = 1
    for i in range(len(gt_tmp)):
        while(gt[i] == []):
            try:
                gt[i] = gt_tmp[j]
                j+=1
            except KeyError:
                j += 1
    
    
    
    number_of_queries = len(query_set)
    num_of_docs = 1400
    
    ###
    ### We'll iterate over the following lists to switch the SE scoring function and get their names
    ###
    scoring_functions_list = [scoring.PL2(), scoring.Frequency(), scoring.BM25F(), scoring.TF_IDF()]
    scoring_name = [re.findall(r"(?<=scoring\.)[\w\W]*(?=object)", str(score))[0] for score in scoring_functions_list]
    
    
    ###
    ### Fill the Index
    ###
    writer = ix.writer()
    for doc in range(num_of_docs):
        id_ = str(doc+1)
        title,content = doc_retriver(doc_dir+"______"+str(doc+1)+".html")
        writer.add_document(id=id_, title = title, content = content)
    writer.commit()
    
    
    
    ###
    ### This """tensor""" allows to store all the results we need. It's dimension are #ResultsX#QueriesX#SE_config
    ###
    results_mat = np.zeros([max_res,number_of_queries,len(scoring_functions_list)])
    
   
    evaluations_summary = {} # Dict to store MRR and R-Precision distribution summaries
    ndcg = defaultdict(list) # Def Dict that will contain nDCG values for varying K values for all MRR >.32 SEs

    ###
    ### Run the SEs
    ###
    for idx_s,scorer in enumerate(scoring_functions_list):
        for idx,query in enumerate(query_set["Query"]):
            
            input_query = query
            
            ###
            ### Select a Scoring-Function
            ###
            scoring_function = scorer
            
            ###
            ### Create a QueryParser for 
            ### parsing the input_query based on user SE choosen configuration.
            ###
            if multifield_flag:
                qp = MultifieldParser(["title","content"], ix.schema)
                parsed_query = qp.parse(input_query)# parsing the query
            else:
                if only_title_flag:
                    qp = SimpleParser("title", ix.schema)
                    parsed_query = qp.parse(input_query)# parsing the query
                else:
                    qp = SimpleParser("content", ix.schema)
                    parsed_query = qp.parse(input_query)# parsing the query
                
            ###
            ### Create a Searcher for the Index
            ### with the selected Scoring-Function 
            ###
            searcher = ix.searcher(weighting=scoring_function)
            
            ###
            ### Perform a Search and store results
            ###
            results = searcher.search(parsed_query, limit=max_res)
            results_mat[0:len(results),idx,idx_s] = [hit["id"] for hit in results]
            searcher.close()
        mrr_res = mrr(results_mat[:,:,idx_s],gt)
        
        if mrr_res >= mrr_eps:
            
            ###
            ### Compute and summarize R-precision distro
            ###
            r_res = r_precision(results_mat[:,:,idx_s],gt)
            mean = np.mean(list(r_res.values()))
            first_q = np.percentile(list(r_res.values()),25)
            third_q = np.percentile(list(r_res.values()),75)
            median = np.median(list(r_res.values()))
            minr = min(list(r_res.values()))
            maxr = max(list(r_res.values()))
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res,mean,minr,first_q,median,third_q,maxr]
            
            ###
            ### Compute nDCG@k for varying k and for each scoring function
            ###
            for k in k_interval_for_nDCG:
                tmp_res = np.mean(list(nDCG(results_mat[:,:,idx_s],gt,k = k).values()))
                ndcg[conf_label+","+scoring_name[idx_s]].append(tmp_res)
            
        else:
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res]
        
        ###
        ### Just to see what's happening
        ###
        print("Configuration:"+conf_label+","+scoring_name[idx_s]+"==> MRR = "+str(mrr_res))
        
    return evaluations_summary, ndcg # The evaluation result, of course, contains only the MRR for SEs whose MRR is below the .32 threshold
Example No. 13
# app.config['SECRET_KEY']='f6eeaa4486447025a35ab182035a34a0'
# app.config['SQLAlCHEMY_DATABASE_URI']='sqlite:///site.db'
# db = SQLAlchemy(app)

# from ed_main import routes

# converting your app into a package structure.
from flask import Flask
from flask_wtf.csrf import CSRFProtect, CSRFError
from flask_sqlalchemy import SQLAlchemy
from flask_bcrypt import Bcrypt
from whoosh.analysis import StemmingAnalyzer
import flask_whooshalchemy
from flask_admin import Admin

app = Flask(__name__)
csrf = CSRFProtect(app)
app.config['SECRET_KEY'] = 'f6eeaa4486447025a35ab182035a34a0'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///site.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = True
app.config['WHOOSH_BASE'] = 'whoosh'
app.config['WHOOSH_ANALYZER'] = StemmingAnalyzer()
app.config['FLASK_ADMIN_SWATCH'] = 'cerulean'

db = SQLAlchemy(app)
bcrypt = Bcrypt(app)
admin = Admin(app, name='Project Ed', template_mode='bootstrap3')

from ed_main import routes
from ed_main import admin_views
Example No. 14
def create_schema():
    schema = Schema(doc_text=TEXT(analyzer=StemmingAnalyzer(), stored=True))
    return schema
Example No. 15
import os
import os.path
import datetime
from whoosh import index
from whoosh import query
from whoosh.fields import Schema, TEXT, ID, STORED, DATETIME, KEYWORD, NUMERIC
from whoosh.analysis import StemmingAnalyzer
from get_law_fields import list_from_file, get_fields
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.query import Query, Term, And

# import stopwords
with open("scripts/search_static/stopwords.txt", 'r') as f:
    stopwords = sorted(list(f.read().split('\n')))

lang_ana = StemmingAnalyzer(stoplist=stopwords)

# CREATE A SCHEMA
"""
The schema defines the fields that each document 
(i.e. law in most cases) may contain. 

law_name -- name of the document. Searchable and stored.
law_body -- the intro and articles of a law. Searchable only.
law_num_date -- the number of the law and the exact date. Searchable and stored.
pub_year -- the date of the Official Gazette publication.
article_one -- title and first few sentences of article one. Stored only for displaying in search results.

"""

schema = Schema(
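The snippet breaks off at the Schema( call. A plausible completion based only on the docstring above (the exact field types are assumptions, not the original author's code):

schema = Schema(
    law_name=TEXT(stored=True, analyzer=lang_ana),   # searchable and stored
    law_body=TEXT(analyzer=lang_ana),                # searchable only
    law_num_date=TEXT(stored=True),                  # searchable and stored
    pub_year=NUMERIC(stored=True, sortable=True),    # Official Gazette publication year
    article_one=STORED,                              # stored only, for display in search results
)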
Example No. 16
"""
Created on Fri Dec  1 14:14:36 2017

@author: francruz

"""
import pandas as pd
import whoosh
import csv

# Creating the schema
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
schema = Schema(asin=KEYWORD(stored=True, scorable=True, sortable=True),
                helpful=STORED,
                reviewText=TEXT(analyzer=StemmingAnalyzer(),
                                phrase=False,
                                stored=True),
                overall=TEXT(analyzer=StemmingAnalyzer(), phrase=False),
                reviewTime=ID(stored=True),
                title=TEXT(analyzer=StemmingAnalyzer(),
                           phrase=False,
                           stored=True),
                price=STORED,
                brand=KEYWORD(stored=True),
                reviewLength=STORED,
                reviewWords=STORED,
                avgWordLength=STORED,
                expresiveness=STORED,
                ratingDelta=STORED,
                priceDelta=STORED)
Example No. 17
          imageURL text,
          price numeric,
          rating numeric,
          noOfReviews numeric,
          savings numeric,
          percentageSavings numeric,
          productDesc text,
          reviewPolarity numeric,
          countryOfOrigin text,
          overview text)''')
c.close()

# initialise sentic net
sn = SenticNet()
# does stemming, removes accents so you can match words like cafe, facade etc and removes stopwords
hsn_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | StopFilter()

SCHEMA = Schema(
    filename=ID(unique=True, stored=True, analyzer=hsn_analyzer),
    content=TEXT(analyzer=hsn_analyzer, spelling=True),
    price=NUMERIC(sortable=True, stored=True),
    rating=NUMERIC(sortable=True, stored=True),
    noOfReviews=NUMERIC(sortable=True, stored=True),
    savings=NUMERIC(sortable=True, stored=True),
    percentageSavings=NUMERIC(sortable=True, stored=True),
    review=TEXT(analyzer=hsn_analyzer, spelling=True),
    productDesc=TEXT(stored=True),
    reviewPolarity=NUMERIC(sortable=True, stored=True),
    countryOfOrigin=TEXT(sortable=True, stored=True),
    overview=TEXT(stored=True),
)
Example No. 18
from whoosh.index import create_in
from whoosh import index
from bs4 import BeautifulSoup
from whoosh import qparser
from whoosh.qparser import QueryParser
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StemmingAnalyzer
import re,os,codecs,sys

#Function that filters out text inside the style, script, [document], head and title tags, as well as HTML comments
def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->',element.encode('utf-8')):
        return False
    return True

schema = Schema(id=ID(stored=True),img=TEXT(stored=True),title=TEXT(stored=True),h1=TEXT(analyzer=StemmingAnalyzer()),content=TEXT(analyzer=StemmingAnalyzer(), stored = True))
dir = os.listdir('dataset')
ix = create_in("index", schema)
for i, l in enumerate(dir):
	p = "dataset/"+l
	html = codecs.open( p , "r", "utf-8" ).read()
	soup = BeautifulSoup(html, 'html.parser')
	tit = u''
	tit += soup.title.string
	imgs = soup.find('h1').find_all_next('img')[0]
	im = u'https:'
	im += imgs["src"]
	texts = soup.findAll(text=True)
	visible_texts = filter(visible, texts)
	div = soup.find("div", {"id": "content"})
	headers = div.find_all(['h1', 'h2', 'h3'])
Example No. 19
import sys
import sqlalchemy
from inspect import isclass
from flask_sqlalchemy import models_committed
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.inspection import inspect
from sqlalchemy.types import Boolean, Date, DateTime, Float, Integer, Text
from whoosh import index as whoosh_index
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import BOOLEAN, DATETIME, ID, NUMERIC, TEXT
from whoosh.fields import Schema as _Schema
from whoosh.qparser import AndGroup, MultifieldParser, OrGroup
from .backends import BaseBackend, logger, relation_column

DEFAULT_WHOOSH_INDEX_NAME = 'msearch'
DEFAULT_ANALYZER = StemmingAnalyzer()
DEFAULT_PRIMARY_KEY = 'id'

if sys.version_info[0] < 3:
    str = unicode


class Schema(object):
    def __init__(self, table, analyzer=None):
        self.table = table
        self.analyzer = analyzer
        self.schema = _Schema(**self.fields)

    @property
    def fields(self):
        model = self.table
Example No. 20
import os
import logging
import whoosh
import sqlalchemy
import flask_sqlalchemy
import whoosh.index
from whoosh import fields as whoosh_fields
from whoosh.analysis import StemmingAnalyzer
from whoosh.qparser import OrGroup, AndGroup, MultifieldParser
from whoosh.filedb.filestore import RamStorage
from whoosh.writing import AsyncWriter
from sqlalchemy import types as sql_types
from sqlalchemy.orm import EXT_CONTINUE

logger = logging.getLogger(__name__)

# DEFAULTS
DEFAULT_WHOOSH_ANALYZER = StemmingAnalyzer()
DEFAULT_WHOOSH_INDEX_PATH = os.path.join(os.path.abspath(os.getcwd()),
                                         '.indexes')

UPDATE_FIELDS = ('update', 'insert')
TEXT_TYPES = (sql_types.String, sql_types.Unicode, sql_types.Text)
DATE_TYPES = (sql_types.DateTime, sql_types.Date)
NUM_TYPES = (sql_types.Integer, sql_types.BigInteger, sql_types.SmallInteger,
             sql_types.Float, sql_types.Binary)


class WhooshAlchemyError(Exception):
    """ Base exception class for Flask-WhooshAlchemy3 """


class QueryProxy(flask_sqlalchemy.BaseQuery):
Example No. 21
class WhooshConstants():
    index_dir = configuration.get('whoosh_index_dir')
    tokenized_analyzer = StandardAnalyzer(stoplist=None)
    normalized_analyzer = IDTokenizer() | SubstitutionFilter(
        r"[\s/,_'-]", "") | LowercaseFilter()
    stem_analyzer = StemmingAnalyzer(r"[\s/,_'-]", gaps=True, stoplist=None)
Example No. 22
import os, shutil, sqlite3, time
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, NGRAM
from whoosh.analysis import StemmingAnalyzer, NgramWordAnalyzer
from whoosh.qparser import QueryParser
from whoosh.index import create_in, open_dir

os.chdir(os.path.dirname(__file__))
ix_dir = os.path.join(os.getcwd(), 'dir_indices')

b_indexing = False
if b_indexing:
    if os.path.isdir(ix_dir):
        shutil.rmtree(ix_dir)
    os.mkdir(ix_dir)
    schema = Schema(code=ID(stored=True),
                    name=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                    note=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                    iid=ID(stored=True))

    ix = create_in(ix_dir, schema)
    writer = ix.writer()

    cn = sqlite3.connect('mycis.db')
    cr = cn.cursor()

    print('creating indices ...')
    start_time = time.time()
    for r in cr.execute('select * from diag').fetchall():
        iid, code, name, name_zh = r
        # remove . in icd10
        writer.add_document(code=code.replace('.', ''),
Example No. 23
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=StemmingAnalyzer(),
                    field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example No. 24
from whoosh.support.charset import accent_map

BACKGROUND_JOB_KEY = "updateIndex"
UPDATE_INDEX_COMMAND = "update_index.py"
INDEX_PREFIX = "bookmarks-"
INDEXING_SETTING = "indexing"
CURRENT_INDEX_SETTING = "currentIndex"
INDEX_FRESH_CACHE = "freshIndex"
_N_GRAM_FIELD = "contentNGram"
_TEXT_FIELD = "contentText"
_CHILDREN_KEY = "children"

_BLUE_INDEX = "blue"
_GREEN_INDEX = "green"

_TEXT_ANALYZER = StemmingAnalyzer() | CharsetFilter(accent_map)
_N_GRAM_ANALYZER = analysis.NgramWordAnalyzer(minsize=2, maxsize=2)


class BookmarkSchema(fields.SchemaClass):
    contentNGram = TEXT(stored=False, analyzer=_N_GRAM_ANALYZER, phrase=False)
    contentText = TEXT(stored=False, analyzer=_TEXT_ANALYZER, phrase=True)
    urlSize = NUMERIC(signed=False, sortable=True, default=999)
    name = STORED()
    path = STORED()
    profile = STORED()
    url = STORED()
    icon = STORED()


class BookmarkIndex:
Example No. 25
def BuildHelpIndex():

    if os.path.exists(indexDir):
        shutil.rmtree(indexDir, ignore_errors=True)
    os.mkdir(indexDir)

    stemmingAnalyzer = StemmingAnalyzer()
    schema = Schema(path=ID(stored=True, unique=True),
                    section=TEXT(stored=True),
                    title=TEXT(stored=True, analyzer=stemmingAnalyzer),
                    level=NUMERIC(stored=True),
                    content=TEXT(stored=True, analyzer=stemmingAnalyzer))
    ix = create_in(indexDir, schema)
    writer = ix.writer()

    titleTags = set([u'h1', u'h2', u'h3', u'h4', u'h5'])

    newLines = re.compile('\n+')
    nonNumeric = re.compile(r'[^\d]')

    def addDocument(fname, section, lastTitle, textCur):
        # print u'addDocument: lastTitle={}'.format(lastTitle)
        if lastTitle and textCur:
            section = '|'.join(section) if section else lastTitle.get_text()
            # print u'Indexing: {}: {}'.format(os.path.basename(fname), section)
            content = newLines.sub(u'\n', u'\n'.join(textCur))
            writer.add_document(path=os.path.basename(fname) + u'#' +
                                lastTitle['id'],
                                title=lastTitle.get_text(),
                                section=section,
                                level=int(nonNumeric.sub(u'', lastTitle.name)),
                                content=content)

    # Extract content sections from the html pages.
    for f in glob.iglob(os.path.join(htmlDocDir, '*.html')):
        doc = BeautifulSoup(open(f).read(), 'html.parser')
        div = doc.find('div', class_='content')
        if not div:
            continue

        lastTitle = None
        textCur = []
        section = []
        for child in div.contents:
            try:
                tag = child.name
            except:
                tag = None

            if tag not in titleTags:
                try:
                    textCur.append(child.get_text())
                except:
                    pass
                continue

            addDocument(f, section, lastTitle, textCur)

            iSection = int(int(nonNumeric.sub('', tag))) - 1
            section = section[:iSection]
            section.append(child.get_text())

            lastTitle = child
            textCur = []

        addDocument(f, section, lastTitle, textCur)

    writer.commit()
Example No. 26
# Copyright (C) 2013, Thomas Leonard
# See the COPYING file for details, or visit http://0install.net.
#
# This version for 0mirror is based on original code for 0install:
# http://thread.gmane.org/gmane.comp.file-systems.zero-install.devel/3847

import os
import logging

from whoosh.index import create_in, open_dir
from whoosh import fields
from whoosh.analysis import StemmingAnalyzer

from zeroinstall.injector.namespaces import XMLNS_IFACE

sa = StemmingAnalyzer()
schema = fields.Schema(uri=fields.ID(unique=True, stored=True),
                       baseuri=fields.KEYWORD(field_boost=10.0,
                                              lowercase=True),
                       name=fields.KEYWORD(stored=True,
                                           field_boost=50.0,
                                           lowercase=True),
                       summary=fields.TEXT(stored=True, field_boost=5.0),
                       description=fields.TEXT(analyzer=sa),
                       category=fields.KEYWORD(stored=True),
                       homepage=fields.STORED)


class Indexer:
    def __init__(self, config, index_dir):
        self.config = config
Example No. 27
    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter('em')
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name), terms,
                        sa, ContextFragmenter(), formatter)
                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_result],
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
Example No. 28
class ObjectD(db.Model, BlogishBlob):
    __tablename__ = 'objectD'
    __searchable__ = ['title']
    __analyzer__ = StemmingAnalyzer() | DoubleMetaphoneFilter()
Example No. 29
class Direction(db.Model):
    __searchable__ = ['name']
    __analyzer__ = StemmingAnalyzer()
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(1000), nullable=False)
Example No. 30
import sys, os, csv
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.analysis import SimpleAnalyzer, StandardAnalyzer, StemmingAnalyzer, FancyAnalyzer

if len(sys.argv) != 3:
    sys.exit(
        '\nInputError: the user must enter an analyzer and the csv file path to index.\n'
        'EX: "SimpleAnalyzer ./part_1/Cranfield_DATASET/docs_table.csv"\n\n'
        'The user can choose from the following analyzer methods:\n\n'
        '"SimpleAnalyzer": it is a lower case filter\n\n'
        '"StandardAnalyzer": it is a lower case filter and  stop-words filter\n\n'
        '"StemmingAnalyzer": it is a lower case filter, stop-words filter and stemming filter\n\n'
        '"FancyAnalyzer": it is a lower case, stop-words, stemming filter and split words into subwords when it'
        'is useful\n')

with open(sys.argv[2], 'r') as csv_file:
    reader = csv.reader(csv_file, delimiter=' ')
    schema_fields = next(reader)[0].split(',')

if sys.argv[1] == 'SimpleAnalyzer': analyzer = SimpleAnalyzer()
elif sys.argv[1] == 'StandardAnalyzer': analyzer = StandardAnalyzer()
elif sys.argv[1] == 'StemmingAnalyzer': analyzer = StemmingAnalyzer()
elif sys.argv[1] == 'FancyAnalyzer': analyzer = FancyAnalyzer()
else: sys.exit('\nInputError: "%s" is not one of the supported analyzers.\n' % sys.argv[1])

schema = Schema(id=ID(stored=True))
for field in schema_fields[1:]:
    schema.add(field, TEXT(stored=False, analyzer=analyzer))

index_dir = os.path.dirname(sys.argv[2]) + '/' + sys.argv[1]
os.mkdir(index_dir)
create_in(index_dir, schema)