def __init__(self, stored=False, unique=False, expression=None,
             field_boost=1.0, spelling=False):
    """
    :param stored: Whether the value of this field is stored with the document.
    :param unique: Whether the value of this field is unique per-document.
    :param expression: The regular expression object to use to extract tokens.
        The default expression breaks tokens on CRs, LFs, tabs, spaces, commas,
        and semicolons.
    """
    expression = expression or re.compile(r"[^\r\n\t ,;]+")
    self.analyzer = RegexAnalyzer(expression=expression)
    self.format = formats.Existence(field_boost=field_boost)
    self.stored = stored
    self.unique = unique
    self.spelling = spelling
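# A minimal sketch (not from the original source) of what the default expression
# above produces when wrapped in a RegexAnalyzer: one token per run of characters
# that are not CR, LF, tab, space, comma, or semicolon. The sample input and the
# expected output are illustrative assumptions.
import re
from whoosh.analysis import RegexAnalyzer

_expr = re.compile(r"[^\r\n\t ,;]+")
_analyzer = RegexAnalyzer(expression=_expr)
print([t.text for t in _analyzer(u"red,green; blue\tyellow")])
# expected: ['red', 'green', 'blue', 'yellow']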
#!/usr/bin/env python
# coding: utf-8
from whoosh.fields import *
from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
from whoosh.index import create_in
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import os.path
import whoosh
from whoosh.analysis import RegexAnalyzer
import csv
from whoosh import qparser

analyzer = RegexAnalyzer(r'([\u4e00-\u9fa5])|(\w+(\.?\w+)*)')  # analyzer for Chinese text


def createIndexs(dirName):
    schema = Schema(id=NUMERIC(sortable=True),
                    views=KEYWORD(stored=True),
                    sentiment=TEXT(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    if not os.path.exists(dirName):
        os.mkdir(dirName)
    ix = create_in(dirName, schema)
    dic = {}
    for line in open('Test.csv'):
        id, content = line.split('\t')
        dic[id] = content
    writer = ix.writer()
    reader = csv.reader(open('result_bs.csv'))
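# A minimal sketch (assumption, not the original function body) of how an index
# built with the schema above is typically filled and committed in Whoosh. The
# row layout of `rows` is hypothetical; only the field names come from the
# schema in the snippet above.
def fill_index_example(ix, rows):
    writer = ix.writer()
    for row_id, views, sentiment, content in rows:
        writer.add_document(id=int(row_id),
                            views=views,
                            sentiment=sentiment,
                            content=content)
    writer.commit()  # persist the postings once all documents are added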
        num_added_records_so_far += 1
        if (num_added_records_so_far % 100 == 0):
            print(" num_added_records_so_far= " + str(num_added_records_so_far))
    #
    writer.commit()   # it is necessary to store the index once filled
    in_file.close()   # it is necessary to close the .csv file

'''
Here the "schemas" function is used to create and fill all the schemas (indexes)
for both .csv files (Cranfield.csv and Time.csv).
'''

# all the analyzers that are used
analyzers = [StemmingAnalyzer(), StandardAnalyzer(), RegexAnalyzer(),
             SimpleAnalyzer(), FancyAnalyzer(), NgramAnalyzer(4),
             KeywordAnalyzer(), LanguageAnalyzer('en')]
# analyzer names
analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer',
                  'SimpleAnalyzer', 'FancyAnalyzer', 'NgramAnalyzer',
                  'KeywordAnalyzer', 'LanguageAnalyzer']
csv_names = ['Cranfield', 'Time']  # file names

# iterate over all the .csv files (here only the two that exist, Cranfield.csv and Time.csv)
for name in csv_names:
    print(name, '\n\n')
    path = "C:./" + name + "_DATASET"  # path where the .csv is stored
    # iterate over the analyzers to create the 8 different inverted indexes
    for e, type_analyzer in enumerate(analyzers):
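# A minimal sketch (assumption, not the original loop body) of how one inverted
# index per analyzer can be created: each analyzer gets its own schema and its
# own named index inside the dataset directory. The function and field names
# here are hypothetical.
import os
from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in

def build_index_for_analyzer(path, analyzer, analyzer_name):
    schema = Schema(doc_id=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    if not os.path.exists(path):
        os.mkdir(path)
    # a separate named index per analyzer keeps the 8 indexes side by side
    return create_in(path, schema, indexname=analyzer_name)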
import time
import hashlib
from urllib import unquote
from searcher.models import config
from slutils import mysql_new
import os
import datetime
from searcher.models import formatURL
from whoosh.analysis import RegexAnalyzer
from whoosh.analysis import StandardAnalyzer
from whoosh.analysis import LanguageAnalyzer
from whoosh.fields import Schema, ID, TEXT, STORED, NUMERIC
from jieba.analyse import ChineseAnalyzer
from collections import defaultdict

analyzer_zhongwen = ChineseAnalyzer()  # jieba-based analyzer for Chinese text
analyzer_pinyin = RegexAnalyzer()      # default word tokenizer for pinyin titles


def pub_rebuild():
    print datetime.datetime.now()
    print 'pub_rebuild'
    pub_db = mysql_new.BaseDB(config.MYSQL_DEFINE_PUB)
    schema = Schema(
        uid=ID(stored=True, unique=True),
        title=TEXT(stored=True, analyzer=analyzer_zhongwen),
        pinyin_title=TEXT(stored=True, analyzer=analyzer_pinyin),
        icon_url=ID(stored=True),
        description=STORED,
        v_status=NUMERIC(stored=True),
    )
    SQL = '''SELECT `uid`, `title`, `icon_url`, `description`, `v_status`
             FROM `pp_category_info`
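# A minimal sketch (assumption, not part of the original module) of how an index
# built with the schema above could be queried across both the Chinese title
# field and the pinyin field. `ix` stands for a hypothetical opened index.
from whoosh.qparser import MultifieldParser

def search_titles(ix, text):
    parser = MultifieldParser(["title", "pinyin_title"], schema=ix.schema)
    query = parser.parse(text)
    with ix.searcher() as searcher:
        return [hit["title"] for hit in searcher.search(query, limit=10)]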
def __init__(self, stored=False, unique=False, expression=None):
    expression = expression or re.compile(r"[^\r\n\t ,;]+")
    analyzer = RegexAnalyzer(expression=expression)
    self.format = Existence(analyzer=analyzer)
    self.stored = stored
    self.unique = unique