def test():
    """Smoke-test basic RediSearch operations: create, add, search.

    Relies on ``Client``, ``TextField`` and ``Query`` from the redisearch
    package being in scope at module level.
    """
    # Creating a client with a given index name
    client = Client('myIndex')

    # Recreate the index definition and schema from scratch
    client.drop_index()
    client.create_index([TextField('title', weight=5.0), TextField('body')])

    # Indexing a document
    client.add_document(
        'doc1',
        title='RediSearch',
        body='Redisearch implements a search engine on top of redis')

    # Simple search
    res = client.search("search engine")

    # The result has the total number of results, and a list of documents.
    # Fixed: print statements (Python 2 only) -> print() calls, which behave
    # identically for a single argument on both Python 2 and 3.
    print(res.total)  # "1"
    print(res.docs[0])

    # Searching with snippets
    # res = client.search("search engine", snippet_sizes={'body': 50})

    # Searching with complex parameters:
    q = Query("search engine").verbatim().no_content().paging(0, 5)
    res = client.search(q)
class RediSearchClient(object):
    """Thin wrapper around a RediSearch ``Client`` for a line-doc corpus."""

    def __init__(self, index_name):
        self.client = Client(index_name)
        self.index_name = index_name

    def build_index(self, line_doc_path, n_docs):
        """Index at most *n_docs* documents from the line-doc file.

        Each document exposes ``title``, ``url`` and ``body`` text fields;
        ``nosave=True`` indexes the fields without storing the document body.
        """
        line_pool = LineDocPool(line_doc_path)

        # Drop any stale index first; drop_index raises when the index does
        # not exist yet, which is fine to ignore.
        # Fixed: bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        try:
            self.client.drop_index()
        except Exception:
            pass

        self.client.create_index(
            [TextField('title'), TextField('url'), TextField('body')])

        for i, d in enumerate(line_pool.doc_iterator()):
            self.client.add_document(
                i, nosave=True, title=d['doctitle'], url=d['url'], body=d['body'])
            if i + 1 == n_docs:
                break
            if i % 1000 == 0:
                # Progress indicator; single-argument print() works on py2 and py3.
                print("{}/{} building index".format(i, n_docs))

    def search(self, query):
        """Run *query* verbatim and return the first page (up to 5 hits)."""
        q = Query(query).paging(0, 5).verbatim()
        res = self.client.search(q)
        return res
def create_website_items_index():
    "Creates Index Definition."
    # CREATE index
    client = Client(make_key(WEBSITE_ITEM_INDEX), conn=frappe.cache())

    # Drop a pre-existing index, if any.
    try:
        client.drop_index()
    except ResponseError:
        # Most likely the index does not exist yet; nothing to drop.
        pass
    except Exception:
        raise_redisearch_error()

    idx_def = IndexDefinition([make_key(WEBSITE_ITEM_KEY_PREFIX)])

    # Fields configured in e-commerce settings drive the schema.
    raw_fields = frappe.db.get_single_value("E Commerce Settings", "search_index_fields")
    field_names = raw_fields.split(",") if raw_fields else []
    if "web_item_name" in field_names:
        field_names.remove("web_item_name")
    extra_fields = [to_search_field(name) for name in field_names]

    schema = [TextField("web_item_name", sortable=True)] + extra_fields
    client.create_index(schema, definition=idx_def)

    reindex_all_web_items()
    define_autocomplete_dictionary()
class SearchDemo:
    """Demo: builds a RediSearch index over article metadata."""

    def __init__(self, args):
        # `args` is expected to carry .index, .host and .port (argparse-style).
        self.index = args.index
        self.client = Client(self.index, host=args.host, port=args.port)

    def create(self):
        """(Re)create the index, dropping any existing one first."""
        # Fixed: bare `except:` -> `except Exception:` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
            self.client.drop_index()
        except Exception:
            # No existing index to drop.
            pass
        self.client.create_index([
            NumericField('WORDCOUNT', sortable=True),
            TextField('BYLINE', no_stem=True, sortable=True),
            TextField('DOCUMENTTYPE', sortable=True),
            TextField('HEADLINE', sortable=True),
            TagField('KEYWORDS', separator=';'),
            NumericField('MULTIMEDIA', sortable=True),
            TextField('NEWDESK', sortable=True),
            NumericField('PRINTPAGE', sortable=True),
            NumericField('PUBDATE', sortable=True),
            TextField('SECTIONNAME', sortable=True),
            TextField('SNIPPET', sortable=True),
            TextField('TYPEOFMATERIAL', sortable=True),
            TextField('WEBURL')
        ])
class TAS_Import():
    """Imports name/age/location records into a RediSearch index."""

    def __init__(self, index_name, host="172.16.20.7", port=6382, db=0):
        # NOTE(review): `db` is accepted but unused here — kept for
        # interface compatibility with callers.
        self.client = Client(index_name, host, port)
        self.host = host
        self.port = port
        self.index_name = index_name
        self.redis = Redis()

    def add_indexing(self, schema):
        """Create the index; the trailing empty list disables stopword removal."""
        self.client.create_index(schema, False, False, [])
        return ["Done"]

    def add_data(self, data):
        """Add one document per record; document ids are 1-based positions."""
        for i, rr in enumerate(data):
            index = i + 1
            # Single-argument print() behaves the same on Python 2 and 3
            # (fixed from a py2-only print statement).
            print(rr)
            name, age, location = rr['name'], rr['age'], rr['location']
            self.client.add_document(index, NAME=name, AGE=age, LOCATION=location)
        return ["Done"]

    def drop_index(self):
        """Drop the index, ignoring the error raised when it does not exist."""
        # Fixed: bare `except:` -> `except Exception:`.
        try:
            self.client.drop_index()
        except Exception:
            pass
def create_website_items_index():
    "Creates Index Definition."
    # CREATE index
    client = Client(make_key(WEBSITE_ITEM_INDEX), conn=frappe.cache())

    # DROP if already exists
    try:
        client.drop_index()
    except Exception:
        pass

    idx_def = IndexDefinition([make_key(WEBSITE_ITEM_KEY_PREFIX)])

    # Schema is driven by the fields chosen in e-commerce settings.
    configured = frappe.db.get_single_value('E Commerce Settings', 'search_index_fields')
    names = configured.split(',') if configured else []
    if 'web_item_name' in names:
        names.remove('web_item_name')
    extra = [to_search_field(n) for n in names]

    client.create_index(
        [TextField("web_item_name", sortable=True)] + extra,
        definition=idx_def,
    )

    reindex_all_web_items()
    define_autocomplete_dictionary()
def build_ipa_index():
    """Build the `IPAIndex` RediSearch index from indicepa.gov.it open data.

    Downloads the `amministrazioni.txt` and `ou.txt` TSV feeds, recreates the
    index, then indexes administrations (Italian language, replace mode) and
    patches in RTD office data via partial updates.
    """
    start_time = time.time()
    rc = redis.Redis(password=os.environ.get('REDIS_PASSWORD', ''))
    rs_client = Client('IPAIndex', conn=rc)

    print(
        'Getting file `amministrazioni.txt` from https://www.indicepa.gov.it',
        flush=True)
    ipa_index_amm_url = 'https://www.indicepa.gov.it/public-services/opendata-read-service.php?dstype=FS&filename=amministrazioni.txt'
    ipa_index_amm = pd.read_csv(ipa_index_amm_url, sep='\t', dtype=str)

    print('Getting file `ou.txt` from https://www.indicepa.gov.it', flush=True)
    ipa_index_ou_url = 'https://www.indicepa.gov.it/public-services/opendata-read-service.php?dstype=FS&filename=ou.txt'
    ipa_index_ou = pd.read_csv(ipa_index_ou_url,
                               sep='\t',
                               na_values=['da_indicare', '*****@*****.**'],
                               dtype=str)
    # Keep only the "digital transition office" organisational units.
    ipa_index_ou = ipa_index_ou.loc[lambda ipa_index_ou: ipa_index_ou['cod_ou']
                                    == 'Ufficio_Transizione_Digitale']

    # Fixed: bare `except:` -> `except Exception:` (intent unchanged: the
    # index may simply not exist yet).
    try:
        rs_client.drop_index()
    except Exception:
        pass  # Index already dropped

    rs_client.create_index([
        TextField('ipa_code', weight=2.0),
        TextField('name', weight=2.0, sortable=True),
        TextField('site'),
        TextField('pec'),
        TextField('city', weight=1.4),
        TextField('county'),
        TextField('region'),
        TagField('type'),
        TextField('rtd_name'),
        TextField('rtd_pec'),
        TextField('rtd_mail'),
    ])
    print('Created index `IPAIndex`', flush=True)

    print('Feeding `IPAIndex` with data from `amministrazioni.txt`', flush=True)
    for index, row in ipa_index_amm.iterrows():
        rs_client.add_document(row['cod_amm'],
                               language='italian',
                               replace=True,
                               **get_ipa_amm_item(row))

    print('Feeding `IPAIndex` with data from `ou.txt`', flush=True)
    for index, row in ipa_index_ou.iterrows():
        # partial=True: update only the RTD fields of the existing document.
        rs_client.add_document(row['cod_amm'],
                               partial=True,
                               **get_ipa_rtd_item(row))

    finish_time = time.time()
    print('`IPAIndex` build completed in {0} seconds'.format(
        round(finish_time - start_time, 2)),
        flush=True)
class TAS_Import():
    """Builds a per-document RediSearch index of page text rows."""

    def __init__(self, index_name, host=ip, port=port, db=db):
        self.client = Client(index_name, host, port)
        self.host = host
        self.port = port
        #self.redis = Redis()

    def add_indexing_schema(self, schema):
        # Trailing empty list: do not ignore default stopwords.
        self.client.create_index(schema, False, False, [])
        return ["Done"]

    def add_data(self, rdata, company, doc_id, project):
        # One document per row; the key encodes doc, company, position, project.
        for row_num, row in enumerate(rdata):
            key = doc_id + company + "CMDIC" + str(row_num + 1) + project
            text, page, bbox = row
            text = config_obj.StringEscape(text)
            self.client.add_document(key, DATA=text, PAGE=page, BBOX=bbox)
        return ["Done"]

    def drop_index(self):
        # Best-effort: index may not exist yet.
        try:
            self.client.drop_index()
        except Exception as e:
            #print 'Error',e
            pass

    def start(self, data, doc_id, company, project):
        """Recreate the per-document index and load *data* into it."""
        status = 1
        index_name = project + "_DOCUMENT_" + str(doc_id)
        self.drop_index()
        self.client = Client(index_name, self.host, self.port)
        status = 2
        schema = [
            NumericField('INDEX'),
            TextField('DATA'),
            TextField('PAGE'),
            TextField('BBOX')
        ]
        status = 3
        self.add_indexing_schema(schema)
        status = 4
        self.add_data(data, company, doc_id, project)
        status = 5
        return [status]
class SearchDemo:
    """Demo: builds a RediSearch index over sales-order records."""

    def __init__(self, args):
        # `args` is expected to carry .index, .host and .port (argparse-style).
        self.index = args.index
        self.client = Client(self.index, host=args.host, port=args.port)

    def create(self):
        """(Re)create the sales index, dropping any existing one first."""
        # Fixed: bare `except:` -> `except Exception:` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
            self.client.drop_index()
        except Exception:
            # No existing index to drop.
            pass
        self.client.create_index([
            NumericField('ORDERNUMBER'),
            NumericField('QUANTITYORDERED', sortable=True),
            NumericField('PRICEEACH', sortable=True),
            NumericField('ORDERLINENUMBER'),
            NumericField('SALES', sortable=True),
            TextField('ORDERDATE'),
            TextField('STATUS', sortable=True),
            NumericField('QTR_ID', sortable=True),
            NumericField('MONTH_ID', sortable=True),
            NumericField('YEAR_ID', sortable=True),
            TextField('PRODUCTLINE', sortable=True),
            NumericField('MSRP', sortable=True),
            TextField('PRODUCTCODE', sortable=True),
            TextField('CUSTOMERNAME', sortable=True),
            TextField('PHONE'),
            TextField('ADDRESSLINE1'),
            TextField('ADDRESSLINE2'),
            TextField('CITY', sortable=True),
            TextField('STATE', sortable=True),
            TextField('POSTALCODE', sortable=True),
            TextField('COUNTRY', sortable=True),
            TextField('TERRITORY', sortable=True),
            TextField('CONTACTLASTNAME'),
            TextField('CONTACTFIRSTNAME'),
            TextField('DEALSIZE', sortable=True)
        ])
def refresh_search_keys(request):
    """Rebuild the product search index and autocompleter (staff only)."""
    # Guard clause: only authenticated staff may trigger a rebuild.
    if not (request.user.is_authenticated() and request.user.is_staff):
        return JsonResponse({'success': False})

    client = Client('productIndex')
    total_old_docts = client.info()['num_docs']  # doc count before the drop
    delete_status = client.drop_index()

    new_index = False
    if delete_status == 'OK':
        new_index = create_product_search_index()

    auto_completer = AutoCompleter('productAutocompleter')
    auto_completer_old_count = auto_completer.len()
    create_product_autocompleter()
    auto_completer_new_count = auto_completer.len()

    return JsonResponse({'success': True})
def open_redis():
    """Download NVD CVE/CPE feeds, index CVEs into RediSearch, cache CPE matches.

    Starts a dockerised redisearch server, downloads the yearly NVD JSON
    feeds (2002-2021 plus the CPE match feed), indexes every CVE into the
    'cveIndex' index, and stores CPE match ranges as plain redis keys.
    """
    if not os.path.isdir('./nvd_data_feeds/'):
        os.mkdir('./nvd_data_feeds/')

    print('Creating the docker container with redislabs/redisearch\n')
    Popen([
        'docker', 'run', '--rm', '--name', 'amadeus', '-p', '6379:6379',
        'redislabs/redisearch:latest'
    ])
    sleep(6)  # give the container time to start accepting connections

    urls = [
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2021.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2020.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2019.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2018.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2017.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2016.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2015.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2014.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2013.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2012.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2011.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2010.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2009.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2008.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2007.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2006.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2005.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2004.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2003.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2002.json.zip',
        'https://nvd.nist.gov/feeds/json/cpematch/1.0/nvdcpematch-1.0.json.zip'
    ]

    print('\nDownloading and unziping json feeds')
    if not os.path.isdir('./downloads/'):
        os.mkdir('./downloads/')
    tam = len(urls)
    dl = 0
    for url in urls:
        name = url.split('/')[-1]
        response = get(url)
        # Fixed: use a context manager so the archive file handle is closed
        # before ZipFile re-opens it (the original leaked the handle).
        with open('./downloads/' + name, 'wb') as archive:
            archive.write(response.content)
        with ZipFile('./downloads/' + name, 'r') as zip_ref:
            zip_ref.extractall('./nvd_data_feeds/')
        dl += 1
        prog = dl / tam
        done = int(50 * prog)
        stdout.write('\r[%s%s%s]%s' % ('Progres > ', '=' * (done - 1) + '>',
                                       ' ' * (50 - done),
                                       str(round(prog * 100)) + '%'))
    rmtree('./downloads/')
    print('\n')

    print('Start processing CVE feeds')
    # Create a normal redis connection
    conn = redis.Redis('localhost')
    # Creating a client with a given index name
    client = Client('cveIndex')
    # IndexDefinition is avaliable for RediSearch 2.0+
    definition = IndexDefinition(prefix=['cve:'])
    # Creating the index definition and schema.
    # Fixed: bare `except:` -> `except Exception:`.
    try:
        client.create_index((TextField('id'), TextField('description'),
                             TextField('configurations')),
                            definition=definition)
    except Exception:
        # Index already exists. Delete and recreate
        client.drop_index()
        print('Index already exists\nDropping\nDelete keys and try again')
        exit()

    def process_CVE_file(file):
        # Index every CVE item of one yearly feed file.
        with open(file, 'r', encoding='utf8') as f:
            json = ujson.decode(f.read())
            cve_items = json['CVE_Items']
            for cve_item in cve_items:
                cve_id = cve_item['cve']['CVE_data_meta']['ID']
                cve_desc = cve_item['cve']['description']['description_data'][
                    0]['value']
                cve_configurations = str(cve_item['configurations']['nodes'])
                # Sanitizing special characters to prevent them from being tokenized away
                cve_desc_sanitized = cve_desc.replace(':', 'cc11').replace(
                    '.', 'pp22').replace('*', 'ss33')
                cve_configurations_sanitized = cve_configurations.replace(
                    ':', 'cc11').replace('.', 'pp22').replace('*', 'ss33')
                # Indexing a document for RediSearch 2.0+
                client.redis.hset('cve:' + cve_id,
                                  mapping={
                                      'id': cve_id,
                                      'description': cve_desc_sanitized,
                                      'configurations':
                                      cve_configurations_sanitized
                                  })
        print('Processed ' + file)

    with ThreadPoolExecutor(max_workers=20) as pool:
        futures = []
        # Fixed off-by-one: range(2002, 2021) stopped at 2020 and never
        # processed the downloaded 2021 feed; the end bound is exclusive.
        for i in range(2002, 2022):
            future = pool.submit(
                process_CVE_file,
                './nvd_data_feeds/nvdcve-1.1-{0}.json'.format(i))
            futures.append(future)
        json_list = [x.result() for x in as_completed(futures)]

    print('Done processing CVE feeds\nProcessing NVD CPE match feed')
    with open('./nvd_data_feeds/nvdcpematch-1.0.json', 'r',
              encoding='utf8') as f:
        json = ujson.decode(f.read())
        matches = json['matches']
        for match in matches:
            rootUri = match['cpe23Uri']
            keyName = rootUri
            # Append any version-range qualifiers present on the match.
            if 'versionStartIncluding' in match:
                keyName += ';;versionStartIncluding=' + match[
                    'versionStartIncluding']
            if 'versionStartExcluding' in match:
                keyName += ';;versionStartExcluding=' + match[
                    'versionStartExcluding']
            if 'versionEndIncluding' in match:
                keyName += ';;versionEndIncluding=' + match[
                    'versionEndIncluding']
            if 'versionEndExcluding' in match:
                keyName += ';;versionEndExcluding=' + match[
                    'versionEndExcluding']
            if len(match['cpe_name']) > 0:
                # if CPE list is empty no need to include it in cache
                valueString = ';;'.join(x['cpe23Uri'] for x in match['cpe_name'])
                conn.set(keyName, valueString)
    print('\nAMADEUS is already launched!')
def delete_index(cls):
    """Drop the "tower" RediSearch index on the configured redis host."""
    redis_host = os.getenv('REDIS_HOST')
    Client("tower", port=6379, host=redis_host).drop_index()
# Probe the server: client.info() raises "Unknown Index name" when the index
# simply does not exist yet (acceptable); any other error message is taken to
# mean the redis server lacks the RediSearch module.
try:
    client.info()
except Exception as e:
    if e.args[0] != "Unknown Index name":
        print("You must be running a redis server with the redisearch module installed")
        exit()

# IndexDefinition is avaliable for RediSearch 2.0+
definition = IndexDefinition(prefix=['cve:'])

# Creating the index definition and schema
try:
    client.create_index((TextField("id"), TextField("description"), TextField("configurations")), definition=definition)
except:
    # Index already exists. Delete and recreate
    client.drop_index()
    print("Index already exists. Dropping. Delete keys and try again.")
    exit()


def process_CVE_file(file):
    # Parse one yearly NVD CVE feed and sanitize each item's text fields.
    # NOTE(review): the body visible in this chunk ends after sanitization —
    # the actual write into the index appears to live outside this view.
    with open(file, 'r', encoding="utf8") as f:
        json = ujson.decode(f.read())
        cve_items = json['CVE_Items']
        for cve_item in cve_items:
            cve_id = cve_item['cve']['CVE_data_meta']['ID']
            cve_desc = cve_item['cve']['description']['description_data'][0]['value']
            cve_configurations = str(cve_item['configurations']['nodes'])
            # Sanitizing special characters to prevent them from being tokenized away
            cve_desc_sanitized = cve_desc.replace(':','cc11').replace('.','pp22').replace('*','ss33')
            cve_configurations_sanitized = cve_configurations.replace(':','cc11').replace('.','pp22').replace('*','ss33')
            # Indexing a document for RediSearch 2.0+
class Indexer:
    """
    Scrapes repos found on awesome lists.

    Inserts repo data into Redis.

    **urls**: List of Github awsome lists URLs.
    """

    def __init__(self, urls: List[str], max_per_list: int = MAX_RES_PER_LIST):
        self.urls = urls
        self.client = Client(INDEX_NAME,
                             host=REDIS_HOST,
                             port=REDIS_PORT,
                             password=REDIS_PASSWORD)
        self.keys = Keys(KEY_PREFIX)
        self.max = max_per_list

    def create_index_definition(self, drop_existing=False):
        """
        Create an index definition. Do nothing if it already exists.
        """
        if drop_existing:
            self.client.drop_index()
        definition = IndexDefinition(prefix=[self.keys.pre("resource:")])
        try:
            self.client.create_index([
                TextField('body', weight=1),
                TextField('repo_name', weight=1.5),
                TextField('language', weight=1),
                TextField('lists')
            ], definition=definition)
        except ResponseError:
            print("Index already exists.")

    def index(self):
        """
        Insert scraped resources into Redis.
        """
        for url in self.urls:
            parent = RepoScraper(url)
            print(f"Creating index for {parent.repo}")
            self.client.redis.sadd(self.keys.awesome_list_list(), parent.repo)
            resources = AwesomeScrape(url).scrape(max_num=self.max)

            # Create set of all awesome lists a repo appears on.
            # Required to set the tag field.
            for resource in resources:
                try:
                    self.client.redis.sadd(
                        self.keys.github_repo_lists(resource['owner']['login'],
                                                    resource['name']),
                        parent.repo)
                except (KeyError, DataError):
                    pass

            for resource in resources:
                try:
                    if resource['language'] is not None:
                        language = resource['language']
                        self.client.redis.sadd(self.keys.language_list(),
                                               language)
                    else:
                        language = ''
                    lists = self.client.redis.smembers(
                        self.keys.github_repo_lists(resource['owner']['login'],
                                                    resource['name']))
                    self.client.redis.hset(
                        self.keys.github_repo(resource['owner']['login'],
                                              resource['name']),
                        mapping={
                            'repo_name': resource['name'],
                            'lists': ", ".join(lists),
                            'body': resource['description'],
                            'stargazers_count': resource['stargazers_count'],
                            'language': language,
                            'svn_url': resource['svn_url']
                        })
                except (KeyError, DataError):
                    # Fixed: stray "f" inside the f-string produced
                    # "Resource missing data: f{...}" in the log output.
                    print(f"Resource missing data: {resource}")
class TAS_Redisearch(): #Constructor def __init__(self, table_name, host="localhost", port=6381): try: self.client = Client(table_name, host, port) self.host = host self.port = port self.table_name = table_name self.redis = Redis() self.LIMIT = 10 except Exception as e: print 'yyy' print >> sys.stderr, "TAS_Redisearch Error inside Constructor Index:\'", table_name, "\' HOST:\'", host, "\' PORT:\'", port, "\'\n" print >> sys.stderr, e #Will set the no of results to show def set_result_limit(self, num): self.LIMIT = num return #Defines the schema for Redisearch def set_schema(self, schema): try: return self.client.create_index( schema, False, False, [] ) #last empty list will ensure that default stopwords will not be ignored except Exception as e: print >> sys.stderr, "TAS_Redisearch Error inside set_schema Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, "\'\n" print >> sys.stderr, e #Deletes index(table) def drop_index(self): try: return self.client.drop_index() except Exception as e: print >> sys.stderr, "TAS_Redisearch Error inside drop_index Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, "\'\n" print >> sys.stderr, e #Deletes a document(row) by document_index def delete_document(self, document_index): try: return self.client.delete_document(document_index) except Exception as e: print >> sys.stderr, "TAS_Redisearch Error inside delete_document Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, "\'\n" print >> sys.stderr, e #############################################SEARCHES BELOW####################################### #Uses python libraries def py_search(self, query, result_limit=-1): if result_limit == -1: result_limit = self.LIMIT try: return self.client.search(Query(query).paging(0, result_limit)) except Exception as e: print >> sys.stderr, "TAS_Redisearch Error inside py_search Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, "\'\n" print 
sys.stderr, e #Search with default parameters [will return dictionary] def generic_search(self, search_text, result_limit=-1): if result_limit == -1: result_limit = self.LIMIT query_string = "FT.SEARCH " + self.table_name + " " + search_text + " LIMIT 0 " + str( result_limit) try: res = self.redis.execute_command(query_string) return Result(res, True) except Exception as e: print >> sys.stderr, "TAS_Redisearch Error inside generic_search Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, "\'\n" print >> sys.stderr, e def free_exact_search(self, key, result_limit=-1): org_key = key l = [] try: if result_limit == -1: result_limit = self.LIMIT key = self.clean_string(key) returned = self.py_search("*", result_limit) for result in returned.docs: result_dict = vars(result) if org_key in result_dict.values(): l.append(result_dict) except Exception as e: print >> sys.stderr, "TAS_Redisearch Error inside value_search Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, "\'\n" print >> sys.stderr, e return l #{fieldname:[value1, value2], fieldname:[value1, value2]} def exact_search(self, input_dict, result_limit=-1): formed_str = "" l = [] for field, value_list in input_dict.items(): formed_str += "@" + field + ":(" for key in value_list: key = self.clean_string(key) formed_str += "(\'" + key + "\') | " formed_str = formed_str.rstrip(' |') formed_str += ") " print "PASSED: ", formed_str returned = self.py_search(formed_str, result_limit) print "RETURNED:", returned for result in returned.docs: result_dict = vars(result) for itr, ktr in input_dict.items(): if result_dict[itr] in ktr: l.append(result_dict) return l #Search with the passed query def custom_search(self, query_string): try: res = self.redis.execute_command(query_string) return Result(res, True) except Exception as e: print >> sys.stderr, "TAS_Redisearch Error inside custom_search Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, 
"\'\n" print >> sys.stderr, e #Search in 'search_in_field' [if any of the element in 'list_to_union' is found then include it in the result def union_search(self, list_to_union, search_in_field): query_string = "FT.SEARCH " + self.table_name + " " union_text = "@" + search_in_field + ":(" for text in list_to_union: union_text += text + "|" union_text = union_text.rstrip("|") union_text += ")" query_string += union_text try: res = self.redis.execute_command(query_string) return Result(res, True) except Exception as e: print >> sys.stderr, "TAS_Redisearch Error inside union_search Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, "\'\n" print >> sys.stderr, e #will return all the dictionary for all the categories if no arguments are passed def category_taxonomy_dict(self, category='*'): try: cat_taxo_dict = {} total_docs = self.client.info()['num_docs'] query_string = "" if category == '*': query_string = category else: query_string = "@CATEGORY:" + category result = self.py_search(query_string, total_docs) for single_result in result.docs: try: category = single_result.CATEGORY taxoname = single_result.TAXONAME except Exception as ex: pass if not category in cat_taxo_dict: cat_taxo_dict[category] = [] elif taxoname not in cat_taxo_dict[category]: cat_taxo_dict[category].append(taxoname) except Exception as e: sys.stderr, "TAS_Redisearch Error inside category_taxonomy_dict Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, "\'\n" print >> sys.stderr, e return cat_taxo_dict def total_record(self): try: return int(self.client.info()['num_docs']) except Exception as e: sys.stderr, "TAS_Redisearch Error inside total_records Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, "\'\n" print >> sys.stderr, e def get_all_records(self): try: total = str(self.total_record()) res = self.redis.execute_command("FT.SEARCH " + self.table_name + " * LIMIT 0 " + total) return Result(res, True) except 
Exception as e: sys.stderr, "TAS_Redisearch Error inside total_records Index:\'", self.table_name, "\' HOST:\'", self.host, "\' PORT:\'", self.port, "\'\n" print >> sys.stderr, e def clean_string(self, key): key = key.replace(',', ' ') key = key.replace('.', ' ') key = key.replace('<', ' ') key = key.replace('>', ' ') key = key.replace('{', ' ') key = key.replace('}', ' ') key = key.replace('[', ' ') key = key.replace(']', ' ') key = key.replace('"', ' ') key = key.replace('\'', ' ') key = key.replace(':', ' ') key = key.replace(';', ' ') key = key.replace('!', ' ') key = key.replace('@', ' ') key = key.replace('#', ' ') key = key.replace('$', ' ') key = key.replace('%', ' ') key = key.replace('^', ' ') key = key.replace('&', ' ') key = key.replace('*', ' ') key = key.replace('(', ' ') key = key.replace(')', ' ') key = key.replace('-', ' ') key = key.replace('+', ' ') key = key.replace('=', ' ') key = key.replace('~', ' ') return key
class TAS_Import():
    #Python 2 loader: imports section/grid rows into a per-run RediSearch index.

    def __init__(self, index_name, host="172.16.20.7", port=6382, db=0):
        self.client = Client(index_name, host, port)
        self.host = host
        self.port = port
        #Escaping helper configured against the same redis instance
        self.config_obj = redis_config.TAS_AutoCompleter(
            host, port, db, "Default")
        #self.redis = Redis()

    #Create the index; the trailing empty list keeps default stopwords indexed
    def add_indexing_schema(self, schema):
        self.client.create_index(schema, False, False, [])
        return ["Done"]

    #Each row rr is a 9-tuple:
    #(data, section_type, docid, page, gridid, rowcol, bbox, rowspan, colspan).
    #The document key concatenates position and identifying fields, so the
    #same row content on a different page/grid yields a distinct key.
    def add_data(self, rdata, index_name):
        for i, rr in enumerate(rdata):
            #print rr,type(rr[2])
            l1, l2, l3, l4, l5, l6, l7, l8, l9 = rr
            index = index_name + str(i + 1) + l3 + l4 + l5 + l6
            #print 'index_name', index_name, index, l3, l4, l5, l6
            l1 = self.config_obj.StringEscape(l1)
            l2 = l2.strip()
            self.client.add_document(index,
                                     DATA=l1,
                                     SECTION_TYPE=l2,
                                     DOCID=l3,
                                     PAGE=l4,
                                     GRIDID=l5,
                                     ROWCOL=l6,
                                     BBOX=l7,
                                     PAGE_GRID_SE="%s_%s_%s" % (l4, l5, l2),
                                     Rowspan=l8,
                                     Colspan=l9)
        return ["Done"]

    #Best-effort drop: the index may not exist yet
    def drop_index(self):
        try:
            self.client.drop_index()
        except Exception as e:
            print 'Error', e
            pass

    #Recreate the index for *index_name* and load *data* into it.
    #`status` tracks how far the pipeline got; the final [5] means success.
    def start(self, data, index_name):
        status = 1
        self.drop_index()
        self.client = Client(index_name, self.host, self.port)
        status = 2
        schema = [
            NumericField('INDEX'),
            TextField('DATA'),
            TextField('SECTION_TYPE'),
            TextField('DOCID'),
            TextField('PAGE'),
            TextField('GRIDID'),
            TextField("ROWCOL"),
            TextField('BBOX'),
            TextField("PAGE_GRID_SE"),
            TextField('Rowspan'),
            TextField('Colspan')
        ]
        #rsObj.set_schema([NumericField('INDEX'), TextField('DOCID'), TextField('CATEGORY'), TextField('TAXONAME'), TextField('VALUE'), TextField('XML_REF'), TextField('REF_KEY')])
        status = 3
        self.add_indexing_schema(schema)
        status = 4
        self.add_data(data, index_name)
        status = 5
        return [status]