def test_escape(self):
    """Assert the expected ``pyndri.escape`` output for punctuated inputs.

    Covers strings containing parentheses, a period and a colon; each is
    expected to come back as ``'hello world'``.
    """
    expected = 'hello world'
    raw_inputs = (
        'hello (world)',
        'hello.world',
        'hello:world',
    )
    # Checked in order; the first mismatch fails the test.
    for raw in raw_inputs:
        self.assertEqual(pyndri.escape(raw), expected)
def test_tokenize(self):
    """Assert the expected ``pyndri.tokenize`` output for several inputs.

    Covers whitespace- and hyphen-separated words, an unescaped period,
    quoted words, an unescaped parenthesised word (expected to raise
    ``OSError``) and inputs passed through ``pyndri.escape`` first.
    """
    hello_world = ('hello', 'world')

    self.assertEqual(
        pyndri.tokenize('hello world foo bar'),
        ('hello', 'world', 'foo', 'bar'))
    self.assertEqual(pyndri.tokenize('hello-world'), hello_world)

    # Without escaping, only the part before the period is expected back.
    self.assertEqual(pyndri.tokenize('hello.world'), ('hello', ))
    self.assertEqual(
        pyndri.tokenize(pyndri.escape('hello.world')), hello_world)

    self.assertEqual(pyndri.tokenize('hello "world"'), hello_world)

    # Unescaped parentheses are expected to be rejected by the tokenizer.
    with self.assertRaises(OSError):
        pyndri.tokenize('hello (world)')

    self.assertEqual(
        pyndri.tokenize(pyndri.escape('hello \'world\'')), hello_world)
    self.assertEqual(
        pyndri.tokenize(pyndri.escape('hello/world')), hello_world)
def search():
    """Run a query against the index and render the results page.

    Query-string parameters read from ``request.args``:

    * ``q`` -- the query. A value starting with ``docid:`` is treated as a
      lookup of a single document by its external identifier; anything else
      is passed to a ``pyndri.QueryEnvironment``.
    * ``smoothing_method`` -- defaults to ``'dirichlet'``.
    * ``smoothing_param`` -- float, defaults to ``1000.0``.
    * ``results_requested`` -- int, defaults to ``10``.

    Malformed numeric parameters fall back to their defaults instead of
    raising ``ValueError``.

    Returns:
        The rendered ``index.html`` template. ``results`` is a list of
        ``(external_doc_id, text)`` tuples, where tokens that also occur in
        the query are wrapped in ``<strong>`` tags.
    """
    index, dictionary = get_index()

    query_string = request.args.get('q', None)
    smoothing_method = request.args.get('smoothing_method', 'dirichlet')
    # With ``type=``, an unparsable value yields the default rather than an
    # exception. The default itself is not converted, hence the float literal.
    smoothing_param = request.args.get('smoothing_param', 1000.0, type=float)
    results_requested = request.args.get('results_requested', 10, type=int)

    documents = []

    if query_string is not None:
        logging.info('Query string: %s', query_string)

        docid_prefix = 'docid:'
        is_docid_lookup = query_string.startswith(docid_prefix)

        # Token ids of the query terms; only collected for real queries.
        highlighted_token_ids = set()
        if not is_docid_lookup:
            for token in index.tokenize(pyndri.escape(query_string)):
                if dictionary.has_token(token):
                    highlighted_token_ids.add(
                        dictionary.translate_token(token))

        def _format_token(token_id):
            # NOTE(review): the term is placed inside markup without HTML
            # escaping -- confirm how the template renders this text.
            term = dictionary[token_id]
            if token_id in highlighted_token_ids:
                term = '<strong>{}</strong>'.format(term)
            return term

        def _include_document(int_doc_id):
            ext_doc_id, doc_token_ids = index.document(int_doc_id)
            # Non-positive token ids are rendered as a placeholder.
            doc_tokens = [
                _format_token(token_id) if token_id > 0 else '<unk>'
                for token_id in doc_token_ids
            ]
            documents.append((ext_doc_id, ' '.join(doc_tokens)))

        if is_docid_lookup:
            ext_document_id = query_string[len(docid_prefix):]
            lookup = dict(index.document_ids([ext_document_id]))
            # An empty lookup leaves the result list empty.
            if lookup:
                _include_document(lookup[ext_document_id])
        else:
            smoothing_rule = build_smoothing_rule(
                smoothing_method, smoothing_param)
            query_env = pyndri.QueryEnvironment(
                index, rules=(smoothing_rule, ))
            results = query_env.query(
                query_string, results_requested=results_requested)
            for int_doc_id, _ in results:
                _include_document(int_doc_id)

    return render_template(
        'index.html',
        query=query_string,
        results=documents,
        smoothing_method=smoothing_method,
        smoothing_param=smoothing_param)
# Preprocess every topic and write them out as <query> elements of a
# retrieval parameter file named after the collection.
tokenizer = MosesTokenizer()
# Raw string so the backslash escapes reach the regex engine without
# triggering invalid-escape warnings; the pattern itself is unchanged.
# NOTE(review): every part of this pattern is optional, so it matches the
# empty string and ``prog.match(t)`` is truthy for every term. Periods are
# therefore removed from all terms, not only acronym-like ones. Confirm the
# intent before tightening it, since that would change the generated queries.
prog = re.compile(r"[_\-\(]*([A-Z]\.)*[_\-\(]*")

tops = {}
for top in topics:
    terms = topics[top].split()
    # Every term is prefixed with a single space, so a non-empty topic text
    # starts with a space before it is escaped and tokenized.
    toptext = "".join(
        " " + (t.replace('.', '') if prog.match(t) else t) for t in terms
    )
    toptext = escape(toptext)
    tops[top] = tokenizer.tokenize(toptext, return_str=True)

# Rebind ``topics`` to the processed texts, ordered by topic key.
topics = collections.OrderedDict(sorted(tops.items()))

output_path = join(
    args["<outputfolder>"],
    "RetrievalParameterFile_{name}.xml".format(name=args["<collection_name>"]),
)
# The file is opened only once the topics are ready, and the context manager
# guarantees it is flushed and closed even if a write fails.
with open(output_path, 'w') as outputFile:
    outputFile.write("<parameters>\n")
    for t in topics:
        print("topic : {t}".format(t=t))
        outputFile.write(" <query>\n <type>indri</type>\n")
        outputFile.write(" <number>{num}</number>\n".format(num=int(t)))
        outputFile.write(" <text>\n")
        outputFile.write(" {txt}\n".format(txt=topics[t]))
        outputFile.write(" </text>\n")
        outputFile.write(" </query>\n")
    outputFile.write("</parameters>")
print("\nEnded.")