def __init__(self, lucene_index_path: str, min_df: int = 1, verbose: bool = False):
    self.min_df: int = min_df
    self.verbose: bool = verbose
    self.index_reader = index.IndexReader(lucene_index_path)
    self.searcher = search.LuceneSearcher(lucene_index_path)
    self.num_docs: int = self.searcher.num_docs
    self.stats = self.index_reader.stats()
    self.analyzer = Analyzer(get_lucene_analyzer())

    # Build vocabulary: keep only terms whose document frequency exceeds min_df.
    self.vocabulary_ = set()
    for term in self.index_reader.terms():
        if term.df > self.min_df:
            self.vocabulary_.add(term.term)
    self.vocabulary_ = sorted(self.vocabulary_)

    # Build term-to-index mapping.
    self.term_to_index = {}
    for i, term in enumerate(self.vocabulary_):
        self.term_to_index[term] = i
    self.vocabulary_size = len(self.vocabulary_)

    if self.verbose:
        print(f'Found {self.vocabulary_size} terms with min_df={self.min_df}')
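# Hedged usage sketch (not from the original code): one way the vocabulary and
# term_to_index mapping built in __init__ could be combined with
# IndexReader.get_document_vector to produce a sparse term-count vector for a
# document. The method name and the use of numpy are assumptions.
def vectorize(self, docid):
    import numpy as np
    vector = np.zeros(self.vocabulary_size)
    # get_document_vector returns {analyzed term -> in-document frequency}, or None.
    doc_vector = self.index_reader.get_document_vector(docid) or {}
    for term, freq in doc_vector.items():
        i = self.term_to_index.get(term)
        if i is not None:
            vector[i] = freq
    return vector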
def __init__(self):
    index_file = "../../indices/sample_collection_jsonl/"
    self.keyword_util = KeywordSearchUtil(index_file)
    self.semantic_util = SemanticSearchUtil()
    self.hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                  '/usr/share/hunspell/en_US.aff')
    self.index_reader = index.IndexReader(index_file)
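# Hedged sketch (not from the original code): one way the HunSpell object above
# might be used to spell-correct query terms before searching. spell() and
# suggest() are standard pyhunspell methods; the correct_query helper itself is
# illustrative.
def correct_query(self, query):
    corrected = []
    for word in query.split():
        if self.hobj.spell(word):
            corrected.append(word)
        else:
            suggestions = self.hobj.suggest(word)
            corrected.append(suggestions[0] if suggestions else word)
    return ' '.join(corrected)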
def avICTF(self, query):
    # avICTF over the external collection.
    index_reader = index.IndexReader(self.ext_index)
    ql = len(query.split())
    sub_result = 1
    for term in query.split():
        try:
            df, collection_freq = index_reader.get_term_counts(ps.stem(term.lower()))
        except Exception:
            collection_freq = 1
            df = 1
        if not isinstance(collection_freq, int):
            collection_freq = 1
            df = 1
        try:
            sub_result = sub_result * (self.ext_collection_tokens / collection_freq)
        except Exception:
            sub_result = sub_result * self.ext_collection_tokens
    sub_result = math.log2(sub_result)
    externalavICTF = sub_result / ql

    # avICTF over the internal (target) collection.
    index_reader = index.IndexReader(self.index)
    sub_result = 1
    for term in query.split():
        try:
            df, collection_freq = index_reader.get_term_counts(ps.stem(term.lower()))
        except Exception:
            collection_freq = 1
            df = 1
        if not isinstance(collection_freq, int):
            df = 1
            collection_freq = 1
        try:
            # NOTE: this mirrors the original code and reuses ext_collection_tokens;
            # if the internal collection has a different token count, that value
            # should be used here instead.
            sub_result = sub_result * (self.ext_collection_tokens / collection_freq)
        except Exception:
            sub_result = sub_result * self.ext_collection_tokens
    sub_result = math.log2(sub_result)
    internalavICTF = sub_result / ql

    if internalavICTF < 10 and externalavICTF < 10:
        return "NoExpansionPreferred"
    elif internalavICTF >= externalavICTF:
        return "InternalExpansionPreferred"
    elif externalavICTF > internalavICTF:
        return "ExternalExpansionPreferred"
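# Hedged sketch (not from the original code): the avICTF computation above,
# factored into a helper for a single index. It implements
#   avICTF(q) = (1 / |q|) * log2( prod_{t in q} |C| / cf(t) ),
# where |C| is the collection token count and cf(t) the collection frequency of
# term t. The names ps (stemmer) and collection_tokens mirror the snippet above
# and are assumptions about the surrounding class.
def _av_ictf(index_path, collection_tokens, query):
    reader = index.IndexReader(index_path)
    terms = query.split()
    product = 1.0
    for term in terms:
        try:
            _, cf = reader.get_term_counts(ps.stem(term.lower()))
        except Exception:
            cf = 1
        if not isinstance(cf, int) or cf == 0:
            cf = 1
        product *= collection_tokens / cf
    return math.log2(product) / len(terms)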
def setUp(self):
    # Download the pre-built CACM index; append a random value to avoid filename clashes.
    r = randint(0, 10000000)
    self.collection_url = 'https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.tar.gz'
    self.tarball_name = 'lucene-index.cacm-{}.tar.gz'.format(r)
    self.index_dir = 'index{}/'.format(r)

    _, _ = urlretrieve(self.collection_url, self.tarball_name)
    tarball = tarfile.open(self.tarball_name)
    tarball.extractall(self.index_dir)
    tarball.close()

    self.index_path = os.path.join(self.index_dir, 'lucene-index.cacm')
    self.searcher = search.SimpleSearcher(self.index_path)
    self.index_reader = index.IndexReader(self.index_path)
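# Hedged sketch of a matching tearDown (not in the original snippet): closes the
# searcher and removes the tarball and extracted index directory created in
# setUp. Assumes `import shutil` at the top of the test module.
def tearDown(self):
    self.searcher.close()
    os.remove(self.tarball_name)
    shutil.rmtree(self.index_dir, ignore_errors=True)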
def write_out(path_index, path_out, query_operation):
    searcher = SimpleSearcher(path_index)
    index_utils = index.IndexReader(path_index)
    f = open(path_out, "a")

    searcher.set_bm25(0.9, 0.4)
    searcher.set_rm3(10, 10, 0.5)
    searcher.set_qld(400)

    for x in range(len(number)):
        hits = searcher.search(query_operation[x], 100)
        # Write the top 100 hits in TREC run format.
        for i in range(0, 100):
            print(f'{number[x]} {"Q0"} {hits[i].docid:15} {i+1:2} {hits[i].score:.5f} {"JUNE"}')
            f.write(f'{number[x]} {"Q0"} {hits[i].docid:15} {i+1:2} {hits[i].score:.5f} {"JUNE"}\n')
    f.close()
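# Hedged usage sketch (not from the original code): write_out reads `number`
# (topic ids) as a module-level global; the values, index path, and output path
# below are illustrative stand-ins only.
number = ['301', '302']
queries = ['international organized crime', 'poliomyelitis and post polio']
write_out('/path/to/lucene-index', 'run.june.txt', queries)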
import os
from timeit import default_timer as timer

from fastapi import FastAPI
from fastapi import HTTPException

from pyserini import index
from pyserini.search import SimpleSearcher

INDEX_PATH = os.environ['ANSERINI_INDEXI_PATH']

# Initialize pyserini searcher and index reader.
searcher = SimpleSearcher(INDEX_PATH)
index_reader = index.IndexReader(INDEX_PATH)

# Configure BM25 parameters.
searcher.set_bm25(0.9, 0.4)

app = FastAPI()


@app.get("/")
def read_root():
    return {"index_path": INDEX_PATH}


@app.get("/search/")
def search(query: str, size: int = 100):
    start = timer()
    hits = searcher.search(query, k=size)
    end = timer()
    total_time = end - start
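    # Hedged completion (the original snippet is truncated here): serialize the
    # hits into a JSON-friendly payload; the field names are illustrative.
    results = [{'docid': hit.docid, 'score': hit.score} for hit in hits]
    return {'query': query, 'took': total_time, 'hits': results}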
                    default=30, type=int, help='Number of tfidf terms to extract')
parser.add_argument('--cut', dest='cut', default=9999999, type=int,
                    help='Cut off used to build smaller sample db.')
args = parser.parse_args()

# Check if the database exists; if not, create it.
if not os.path.exists(f'../resources/db/{args.name}.db'):
    create_db(args.name)

# Index utility.
index_utils = index.IndexReader(f'../resources/Index/{args.index}')
total_docs = index_utils.stats()['non_empty_documents']

# Docids.
all_docids = get_docids(args.topics, args.candidates, args.topics_only)

# Connect to the database.
conn, cursor = db_utils.connect_db(f'../resources/db/{args.name}.db')

# Loop over docids:
for docid in tqdm(all_docids[:args.cut]):
    # Extract all paragraphs from the doc and store them in a list.
    contents = index_utils.doc_contents(docid).split('\n')
    # Obtain top n tfidf terms in doc
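# Hedged sketch (not from the original script): one way to obtain the top-n
# tf-idf terms for a document with the IndexReader used above. Assumes
# `import math`; passing analyzer=None tells get_term_counts that the term is
# already analyzed, matching the terms returned by get_document_vector.
def top_tfidf_terms(index_utils, total_docs, docid, n=30):
    doc_vector = index_utils.get_document_vector(docid) or {}
    scores = {}
    for term, tf in doc_vector.items():
        df, _ = index_utils.get_term_counts(term, analyzer=None)
        if df and df > 0:
            scores[term] = tf * math.log(total_docs / df)
    return sorted(scores, key=scores.get, reverse=True)[:n]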
print(f"Results are stored in resources/output/runs/{args.output}\n") utils.create_new_file_for_sure(f"resources/output/{args.output}") # '../database_utils/db/rel_entity_reader.db' conn, cursor = db_utils.connect_db(f"resources/db/{args.db}") # load word embeddings if args.term_embedding > 0 and args.embedding != "": embeddings = utils.load_word_vectors( f"resources/embeddings/{args.embedding}") print("Embeddings sucessfully loaded!") else: embeddings = {} # Load index index_utils = index.IndexReader(f"resources/Index/{args.index}") # Configure graph options. comparator = GMCSComparator() # Build kwargs for graph initialization: build_arguments = { "index_utils": index_utils, "cursor": cursor, "embeddings": embeddings, "use_entities": args.use_entities, "nr_terms": args.nr_terms, "term_tfidf": args.term_tfidf, "term_position": args.term_position, "text_distance": args.text_distance, "term_embedding": args.term_embedding,
#!pip install rake-nltk
#!pip install pytextrank
#!pip install bert-extractive-summarizer
#!pip install neuralcoref
#!pip install transformers==2.2.2
!python -c "import nltk; nltk.download('stopwords')"

from pyserini.search import SimpleSearcher
from pyserini import analysis, index
from rake_nltk import Rake
import pytextrank
from summarizer import Summarizer
import spacy

# Path to the index directory (output from Anserini indexing).
path_index = "/content/indexes/lucene-index.core18.pos+docvectors+raw/"
searcher = SimpleSearcher(path_index)
index_utils = index.IndexReader(path_index)

nlp = spacy.load('en_core_web_sm')
ner_out = []
rake_out = []
textrank_out = []
bert_out = []

# Add TextRank into the spaCy pipeline.
textrank_pipe = pytextrank.TextRank()
textrank_pipe.load_stopwords(path="stop.json")
nlp.add_pipe(textrank_pipe.PipelineComponent, name="textrank", last=True)


def write_out(path_index, path_out, query_operation):
    searcher = SimpleSearcher(path_index)