Example #1
    def __init__(self,
                 lucene_index_path: str,
                 min_df: int = 1,
                 verbose: bool = False):
        self.min_df: int = min_df
        self.verbose: bool = verbose
        self.index_reader = index.IndexReader(lucene_index_path)
        self.searcher = search.LuceneSearcher(lucene_index_path)
        self.num_docs: int = self.searcher.num_docs
        self.stats = self.index_reader.stats()
        self.analyzer = Analyzer(get_lucene_analyzer())

        # build vocabulary
        self.vocabulary_ = set()
        for term in self.index_reader.terms():
            if term.df > self.min_df:
                self.vocabulary_.add(term.term)
        self.vocabulary_ = sorted(self.vocabulary_)

        # build term to index mapping
        self.term_to_index = {}
        for i, term in enumerate(self.vocabulary_):
            self.term_to_index[term] = i
        self.vocabulary_size = len(self.vocabulary_)

        if self.verbose:
            print(
                f'Found {self.vocabulary_size} terms with min_df={self.min_df}'
            )
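As a hedged illustration only (this helper is not part of the original class), the vocabulary mapping built above could be used to project a stored document vector onto vocabulary indices:

    def doc_term_counts(self, docid: str):
        # Hypothetical helper: map a document's term frequencies onto vocabulary indices.
        # Assumes the index was built with document vectors stored.
        doc_vector = self.index_reader.get_document_vector(docid)
        return {self.term_to_index[term]: freq
                for term, freq in doc_vector.items()
                if term in self.term_to_index}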
Example #2
    def __init__(self):
        index_file = "../../indices/sample_collection_jsonl/"
        self.keyword_util = KeywordSearchUtil(index_file)
        self.semantic_util = SemanticSearchUtil()
        self.hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                      '/usr/share/hunspell/en_US.aff')
        self.index_reader = index.IndexReader(index_file)
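A hedged sketch of how these utilities might be combined; the method name and behaviour are assumptions, not part of the original class:

    def correct_term(self, term):
        # Hypothetical helper: fall back to hunspell suggestions for terms
        # that never occur in the index.
        try:
            df, _ = self.index_reader.get_term_counts(term)
        except Exception:
            df = 0
        if df == 0 and not self.hobj.spell(term):
            suggestions = self.hobj.suggest(term)
            return suggestions[0] if suggestions else term
        return term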
Example #3
    def avICTF(self, query):
        # Average inverse collection term frequency (avICTF) of the query,
        # computed against the external index and the internal index.
        index_reader = index.IndexReader(self.ext_index)
        ql = len(query.split())
        sub_result = 1
        for term in query.split():
            try:
                df, collection_freq = index_reader.get_term_counts(ps.stem(term.lower()))
            except Exception:
                collection_freq = 1
                df = 1
            if not isinstance(collection_freq, int):
                collection_freq = 1
                df = 1
            try:
                sub_result = sub_result * (self.ext_collection_tokens / collection_freq)
            except Exception:
                sub_result = sub_result * self.ext_collection_tokens
        sub_result = math.log2(sub_result)
        externalavICTF = sub_result / ql

        index_reader = index.IndexReader(self.index)
        sub_result = 1
        for term in query.split():
            try:
                df, collection_freq = index_reader.get_term_counts(ps.stem(term.lower()))
            except Exception:
                collection_freq = 1
                df = 1
            if not isinstance(collection_freq, int):
                df = 1
                collection_freq = 1
            try:
                # NOTE: this still divides by the external collection's token count;
                # the internal collection's token count is probably intended here.
                sub_result = sub_result * (self.ext_collection_tokens / collection_freq)
            except Exception:
                sub_result = sub_result * self.ext_collection_tokens
        sub_result = math.log2(sub_result)
        internalavICTF = sub_result / ql

        if internalavICTF < 10 and externalavICTF < 10:
            return "NoExpansionPreferred"
        elif internalavICTF >= externalavICTF:
            return "InternalExpansionPreferred"
        else:
            return "ExternalExpansionPreferred"
Example #4
    def setUp(self):
        # Download pre-built CACM index; append a random value to avoid filename clashes.
        r = randint(0, 10000000)
        self.collection_url = 'https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.tar.gz'
        self.tarball_name = 'lucene-index.cacm-{}.tar.gz'.format(r)
        self.index_dir = 'index{}/'.format(r)

        _, _ = urlretrieve(self.collection_url, self.tarball_name)

        tarball = tarfile.open(self.tarball_name)
        tarball.extractall(self.index_dir)
        tarball.close()

        self.index_path = os.path.join(self.index_dir, 'lucene-index.cacm')
        self.searcher = search.SimpleSearcher(self.index_path)
        self.index_reader = index.IndexReader(self.index_path)
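The matching tearDown is not shown in this snippet; a minimal sketch assuming the attributes set in setUp above (and an import shutil at module level) would remove the downloaded files again:

    def tearDown(self):
        # Clean up the downloaded tarball and the extracted index directory.
        os.remove(self.tarball_name)
        shutil.rmtree(self.index_dir, ignore_errors=True)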
Example #5
def write_out(path_index, path_out, query_operation):

  searcher = SimpleSearcher(path_index)
  index_utils = index.IndexReader(path_index)

  f = open(path_out, "a")
  # set_bm25 and set_qld both change the searcher's similarity, so the call
  # made last (set_qld) is the one in effect here; RM3 expansion sits on top.
  searcher.set_bm25(0.9, 0.4)
  searcher.set_rm3(10, 10, 0.5)
  searcher.set_qld(400)
  # `number` is assumed to be a module-level list of topic ids defined elsewhere.
  for x in range(len(number)):
    hits = searcher.search(query_operation[x], 100)

    # Write the top 100 hits in TREC run format:
    for i in range(min(100, len(hits))):
      print(f'{number[x]} {"Q0"} {hits[i].docid:15} {i+1:2} {hits[i].score:.5f} {"JUNE"}')
      f.write(f'{number[x]} {"Q0"} {hits[i].docid:15} {i+1:2} {hits[i].score:.5f} {"JUNE"}\n')
  f.close()
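A hypothetical invocation of write_out; the paths, topic ids, and queries below are placeholders, not values from the original project:

number = ["301", "302"]                      # topic ids consumed inside write_out
queries = ["international organized crime", "poliomyelitis and post polio"]
write_out("/content/indexes/lucene-index.core18.pos+docvectors+raw/",
          "runs/run.june.txt", queries)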
Example #6
import os
from timeit import default_timer as timer

from fastapi import FastAPI
from fastapi import HTTPException
from pyserini import index
from pyserini.search import SimpleSearcher

INDEX_PATH = os.environ['ANSERINI_INDEXI_PATH']

# Initialize pyserini searcher and index reader
searcher = SimpleSearcher(INDEX_PATH)
index_reader = index.IndexReader(INDEX_PATH)

# Configure BM25 parameters
searcher.set_bm25(0.9, 0.4)

app = FastAPI()


@app.get("/")
def read_root():
    return {"index_path": INDEX_PATH}


@app.get("/search/")
def search(query: str, size: int = 100):
    start = timer()
    hits = searcher.search(query, k=size)
    end = timer()
    total_time = end - start
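The example is cut off before the endpoint returns a response; a minimal completion sketch (the response shape and field names are assumptions, not from the original source) could look like:

    # Assumed continuation: return docid/score pairs plus the timing measured above.
    results = [{"docid": hit.docid, "score": hit.score} for hit in hits]
    return {"query": query, "took": total_time, "hits": results}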
Example #7
                        default=30,
                        type=int,
                        help='Number of tfidf terms to extract')
    parser.add_argument('--cut',
                        dest='cut',
                        default=9999999,
                        type=int,
                        help='Cut off used to build smaller sample db.')
    args = parser.parse_args()

    # Check if the database exists; if not, create it:
    if not os.path.exists(f'../resources/db/{args.name}.db'):
        create_db(args.name)

    # Index Utility
    index_utils = index.IndexReader(f'../resources/Index/{args.index}')
    total_docs = index_utils.stats()['non_empty_documents']

    # Docids
    all_docids = get_docids(args.topics, args.candidates, args.topics_only)

    # Connect Database
    conn, cursor = db_utils.connect_db(f'../resources/db/{args.name}.db')

    # Loop over docids:
    for docid in tqdm(all_docids[:args.cut]):

        # Extract all paragraphs from doc and store in list.
        contents = index_utils.doc_contents(docid).split('\n')

        # Obtain top n tfidf terms in doc
print(f"Results are stored in resources/output/runs/{args.output}\n")
utils.create_new_file_for_sure(f"resources/output/{args.output}")

# '../database_utils/db/rel_entity_reader.db'
conn, cursor = db_utils.connect_db(f"resources/db/{args.db}")

# load word embeddings
if args.term_embedding > 0 and args.embedding != "":
    embeddings = utils.load_word_vectors(
        f"resources/embeddings/{args.embedding}")
    print("Embeddings sucessfully loaded!")
else:
    embeddings = {}

# Load index
index_utils = index.IndexReader(f"resources/Index/{args.index}")

# Configure graph options.
comparator = GMCSComparator()

# Build kwargs for graph initialization:
build_arguments = {
    "index_utils": index_utils,
    "cursor": cursor,
    "embeddings": embeddings,
    "use_entities": args.use_entities,
    "nr_terms": args.nr_terms,
    "term_tfidf": args.term_tfidf,
    "term_position": args.term_position,
    "text_distance": args.text_distance,
    "term_embedding": args.term_embedding,
Example #8
#!pip install rake-nltk
#!pip install pytextrank
#!pip install bert-extractive-summarizer
#!pip install neuralcoref
#!pip install transformers==2.2.2
!python -c "import nltk; nltk.download('stopwords')"
from pyserini.search import SimpleSearcher
from pyserini import analysis, index
from rake_nltk import Rake
import pytextrank
from summarizer import Summarizer
import spacy
# path to the index directory (output from Anserini indexing)
path_index = "/content/indexes/lucene-index.core18.pos+docvectors+raw/"
searcher = SimpleSearcher(path_index)
index_utils = index.IndexReader(path_index)

nlp = spacy.load('en_core_web_sm')

ner_out = []
rake_out = []
textrank_out = []
bert_out = []
#add textrank into spacy pipeline
textrank_pipe = pytextrank.TextRank()
textrank_pipe.load_stopwords(path="stop.json")
nlp.add_pipe(textrank_pipe.PipelineComponent, name="textrank", last=True)
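A hedged usage sketch for the setup above (the docid is a placeholder; doc() returns None for unknown ids):

# Hypothetical usage: fetch a stored document and collect its top textrank phrases.
doc = index_utils.doc('some_docid')   # placeholder docid
if doc is not None:
    spacy_doc = nlp(doc.contents())
    textrank_out.append([phrase.text for phrase in spacy_doc._.phrases[:10]])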
